1#define GGML_COMMON_IMPL_CPP
2#define GGML_COMMON_DECL_CPP
3#include "ggml-common.h"
4#include "ggml-backend-impl.h"
5
6#include "ggml-impl.h"
7#include "ggml-cpu.h"
8#include "ggml-cpu-impl.h"
9#include "simd-mappings.h"
10#include "traits.h"
11
12#include <cmath>
13#include <cstring>
14#include <cassert>
15#include <cstdlib> // for qsort
16#include <cstdio> // for GGML_ASSERT
17
18#define GGML_CPU_CLANG_WORKAROUND
19#include "../../repack.h"
20
21#if defined(__GNUC__)
22#pragma GCC diagnostic ignored "-Woverlength-strings"
23#endif
24
25#define UNUSED GGML_UNUSED
26
27#if defined(__AVX__)
28#if defined(__F16C__)
29#if defined(__AVX512F__)
30#define GGML_F32Cx8x2_LOAD(x, y) _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
31#define GGML_F32Cx16_REPEAT_LOAD(x) _mm512_cvtph_ps(_mm256_set_m128i(x, x))
32#endif
33// the _mm256_cvt intrinsics require F16C
34#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
35#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
36#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
37#else
38#if defined(__AVX512F__)
39static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
40 float tmp[16];
41
42 for (int i = 0; i < 8; i++) {
43 tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
44 }
45
46 for (int i = 0; i < 8; i++) {
47 tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
48 }
49
50 return _mm512_loadu_ps(tmp);
51}
52static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
53 float tmp[16];
54 uint16_t tmphalf[8];
55 _mm_storeu_si128((__m128i*)tmphalf, x);
56
57 for (int i = 0; i < 4; i++) {
58 tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
59 tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
60 tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61 tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
62 }
63
64 return _mm512_loadu_ps(tmp);
65}
66#endif
67static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
68 float tmp[8];
69
70 for (int i = 0; i < 8; i++) {
71 tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
72 }
73
74 return _mm256_loadu_ps(tmp);
75}
76static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
77 float tmp[8];
78
79 for (int i = 0; i < 4; i++) {
80 tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
81 tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
82 }
83
84 return _mm256_loadu_ps(tmp);
85}
86static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
87 uint16_t tmphalf[8];
88 float tmp[8];
89
90 _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
91 for (int i = 0; i < 8; i++) {
92 tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
93 }
94
95 return _mm256_loadu_ps(tmp);
96}
97
98#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
99#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) __avx_repeat_f32cx8_load(x)
100#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) __avx_rearranged_f32cx8_load(x, arrangeMask)
101#if defined(__AVX512F__)
102#define GGML_F32Cx8x2_LOAD(x, y) __avx512_f32cx8x2_load(x, y)
103#define GGML_F32Cx16_REPEAT_LOAD(x) __avx512_repeat_f32cx16_load(x)
104#endif
105#endif
106#endif
107
108static inline int nearest_int(float fval) {
109 assert(fabsf(fval) <= 4194303.f);
110 float val = fval + 12582912.f;
111 int i; memcpy(dest: &i, src: &val, n: sizeof(int));
112 return (i & 0x007fffff) - 0x00400000;
113}
114
115#if defined(__AVX2__) || defined(__AVX512F__)
116#if defined(__AVX512F__)
117// add int16_t pairwise and return as 512 bit int vector, then add the accumulator
118static inline __m512i sum_i16_pairs_acc_int32x16(const __m512i acc, const __m512i x) {
119 const __m512i ones = _mm512_set1_epi16(1);
120 return _mm512_add_epi32(acc, _mm512_madd_epi16(ones, x));
121}
122
123static inline __m512i mul_sum_us8_pairs_acc_int32x16(const __m512i acc, const __m512i ax, const __m512i sy) {
124#if defined(__AVX512VNNI__)
125 return _mm512_dpbusd_epi32(acc, ax, sy);
126#else
127 // Perform multiplication and create 16-bit values
128 const __m512i dot = _mm512_maddubs_epi16(ax, sy);
129 return sum_i16_pairs_acc_int32x16(acc, dot);
130#endif
131}
132
133// multiply int8_t, add results pairwise twice and return as 512 bit int vector,then add the accumulator
134static inline __m512i mul_sum_i8_pairs_acc_int32x16(const __m512i acc, const __m512i x, const __m512i y) {
135 const __m512i zero = _mm512_setzero_si512();
136 // Get absolute values of x vectors
137 const __m512i ax = _mm512_abs_epi8(x);
138 // Sign the values of the y vectors
139 __mmask64 blt0 = _mm512_movepi8_mask(x);
140 const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
141 return mul_sum_us8_pairs_acc_int32x16(acc, ax, sy);
142}
143#endif
144
145// add int16_t pairwise and return as 256 bit int vector, then add the accumulator
146static inline __m256i sum_i16_pairs_acc_int32x8(const __m256i acc, const __m256i x) {
147 const __m256i ones = _mm256_set1_epi16(w: 1);
148 return _mm256_add_epi32(a: acc, b: _mm256_madd_epi16(a: ones, b: x));
149}
150
151static inline __m256i mul_sum_us8_pairs_acc_int32x8(const __m256i acc, const __m256i ax, const __m256i sy) {
152#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
153 return _mm256_dpbusd_epi32(acc, ax, sy);
154#elif defined(__AVXVNNI__)
155 return _mm256_dpbusd_avx_epi32(S: acc, A: ax, B: sy);
156#else
157 // Perform multiplication and create 16-bit values
158 const __m256i dot = _mm256_maddubs_epi16(ax, sy);
159 return sum_i16_pairs_acc_int32x8(acc, dot);
160#endif
161}
162
163// Integer variant of the function defined in ggml-quants.c
164// multiply int8_t, add results pairwise twice and return as 256 bit int vector, then add the accumulator
165static inline __m256i mul_sum_i8_pairs_acc_int32x8(const __m256i acc, const __m256i x, const __m256i y) {
166#if defined(__AVXVNNIINT8__)
167 return _mm256_dpbssd_epi32(acc, x, y);
168#else
169 // Get absolute values of x vectors
170 const __m256i ax = _mm256_sign_epi8(a: x, b: x);
171 // Sign the values of the y vectors
172 const __m256i sy = _mm256_sign_epi8(a: y, b: x);
173 return mul_sum_us8_pairs_acc_int32x8(acc, ax, sy);
174#endif
175}
176#endif
177
178void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
179 assert(QK8_0 == 32);
180 assert(k % QK8_0 == 0);
181 const int nb = k / QK8_0;
182
183 block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
184
185#if defined(__AVX2__) || defined(__AVX__)
186 float id[4];
187 __m256 srcv[4][4];
188 __m256 idvec[4];
189
190 for (int i = 0; i < nb; i++) {
191 for (int row_iter = 0; row_iter < 4; row_iter++) {
192 // Load elements into 4 AVX vectors
193 __m256 v0 = _mm256_loadu_ps( p: x + row_iter * k + i * 32 );
194 __m256 v1 = _mm256_loadu_ps( p: x + row_iter * k + i * 32 + 8 );
195 __m256 v2 = _mm256_loadu_ps( p: x + row_iter * k + i * 32 + 16 );
196 __m256 v3 = _mm256_loadu_ps( p: x + row_iter * k + i * 32 + 24 );
197
198 // Compute max(abs(e)) for the block
199 const __m256 signBit = _mm256_set1_ps( w: -0.0f );
200 __m256 maxAbs = _mm256_andnot_ps( a: signBit, b: v0 );
201 maxAbs = _mm256_max_ps( a: maxAbs, b: _mm256_andnot_ps( a: signBit, b: v1 ) );
202 maxAbs = _mm256_max_ps( a: maxAbs, b: _mm256_andnot_ps( a: signBit, b: v2 ) );
203 maxAbs = _mm256_max_ps( a: maxAbs, b: _mm256_andnot_ps( a: signBit, b: v3 ) );
204
205 __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), b: _mm256_castps256_ps128( a: maxAbs ) );
206 max4 = _mm_max_ps( a: max4, b: _mm_movehl_ps( a: max4, b: max4 ) );
207 max4 = _mm_max_ss( a: max4, b: _mm_movehdup_ps( a: max4 ) );
208 const float maxScalar = _mm_cvtss_f32( a: max4 );
209
210 // Divided by 127.f to mirror results in quantize_row_q8_0
211 const float d = maxScalar / 127.f;
212 id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
213
214 // Store the scale for the individual block
215 y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
216
217 // Store the values in blocks of eight values - Aim is to use these later for block interleaving
218 srcv[row_iter][0] = v0;
219 srcv[row_iter][1] = v1;
220 srcv[row_iter][2] = v2;
221 srcv[row_iter][3] = v3;
222 idvec[row_iter] = _mm256_set1_ps(w: id[row_iter]);
223 }
224
225 // The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
226 for (int j = 0; j < 4; j++) {
227 // Apply the multiplier
228 __m256 v0 = _mm256_mul_ps(a: srcv[0][j], b: idvec[0]);
229 __m256 v1 = _mm256_mul_ps(a: srcv[1][j], b: idvec[1]);
230 __m256 v2 = _mm256_mul_ps(a: srcv[2][j], b: idvec[2]);
231 __m256 v3 = _mm256_mul_ps(a: srcv[3][j], b: idvec[3]);
232
233 // Round to nearest integer
234 v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
235 v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
236 v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
237 v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
238
239 // Convert floats to integers
240 __m256i i0 = _mm256_cvtps_epi32( a: v0 );
241 __m256i i1 = _mm256_cvtps_epi32( a: v1 );
242 __m256i i2 = _mm256_cvtps_epi32( a: v2 );
243 __m256i i3 = _mm256_cvtps_epi32( a: v3 );
244
245#if defined(__AVX2__)
246 // Convert int32 to int16
247 i0 = _mm256_packs_epi32( a: i0, b: i1 );
248 i2 = _mm256_packs_epi32( a: i2, b: i3 );
249 // Convert int16 to int8
250 i0 = _mm256_packs_epi16( a: i0, b: i2 );
251
252 // Permute and store the quantized weights in the required order after the pack instruction
253 const __m256i perm = _mm256_setr_epi32( i0: 0, i1: 4, i2: 1, i3: 5, i4: 2, i5: 6, i6: 3, i7: 7 );
254 i0 = _mm256_permutevar8x32_epi32( a: i0, b: perm );
255
256 _mm256_storeu_si256(p: (__m256i *)(y[i].qs + 32 * j), a: i0);
257#else
258 // Since we don't have in AVX some necessary functions,
259 // we split the registers in half and call AVX2 analogs from SSE
260 __m128i ni0 = _mm256_castsi256_si128( i0 );
261 __m128i ni1 = _mm256_extractf128_si256( i0, 1);
262 __m128i ni2 = _mm256_castsi256_si128( i1 );
263 __m128i ni3 = _mm256_extractf128_si256( i1, 1);
264 __m128i ni4 = _mm256_castsi256_si128( i2 );
265 __m128i ni5 = _mm256_extractf128_si256( i2, 1);
266 __m128i ni6 = _mm256_castsi256_si128( i3 );
267 __m128i ni7 = _mm256_extractf128_si256( i3, 1);
268
269 // Convert int32 to int16
270 ni0 = _mm_packs_epi32( ni0, ni1 );
271 ni2 = _mm_packs_epi32( ni2, ni3 );
272 ni4 = _mm_packs_epi32( ni4, ni5 );
273 ni6 = _mm_packs_epi32( ni6, ni7 );
274 // Convert int16 to int8
275 ni0 = _mm_packs_epi16( ni0, ni2 );
276 ni4 = _mm_packs_epi16( ni4, ni6 );
277 _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
278 _mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
279#endif
280 }
281 }
282
283#else
284 UNUSED(nb);
285 UNUSED(y);
286 ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
287#endif
288}
289
290void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
291 assert(QK_K == 256);
292 assert(k % QK_K == 0);
293 const int nb = k / QK_K;
294
295 block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
296
297#if defined(__AVX2__)
298 float iscale[4];
299 __m256 srcv[4][32];
300 __m256 iscale_vec[4];
301
302 for (int i = 0; i < nb; i++) {
303 for (int row_iter = 0; row_iter < 4; row_iter++) {
304 // Load elements into 4 AVX vectors
305 __m256 v0 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 );
306 __m256 v1 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + 8 );
307 __m256 v2 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + 16 );
308 __m256 v3 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + 24 );
309
310 // Compute max(abs(e)) for the block
311 const __m256 signBit = _mm256_set1_ps( w: -0.0f );
312 __m256 abs0 = _mm256_andnot_ps( a: signBit, b: v0 );
313 __m256 abs1 = _mm256_andnot_ps( a: signBit, b: v1 );
314 __m256 abs2 = _mm256_andnot_ps( a: signBit, b: v2 );
315 __m256 abs3 = _mm256_andnot_ps( a: signBit, b: v3 );
316
317 __m256 maxAbs = _mm256_max_ps( a: abs0, b: abs1 );
318 maxAbs = _mm256_max_ps( a: maxAbs, b: abs2 );
319 maxAbs = _mm256_max_ps( a: maxAbs, b: abs3 );
320
321 __m256 mask0 = _mm256_cmp_ps( maxAbs, v0, _CMP_EQ_OQ );
322 __m256 mask1 = _mm256_cmp_ps( maxAbs, v1, _CMP_EQ_OQ );
323 __m256 mask2 = _mm256_cmp_ps( maxAbs, v2, _CMP_EQ_OQ );
324 __m256 mask3 = _mm256_cmp_ps( maxAbs, v3, _CMP_EQ_OQ );
325
326 __m256 maskAbs = _mm256_or_ps(a: _mm256_or_ps(a: mask0, b: mask1),b: _mm256_or_ps(a: mask2, b: mask3));
327
328 srcv[row_iter][0] = v0;
329 srcv[row_iter][1] = v1;
330 srcv[row_iter][2] = v2;
331 srcv[row_iter][3] = v3;
332
333 for (int sb = 1; sb < 8; sb++) {
334 // Temporarily stores absolute quant values
335 __m256 tempAbs = maxAbs;
336
337 // Load elements into 4 AVX vectors
338 __m256 v0 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + sb * 32);
339 __m256 v1 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + sb * 32 + 8 );
340 __m256 v2 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + sb * 32 + 16 );
341 __m256 v3 = _mm256_loadu_ps( p: x + row_iter * k + i * 256 + sb * 32 + 24 );
342
343 // Compute max(abs(e)) for the block
344 __m256 abs0 = _mm256_andnot_ps( a: signBit, b: v0 );
345 __m256 abs1 = _mm256_andnot_ps( a: signBit, b: v1 );
346 __m256 abs2 = _mm256_andnot_ps( a: signBit, b: v2 );
347 __m256 abs3 = _mm256_andnot_ps( a: signBit, b: v3 );
348
349 maxAbs = _mm256_max_ps( a: maxAbs, b: abs0 );
350 maxAbs = _mm256_max_ps( a: maxAbs, b: abs1 );
351 maxAbs = _mm256_max_ps( a: maxAbs, b: abs2 );
352 maxAbs = _mm256_max_ps( a: maxAbs, b: abs3 );
353
354 __m256 mask_prev = _mm256_cmp_ps( tempAbs, maxAbs, _CMP_EQ_OQ );
355 maskAbs = _mm256_and_ps( a: maskAbs, b: mask_prev );
356
357 mask0 = _mm256_cmp_ps( maxAbs, v0, _CMP_EQ_OQ );
358 mask1 = _mm256_cmp_ps( maxAbs, v1, _CMP_EQ_OQ );
359 mask2 = _mm256_cmp_ps( maxAbs, v2, _CMP_EQ_OQ );
360 mask3 = _mm256_cmp_ps( maxAbs, v3, _CMP_EQ_OQ );
361
362 __m256 mask_curr = _mm256_or_ps(a: _mm256_or_ps(a: mask0, b: mask1),b: _mm256_or_ps(a: mask2, b: mask3));
363 maskAbs = _mm256_or_ps(a: maskAbs, b: mask_curr);
364
365 srcv[row_iter][sb * 4] = v0;
366 srcv[row_iter][sb * 4 + 1] = v1;
367 srcv[row_iter][sb * 4 + 2] = v2;
368 srcv[row_iter][sb * 4 + 3] = v3;
369 }
370
371 __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), b: _mm256_castps256_ps128( a: maxAbs ) );
372 max4 = _mm_max_ps( a: max4, b: _mm_movehl_ps( a: max4, b: max4 ) );
373 max4 = _mm_max_ss( a: max4, b: _mm_movehdup_ps( a: max4 ) );
374 const float maxScalar = _mm_cvtss_f32( a: max4 );
375
376 __m256 maxScalarVec = _mm256_set1_ps(w: maxScalar);
377
378 __m256 mask_next = _mm256_cmp_ps( maxScalarVec, maxAbs, _CMP_EQ_OQ );
379 __m256 finalMask = _mm256_and_ps(a: maskAbs, b: mask_next);
380
381 const int mask = _mm256_movemask_ps(a: finalMask);
382 iscale[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
383
384 if(mask) {
385 iscale[row_iter] = ( maxScalar != 0.0f ) ? -127.f / maxScalar: 0.0f;
386 }
387
388 y[i].d[row_iter] = maxScalar ? 1/iscale[row_iter] : 0;
389 iscale_vec[row_iter] = _mm256_set1_ps(w: iscale[row_iter]);
390 }
391
392 __m256i quants_interleaved[32];
393 for (int j = 0; j < 32; j++) {
394 // Apply the multiplier
395 __m256 v0 = _mm256_mul_ps(a: srcv[0][j], b: iscale_vec[0]);
396 __m256 v1 = _mm256_mul_ps(a: srcv[1][j], b: iscale_vec[1]);
397 __m256 v2 = _mm256_mul_ps(a: srcv[2][j], b: iscale_vec[2]);
398 __m256 v3 = _mm256_mul_ps(a: srcv[3][j], b: iscale_vec[3]);
399
400 // Round to nearest integer
401 v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
402 v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
403 v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
404 v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
405
406 // Convert floats to integers
407 __m256i i0 = _mm256_cvtps_epi32( a: v0 );
408 __m256i i1 = _mm256_cvtps_epi32( a: v1 );
409 __m256i i2 = _mm256_cvtps_epi32( a: v2 );
410 __m256i i3 = _mm256_cvtps_epi32( a: v3 );
411
412 // Convert int32 to int16
413 i0 = _mm256_packs_epi32( a: i0, b: i1 );
414 i2 = _mm256_packs_epi32( a: i2, b: i3 );
415 // Convert int16 to int8
416 i0 = _mm256_packs_epi16( a: i0, b: i2 );
417
418 // Permute and store the quantized weights in the required order after the pack instruction
419 const __m256i perm = _mm256_setr_epi32( i0: 0, i1: 4, i2: 1, i3: 5, i4: 2, i5: 6, i6: 3, i7: 7 );
420 i0 = _mm256_permutevar8x32_epi32( a: i0, b: perm );
421
422 _mm256_storeu_si256(p: (__m256i *)(y[i].qs + 32 * j), a: i0);
423 quants_interleaved[j] = i0;
424 }
425
426 // Masks to shuffle the quants of corresonding sub blocks for rearraning quants for vectorized bsums computation
427 __m256i shuffle_mask_sb2 = _mm256_castsi128_si256(a: _mm_setr_epi8(b0: 0, b1: 1, b2: 0, b3: 1, b4: 4, b5: 5, b6: 6, b7: 7, b8: 8, b9: 9, b10: 8, b11: 9, b12: 12, b13: 13, b14: 14, b15: 15));
428 shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
429 __m256i shuffle_mask_sb3 = _mm256_castsi128_si256(a: _mm_setr_epi8(b0: 0, b1: 1, b2: 2, b3: 3, b4: 0, b5: 1, b6: 6, b7: 7, b8: 8, b9: 9, b10: 10, b11: 11, b12: 8, b13: 9, b14: 14, b15: 15));
430 shuffle_mask_sb3 = _mm256_permute2f128_si256(shuffle_mask_sb3, shuffle_mask_sb3, 0);
431 __m256i shuffle_mask_sb4 = _mm256_castsi128_si256(a: _mm_setr_epi8(b0: 0, b1: 1, b2: 2, b3: 3, b4: 4, b5: 5, b6: 0, b7: 1, b8: 8, b9: 9, b10: 10, b11: 11, b12: 12, b13: 13, b14: 8, b15: 9));
432 shuffle_mask_sb4 = _mm256_permute2f128_si256(shuffle_mask_sb4, shuffle_mask_sb4, 0);
433
434 for (int k = 0; k < 4; k++) {
435 // Quants from four different sub blocks are taken
436 __m256i q0 = quants_interleaved[k * 8 + 0];
437 __m256i q1 = quants_interleaved[k * 8 + 1];
438 __m256i q2 = quants_interleaved[k * 8 + 2];
439 __m256i q3 = quants_interleaved[k * 8 + 3];
440 __m256i q4 = quants_interleaved[k * 8 + 4];
441 __m256i q5 = quants_interleaved[k * 8 + 5];
442 __m256i q6 = quants_interleaved[k * 8 + 6];
443 __m256i q7 = quants_interleaved[k * 8 + 7];
444
445
446 // The below code block has the first half of different sub blocks shuffled and blended so as to process 2 values from each sub block at a time
447 __m256i sb2_h1_shuffled = _mm256_shuffle_epi8(a: q2, b: shuffle_mask_sb2);
448 __m256i sb_h1_interleaved = _mm256_blend_epi16(q0, sb2_h1_shuffled, 34);
449 __m256i sb3_h1_shuffled = _mm256_shuffle_epi8(a: q4, b: shuffle_mask_sb3);
450 sb_h1_interleaved = _mm256_blend_epi16(sb_h1_interleaved, sb3_h1_shuffled, 68);
451 __m256i sb4_h1_shuffled = _mm256_shuffle_epi8(a: q6, b: shuffle_mask_sb4);
452 sb_h1_interleaved = _mm256_blend_epi16(sb_h1_interleaved, sb4_h1_shuffled, 136);
453
454 __m256i one = _mm256_set1_epi8(b: 1);
455 __m256i bsums_r1 = _mm256_maddubs_epi16(a: one, b: sb_h1_interleaved);
456
457 for (int l = 0; l < 3; l++) {
458 // Quants value shifted to process next two values from each sub block
459 q0 = _mm256_srli_epi64(a: q0, count: 16);
460 q2 = _mm256_srli_epi64(a: q2, count: 16);
461 q4 = _mm256_srli_epi64(a: q4, count: 16);
462 q6 = _mm256_srli_epi64(a: q6, count: 16);
463
464 sb2_h1_shuffled = _mm256_shuffle_epi8(a: q2, b: shuffle_mask_sb2);
465 sb_h1_interleaved = _mm256_blend_epi16(q0, sb2_h1_shuffled, 34);
466 sb3_h1_shuffled = _mm256_shuffle_epi8(a: q4, b: shuffle_mask_sb3);
467 sb_h1_interleaved = _mm256_blend_epi16(sb_h1_interleaved, sb3_h1_shuffled, 68);
468 sb4_h1_shuffled = _mm256_shuffle_epi8(a: q6, b: shuffle_mask_sb4);
469 sb_h1_interleaved = _mm256_blend_epi16(sb_h1_interleaved, sb4_h1_shuffled, 136);
470
471 bsums_r1 = _mm256_add_epi16(a: bsums_r1, b: _mm256_maddubs_epi16(a: one, b: sb_h1_interleaved));
472 }
473
474 // The below code block has the second half of different sub blocks shuffled and blended so as to process 2 values from each sub block at a time
475 __m256i sb2_h2_shuffled = _mm256_shuffle_epi8(a: q3, b: shuffle_mask_sb2);
476 __m256i sb_h2_interleaved = _mm256_blend_epi16(q1, sb2_h2_shuffled, 34);
477 __m256i sb3_h2_shuffled = _mm256_shuffle_epi8(a: q5, b: shuffle_mask_sb3);
478 sb_h2_interleaved = _mm256_blend_epi16(sb_h2_interleaved, sb3_h2_shuffled, 68);
479 __m256i sb4_h2_shuffled = _mm256_shuffle_epi8(a: q7, b: shuffle_mask_sb4);
480 sb_h2_interleaved = _mm256_blend_epi16(sb_h2_interleaved, sb4_h2_shuffled, 136);
481
482 __m256i bsums_r2 = _mm256_maddubs_epi16(a: one, b: sb_h2_interleaved);
483
484 for (int l = 0; l < 3; l++) {
485 // Quants value shifted to process next two values from each sub block
486 q1 = _mm256_srli_epi64(a: q1, count: 16);
487 q3 = _mm256_srli_epi64(a: q3, count: 16);
488 q5 = _mm256_srli_epi64(a: q5, count: 16);
489 q7 = _mm256_srli_epi64(a: q7, count: 16);
490
491 sb2_h2_shuffled = _mm256_shuffle_epi8(a: q3, b: shuffle_mask_sb2);
492 sb_h2_interleaved = _mm256_blend_epi16(q1, sb2_h2_shuffled, 34);
493 sb3_h2_shuffled = _mm256_shuffle_epi8(a: q5, b: shuffle_mask_sb3);
494 sb_h2_interleaved = _mm256_blend_epi16(sb_h2_interleaved, sb3_h2_shuffled, 68);
495 sb4_h2_shuffled = _mm256_shuffle_epi8(a: q7, b: shuffle_mask_sb4);
496 sb_h2_interleaved = _mm256_blend_epi16(sb_h2_interleaved, sb4_h2_shuffled, 136);
497
498 bsums_r2 = _mm256_add_epi16(a: bsums_r2, b: _mm256_maddubs_epi16(a: one, b: sb_h2_interleaved));
499 }
500
501 // Overall bsums in interleaved fashion computed by adding results of both halves
502 __m256i bsums_r = _mm256_add_epi16(a: bsums_r1, b: bsums_r2);
503 _mm256_storeu_si256(p: (__m256i *)(y[i].bsums + 16 * k), a: bsums_r);
504 }
505 }
506
507#else
508 UNUSED(nb);
509 UNUSED(y);
510 ggml_quantize_mat_q8_K_4x8_generic(x, vy, k);
511#endif
512}
513
514//
515// GEMV/GEMM templates
516//
517
518#if defined(__AVX2__) || defined(__AVX512F__)
519
520// GEMV for 8x blocks of 32 4-bit quants with a single scale factor per block
521template<typename block_tx8>
522static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, __m256i signextendlut) {
523 static_assert(
524 std::is_same_v<block_tx8, block_q4_0x8> ||
525 std::is_same_v<block_tx8, block_iq4_nlx8>,
526 "Unsupported block type");
527
528 const int qk = QK8_0;
529 const int nb = n / qk;
530
531 UNUSED(bs);
532
533 __m128i changemask = _mm_set_epi8(b15: 15, b14: 14, b13: 7, b12: 6, b11: 13, b10: 12, b9: 5, b8: 4, b7: 11, b6: 10, b5: 3, b4: 2, b3: 9, b2: 8, b1: 1, b0: 0);
534 __m256i finalpermutemask = _mm256_set_epi32(i0: 7, i1: 5, i2: 3, i3: 1, i4: 6, i5: 4, i6: 2, i7: 0);
535
536 // Permute mask used for easier vector processing at later stages
537 const __m256i m4b = _mm256_set1_epi8(b: 0x0F);
538
539 int64_t b_nb = n / 32;
540
541 const block_tx8 * b_ptr_start = (const block_tx8 *)vx;
542 const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
543
544 // Process Q8_0 blocks one by one
545 for (int64_t y = 0; y < nr; y++) {
546
547 // Pointers to LHS blocks of block_q8_0 format
548 const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
549
550 // Take group of eight blocks at each pass of the loop and perform dot product operation
551 for (int64_t x = 0; x < nc / 8; x++) {
552
553 // Pointers to RHS blocks
554 const block_tx8 * b_ptr = b_ptr_start + (x * b_nb);
555
556 // Master FP accumulator
557 __m256 acc_row = _mm256_setzero_ps();
558
559 for (int64_t b = 0; b < nb; b++) {
560 // Load 8 blocks of 32 interleaved as 8 bytes (B0 - B7)
561 const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs));
562 const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs) + 1);
563 const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs) + 2);
564 const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs) + 3);
565
566 // 4-bit -> 8-bit - Sign is maintained
567 const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: rhs_raw_vec_0123_0, b: m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
568 const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: rhs_raw_vec_4567_0, b: m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
569 const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: rhs_raw_vec_0123_1, b: m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
570 const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: rhs_raw_vec_4567_1, b: m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
571
572 const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_vec_0123_0, count: 4), b: m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
573 const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_vec_4567_0, count: 4), b: m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
574 const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_vec_0123_1, count: 4), b: m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
575 const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(a: signextendlut, b: _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_vec_4567_1, count: 4), b: m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
576
577 // Load the scale values for the 8 blocks interleaved in block_tx8
578 __m256 col_scale_f32;
579 if constexpr (
580 std::is_same_v<block_tx8, block_q4_0x8> ||
581 std::is_same_v<block_tx8, block_iq4_nlx8>) {
582 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
583 }
584
585 // Load and convert to FP32 scale from block_q8_0
586 const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
587
588 // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
589 __m256i lhs_vec_0 = _mm256_castsi128_si256(a: _mm_loadu_si128(p: (const __m128i *)a_ptr[b].qs));
590 __m256i lhs_vec_1 = _mm256_castsi128_si256(a: _mm_loadu_si128(p: (const __m128i *)(a_ptr[b].qs + 16)));
591
592 lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
593 lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
594
595 __m256i iacc = _mm256_setzero_si256();
596
597 // Dot product done within 32 bit lanes and accumulated in the same vector
598 // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
599 // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
600 // ...........................................................................
601 // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
602
603 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0));
604 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85));
605
606 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170));
607 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255));
608
609 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0));
610 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85));
611
612 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
613 iacc = mul_sum_i8_pairs_acc_int32x8(acc: iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));
614
615 // Accumulated values multipled with appropriate scales
616 acc_row = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc), B: _mm256_mul_ps(a: col_scale_f32, b: row_scale_f32), C: acc_row);
617 }
618
619 // Accumulated output values permuted so as to be stored in appropriate order post accumulation
620 acc_row = _mm256_permutevar8x32_ps(a: acc_row, b: finalpermutemask);
621 _mm256_storeu_ps(p: s + (y * nr + x * 8), a: acc_row);
622 }
623 }
624}
625
626// GEMM for 8x blocks of 32 4-bit quants with a single scale factor per block
627template<typename block_tx8>
628static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc, __m256i signextendlut) {
629 static_assert(
630 std::is_same_v<block_tx8, block_q4_0x8> ||
631 std::is_same_v<block_tx8, block_iq4_nlx8>,
632 "Unsupported block type");
633
634 const int qk = QK8_0;
635 const int nb = n / qk;
636
637 const block_tx8 * b_ptr_start = (const block_tx8 *)vx;
638 const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
639
640 int64_t b_nb = n / 32;
641 int64_t y = 0;
642 // Mask to mask out nibbles from packed bytes
643 const __m256i m4b = _mm256_set1_epi8(b: 0x0F);
644 const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
645 // Permute mask used for easier vector processing at later stages
646 __m256i requiredOrder = _mm256_set_epi32(i0: 3, i1: 2, i2: 1, i3: 0, i4: 7, i5: 6, i6: 5, i7: 4);
647 int64_t xstart = 0;
648 int anr = nr - nr%16; // Used to align nr with boundary of 16
649#ifdef __AVX512F__
650 int anc = nc - nc%16; // Used to align nc with boundary of 16
651 // Mask to mask out nibbles from packed bytes expanded to 512 bit length
652 const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
653 // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
654 __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
655
656 // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
657 for (; y < anr / 4; y += 4) {
658
659 const block_q8_0x4 * a_ptrs[4];
660
661 a_ptrs[0] = a_ptr_start + (y * nb);
662 for (int i = 0; i < 3; ++i) {
663 a_ptrs[i + 1] = a_ptrs[i] + nb;
664 }
665
666 // Take group of two block_tx8 structures at each pass of the loop and perform dot product operation
667 for (int64_t x = 0; x < anc / 8; x += 2) {
668
669 const block_tx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
670 const block_tx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
671
672 // Master FP accumulators
673 __m512 acc_rows[16];
674 for (int i = 0; i < 16; i++) {
675 acc_rows[i] = _mm512_setzero_ps();
676 }
677
678 for (int64_t b = 0; b < nb; b++) {
679 // Load the sixteen blocks of quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
680 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
681 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
682 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
683 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
684
685 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
686 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
687 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
688 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
689
690 // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
691 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
692 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
693 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
694 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
695
696 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
697 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
698 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
699 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
700
701 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
702 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
703 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
704 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
705
706 // 4-bit -> 8-bit - Sign is maintained
707 const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
708 const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
709
710 const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
711 const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
712
713 const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
714 const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
715
716 const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
717 const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
718
719 // Shuffle pattern one - right side input
720 const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
721 const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
722
723 const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
724 const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
725
726 const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
727 const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
728
729 const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
730 const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
731
732 // Shuffle pattern two - right side input
733
734 const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
735 const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
736
737 const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
738 const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
739
740 const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
741 const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
742
743 const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
744 const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
745
746 // Scale values - Load the weight scale values of two block_tx8
747 __m512 col_scale_f32;
748 if constexpr (
749 std::is_same_v<block_tx8, block_q4_0x8> ||
750 std::is_same_v<block_tx8, block_iq4_nlx8>) {
751 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
752 }
753
754 // Process LHS in pairs of rows
755 for (int rp = 0; rp < 4; rp++) {
756
757 // Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
758 // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
759 __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
760 __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
761 __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
762 __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
763 __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
764 __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
765 __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
766 __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
767 __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
768 __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
769 __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
770 __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
771
772 __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
773 __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
774 __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
775 __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
776 __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
777 __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
778 __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
779 __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
780
781 // Shuffle pattern one - left side input
782
783 const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
784 const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
785
786 const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
787 const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
788
789 const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
790 const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
791
792 const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
793 const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
794
795 // Shuffle pattern two - left side input
796
797 const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
798 const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
799
800 const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
801 const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
802
803 const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
804 const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
805
806 const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
807 const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
808
809 // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane
810 // Resembles MMLAs into 2x2 matrices in ARM Version
811 const __m512i zero = _mm512_setzero_epi32();
812 __m512i iacc_mat_00_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1), lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1), lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1);
813 __m512i iacc_mat_01_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1), lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1), lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1);
814 __m512i iacc_mat_10_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1), lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1), lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1);
815 __m512i iacc_mat_11_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1), lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1), lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1);
816 __m512i iacc_mat_00_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2), lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2), lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2);
817 __m512i iacc_mat_01_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2), lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2), lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2);
818 __m512i iacc_mat_10_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2), lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2), lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2);
819 __m512i iacc_mat_11_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2), lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2), lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2);
820
821 // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
822 __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
823 __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
824 __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
825 __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
826
827
828 // Straighten out to make 4 row vectors
829 __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
830 __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
831 __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
832 __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
833
834 // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
835 const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
836 const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
837
838 // Multiply with appropiate scales and accumulate
839 acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
840 acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
841 acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
842 acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
843 }
844 }
845
846 // Store the accumulated values
847 for (int i = 0; i < 16; i++) {
848 _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
849 }
850 }
851 }
852
853 // Take a block_q8_0x4 structures at each pass of the loop and perform dot product operation
854 for (; y < nr / 4; y ++) {
855 const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
856
857 // Take group of two block_tx8 structures at each pass of the loop and perform dot product operation
858 for (int64_t x = 0; x < anc / 8; x += 2) {
859
860 const block_tx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
861 const block_tx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
862
863 // Master FP accumulators
864 __m512 acc_rows[4];
865 for (int i = 0; i < 4; i++) {
866 acc_rows[i] = _mm512_setzero_ps();
867 }
868
869 for (int64_t b = 0; b < nb; b++) {
870 // Load the sixteen blocks of quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
871 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
872 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
873 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
874 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
875
876 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
877 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
878 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
879 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
880
881 // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
882 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
883 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
884 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
885 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
886
887 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
888 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
889 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
890 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
891
892 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
893 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
894 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
895 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
896
897 // 4-bit -> 8-bit - Sign is maintained
898 const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
899 const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
900
901 const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
902 const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
903
904 const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
905 const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
906
907 const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
908 const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
909
910 // Shuffle pattern one - right side input
911 const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
912 const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
913
914 const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
915 const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
916
917 const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
918 const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
919
920 const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
921 const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
922
923 // Shuffle pattern two - right side input
924
925 const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
926 const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
927
928 const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
929 const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
930
931 const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
932 const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
933
934 const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
935 const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
936
937
938 // Scale values - Load the weight scale values of two block_tx8
939 __m512 col_scale_f32;
940 if constexpr (
941 std::is_same_v<block_tx8, block_q4_0x8> ||
942 std::is_same_v<block_tx8, block_iq4_nlx8>) {
943 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
944 }
945
946 // Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
            // Loaded as a set of 128-bit vectors, each repeated into a 256-bit vector and then repeated again into a 512-bit vector
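            // _mm256_permute2f128_si256(v, v, 0) broadcasts the low 128-bit half of v into both halves and imm 17 (0x11)
            // broadcasts the high half; _mm512_inserti32x8 then duplicates the resulting 256-bit vector into both halves
            // of a 512-bit register, so the same pair of LHS rows faces all sixteen interleaved RHS columns.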
948 __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
949 __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
950 __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
951 __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
952 __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
953 __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
954 __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
955 __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
956 __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
957 __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
958 __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
959 __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
960
961 __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
962 __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
963 __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
964 __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
965 __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
966 __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
967 __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
968 __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
969
970 // Shuffle pattern one - left side input
971
972 const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
973 const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
974
975 const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
976 const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
977
978 const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
979 const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
980
981 const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
982 const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
983
984 // Shuffle pattern two - left side input
985
986 const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
987 const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
988
989 const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
990 const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
991
992 const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
993 const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
994
995 const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
996 const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
997
            // The values arranged in the shuffle patterns are combined with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and the products are accumulated into 32-bit integers per lane
            // Resembles the MMLA accumulation into 2x2 matrices used in the ARM version
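            // mul_sum_i8_pairs_acc_int32x16 is a helper defined elsewhere in this backend; it is assumed to behave like a
            // dpbusd-style multiply-accumulate (multiply int8 pairs, add adjacent products, accumulate into int32 lanes).
            // Chaining the four calls below therefore accumulates, per output lane, something like:
            //   int32_t acc = 0;
            //   for (int k = 0; k < 32; ++k) acc += (int32_t) lhs_row[k] * (int32_t) rhs_col[k];
            // split across the two shuffle patterns, which are summed right after.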
1000 const __m512i zero = _mm512_setzero_epi32();
1001 __m512i iacc_mat_00_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1), lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1), lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1);
1002 __m512i iacc_mat_01_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1), lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1), lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1);
1003 __m512i iacc_mat_10_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1), lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1), lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1);
1004 __m512i iacc_mat_11_sp1 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1), lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1), lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1);
1005 __m512i iacc_mat_00_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2), lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2), lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2);
1006 __m512i iacc_mat_01_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2), lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2), lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2);
1007 __m512i iacc_mat_10_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2), lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2), lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2);
1008 __m512i iacc_mat_11_sp2 = mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(mul_sum_i8_pairs_acc_int32x16(zero, lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2), lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2), lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2);
1009
            // Outputs of both shuffle patterns are added together to sum the dot products over all 32 values in the block
1011 __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
1012 __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
1013 __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
1014 __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
1015
1016
1017 // Straighten out to make 4 row vectors
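            // Each iacc_mat_xy holds 2x2 tiles of results; shuffling with imm 78 (0b01'00'11'10) swaps the two dword pairs
            // of every 128-bit lane, and the 0xCCCC mask (the AVX2 path further down uses the equivalent blend immediate
            // 204 = 0xCC) keeps dwords {0,1} of each group from the first operand and takes dwords {2,3} from the shuffled
            // second operand, lining the tiles back up into four plain row vectors.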
1018 __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
1019 __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
1020 __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
1021 __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
1022
            // Load the scale values (the d field) of the four Q8_0 blocks and repeat them across lanes
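            // loadMask (set up earlier in this function) is assumed to enable only the first two dwords, so the maskload
            // pulls in just the four fp16 deltas of the block_q8_0x4; _mm_shuffle_epi32 with imm 68 (0b01'00'01'00)
            // duplicates those two dwords, and GGML_F32Cx16_REPEAT_LOAD repeats the 128-bit result and converts the
            // sixteen half-precision values to fp32 across the 512-bit register.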
1024 const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
1025 const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
1026
            // Multiply with appropriate scales and accumulate
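            // row_scale_f32 holds the four LHS deltas repeated in every 128-bit lane, so _mm512_shuffle_ps with imm
            // 0/85/170/255 broadcasts delta 0/1/2/3 lane-wide. Each fmadd therefore accumulates, per output element,
            //   acc += d_lhs[row] * d_rhs[col] * (integer dot product of the row and column quants)
            // i.e. the dequantized dot product of the two blocks.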
1028 acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
1029 acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
1030 acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
1031 acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
1032 }
1033
1034 // Store the accumulated values
1035 for (int i = 0; i < 4; i++) {
1036 _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
1037 }
1038 }
1039 }
1040 if (anc != nc) {
1041 xstart = anc/8;
1042 y = 0;
1043 }
1044#endif // __AVX512F__
1045
    // Take a group of four block_q8_0x4 structures at each pass of the loop and perform the dot product operation
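    // Each pass of this loop produces a 16x8 output tile: four block_q8_0x4 row groups (16 LHS rows) against eight
    // interleaved RHS columns, accumulated in the sixteen acc_rows vectors below.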
1047
1048 for (; y < anr / 4; y += 4) {
1049 const block_q8_0x4 * a_ptrs[4];
1050
1051 a_ptrs[0] = a_ptr_start + (y * nb);
1052 for (int i = 0; i < 3; ++i) {
1053 a_ptrs[i + 1] = a_ptrs[i] + nb;
1054 }
1055
1056 // Take group of eight block_tx8 structures at each pass of the loop and perform dot product operation
1057 for (int64_t x = xstart; x < nc / 8; x++) {
1058
1059 const block_tx8 * b_ptr = b_ptr_start + (x * b_nb);
1060
1061 // Master FP accumulators
1062 __m256 acc_rows[16];
1063 for (int i = 0; i < 16; i++) {
1064 acc_rows[i] = _mm256_setzero_ps();
1065 }
1066
1067 for (int64_t b = 0; b < nb; b++) {
1068 // Load the eight blocks of quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
1073
1074 // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
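                // requiredOrder (set up earlier) is assumed to swap the 128-bit halves when used with
                // _mm256_permutevar8x32_epi32; blending with imm 240 (0xF0) then keeps the low four dwords of the first
                // operand and the high four dwords of the second, pairing columns {0,1} with {4,5} and {2,3} with {6,7}
                // in a single register.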
1075 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
1076 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
1077 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
1078 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
1079
1080 // 4-bit -> 8-bit - Sign is maintained
                const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
                const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)

                const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
                const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)

                const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
                const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)

                const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
                const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
1092
1093 // Shuffle pattern one - right side input
1094 const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
1095 const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
1096
1097 const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
1098 const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
1099
1100 const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
1101 const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
1102
1103 const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
1104 const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
1105
1106 // Shuffle pattern two - right side input
1107
1108 const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
1109 const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
1110
1111 const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
1112 const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
1113
1114 const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
1115 const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
1116
1117 const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
1118 const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
1119
                // Scale values - Load the weight scale values of block_tx8
1121 __m256 col_scale_f32;
1122 if constexpr (
1123 std::is_same_v<block_tx8, block_q4_0x8> ||
1124 std::is_same_v<block_tx8, block_iq4_nlx8>) {
1125 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
1126 }
1127
1128 // Process LHS in groups of four
1129 for (int rp = 0; rp < 4; rp++) {
1130 // Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
1131 // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
                    __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
1133 __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
1134 __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
                    __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
1136 __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
1137 __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
                    __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
1139 __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
1140 __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
                    __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
1142 __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
1143 __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
1144
1145 // Shuffle pattern one - left side input
1146 const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
1147 const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
1148
1149 const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
1150 const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
1151
1152 const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
1153 const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
1154
1155 const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
1156 const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
1157
1158 // Shuffle pattern two - left side input
1159 const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
1160 const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
1161
1162 const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
1163 const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
1164
1165 const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
1166 const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
1167
1168 const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
1169 const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
1170
                    // The values arranged in the shuffle patterns are combined with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and the products are accumulated into 32-bit integers per lane
                    // Resembles the MMLA accumulation into 2x2 matrices used in the ARM version
1173 const __m256i zero = _mm256_setzero_si256();
                    __m256i iacc_mat_00_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1), lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1), lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1);
                    __m256i iacc_mat_01_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1), lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1), lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1);
                    __m256i iacc_mat_10_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1), lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1), lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1);
                    __m256i iacc_mat_11_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1), lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1), lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1);
                    __m256i iacc_mat_00_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2), lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2), lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2);
                    __m256i iacc_mat_01_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2), lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2), lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2);
                    __m256i iacc_mat_10_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2), lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2), lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2);
                    __m256i iacc_mat_11_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2), lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2), lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2);
1182
                    // Outputs of both shuffle patterns are added together to sum the dot products over all 32 values in the block
                    __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
                    __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
                    __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
                    __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
1188
1189 // Straighten out to make 4 row vectors
1190 __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
1191 __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
1192 __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
1193 __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
1194
                    // Load the scale values (the d field) of the four Q8_0 blocks and repeat them across lanes
1196 const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
1197
                    // Multiply with appropriate scales and accumulate
                    acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                    acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                    acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
                    acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
1203 }
1204 }
1205
1206 // Store the accumulated values
1207 for (int i = 0; i < 16; i++) {
                _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
1209 }
1210 }
1211 }
1212
    // Take one block_q8_0x4 structure at each pass of the loop and perform the dot product operation
1214 for (; y < nr / 4; y ++) {
1215 const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
1216
1217 // Load the eight blocks of quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
1218 for (int64_t x = xstart; x < nc / 8; x++) {
1219 const block_tx8 * b_ptr = b_ptr_start + (x * b_nb);
1220
1221 // Master FP accumulators
1222 __m256 acc_rows[4];
1223 for (int i = 0; i < 4; i++) {
1224 acc_rows[i] = _mm256_setzero_ps();
1225 }
1226
1227 for (int64_t b = 0; b < nb; b++) {
                // Load the eight blocks of quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
                const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
                const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
                const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
                const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
1233
1234 // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
1235 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
1236 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
1237 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
1238 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
1239
1240 // 4-bit -> 8-bit - Sign is maintained
                const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
                const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)

                const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
                const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)

                const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
                const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)

                const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
                const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
1252
1253 // Shuffle pattern one - right side input
1254 const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
1255 const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
1256
1257 const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
1258 const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
1259
1260 const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
1261 const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
1262
1263 const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
1264 const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
1265
1266 // Shuffle pattern two - right side input
1267
1268 const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
1269 const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
1270
1271 const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
1272 const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
1273
1274 const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
1275 const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
1276
1277 const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
1278 const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
1279
                // Scale values - Load the weight scale values of block_tx8
1281 __m256 col_scale_f32;
1282 if constexpr (
1283 std::is_same_v<block_tx8, block_q4_0x8> ||
1284 std::is_same_v<block_tx8, block_iq4_nlx8>) {
1285 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
1286 }
1287
1288 // Load the four blocks of quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
1289 // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
                __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
1291 __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
1292 __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
                __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
1294 __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
1295 __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
                __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
1297 __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
1298 __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
                __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
1300 __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
1301 __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
1302
1303 // Shuffle pattern one - left side input
1304
1305 const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
1306 const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
1307
1308 const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
1309 const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
1310
1311 const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
1312 const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
1313
1314 const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
1315 const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
1316
1317 // Shuffle pattern two - left side input
1318
1319 const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
1320 const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
1321
1322 const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
1323 const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
1324
1325 const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
1326 const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
1327
1328 const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
1329 const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
1330
                // The values arranged in the shuffle patterns are combined with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and the products are accumulated into 32-bit integers per lane
                // Resembles the MMLA accumulation into 2x2 matrices used in the ARM version
1333 const __m256i zero = _mm256_setzero_si256();
                __m256i iacc_mat_00_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1), lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1), lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1);
                __m256i iacc_mat_01_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1), lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1), lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1);
                __m256i iacc_mat_10_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1), lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1), lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1);
                __m256i iacc_mat_11_sp1 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1), lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1), lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1);
                __m256i iacc_mat_00_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2), lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2), lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2);
                __m256i iacc_mat_01_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2), lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2), lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2);
                __m256i iacc_mat_10_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2), lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2), lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2);
                __m256i iacc_mat_11_sp2 = mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(mul_sum_i8_pairs_acc_int32x8(zero, lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2), lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2), lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2);
1342
                // Outputs of both shuffle patterns are added together to sum the dot products over all 32 values in the block
                __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
                __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
                __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
                __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
1348
1349
1350 // Straighten out to make 4 row vectors
1351 __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
1352 __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
1353 __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
1354 __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
1355
                // Load the scale values (the d field) of the four Q8_0 blocks and repeat them across lanes
1357 const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
1358
                // Multiply with appropriate scales and accumulate
                acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
                acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
1364 }
1365
1366 // Store the accumulated values
1367 for (int i = 0; i < 4; i++) {
                _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
1369 }
1370 }
1371 }
1372}
1373
1374#endif // defined(__AVX2__) || defined(__AVX512F__)
1375
1376void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1377#if defined(__AVX2__) || defined(__AVX512F__)
1378 {
1379 // Lookup table to convert signed nibbles to signed bytes
        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
1381 signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
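        // _mm_set_epi8 lists bytes from index 15 down to 0, so the table maps index 0..7 to 0..7 and index 8..15 to
        // -8..-1; the permute2f128 with imm 0 copies the 128-bit table into both halves of the 256-bit register,
        // since the byte shuffle used later indexes each 128-bit lane independently.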
1382
1383 gemv_q4_b32_8x8_q8_0_lut_avx<block_q4_0x8>(n, s, bs, vx, vy, nr, nc, signextendlut);
1384
1385 return;
1386 }
1387#endif
1388
1389 ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
1390}
1391
1392void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1393 const int qk = QK_K;
1394 const int nb = n / qk;
1395 const int ncols_interleaved = 8;
1396 const int blocklen = 8;
1397 static const uint32_t kmask1 = 0x3f3f3f3f;
1398 static const uint32_t kmask2 = 0x0f0f0f0f;
1399 static const uint32_t kmask3 = 0x03030303;
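    // Q4_K packs eight 6-bit sub-block scales and eight 6-bit sub-block mins into 12 bytes per 256-value super-block;
    // kmask1/kmask2/kmask3 are the usual masks used below to unpack those 6-bit fields from the three 32-bit words of
    // that packed layout (here read from the interleaved block_q4_Kx8 scales).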
1400
1401 assert (n % qk == 0);
1402 assert (nc % ncols_interleaved == 0);
1403
1404 UNUSED(s);
1405 UNUSED(bs);
1406 UNUSED(vx);
1407 UNUSED(vy);
1408 UNUSED(nr);
1409 UNUSED(nc);
1410 UNUSED(nb);
1411 UNUSED(ncols_interleaved);
1412 UNUSED(blocklen);
1413
1414#if defined(__AVX2__)
1415 // Lookup table to convert signed nibbles to signed bytes
    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
1417 signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
1418 // Shuffle masks to rearrange delta and scale values to multiply with appropriate scales
    __m128i deltamask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
    __m128i scalemask = _mm_set_epi8(7, 7, 3, 3, 6, 6, 2, 2, 5, 5, 1, 1, 4, 4, 0, 0);
    // Permute mask used for easier vector processing at later stages
    __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
1423
1424 // Mask to extract nibbles from bytes
    const __m256i m4b = _mm256_set1_epi8(0x0F);
1426
1427 int64_t b_nb = n / QK_K;
1428
1429 const block_q4_Kx8 * b_ptr_start = (const block_q4_Kx8 *)vx;
1430 const block_q8_K * a_ptr_start = (const block_q8_K *)vy;
1431
1432 // Process Q8_K blocks one by one
1433 for (int64_t y = 0; y < nr; y++) {
1434
1435 // Pointers to LHS blocks of block_q8_K format
1436 const block_q8_K * a_ptr = a_ptr_start + (y * nb);
1437
1438 // Take group of eight interleaved block_q4_K structures at each pass of the loop and perform dot product operation
1439 for (int64_t x = 0; x < nc / 8; x++) {
1440
1441 // Pointers to RHS blocks
1442 const block_q4_Kx8 * b_ptr = b_ptr_start + (x * b_nb);
1443
1444 // Master FP accumulators
1445 __m256 acc_row = _mm256_setzero_ps();
1446 __m256 acc_min_rows = _mm256_setzero_ps();
1447
1448 for (int64_t b = 0; b < nb; b++) {
1449
1450 // Load and convert to FP32 scale from block_q8_K
                const __m256 row_scale_f32 = _mm256_set1_ps((a_ptr[b].d));
1452
1453 // Load the scale values for the 8 blocks interleaved in block_q4_Kx8
1454 // col_scale_f32 rearranged so as to multiply with appropriate quants
1455 const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, deltamask);
1456 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);
1457
1458 __m256i iacc_b = _mm256_setzero_si256();
1459 __m256i iacc_min_b = _mm256_setzero_si256();
1460
                const __m256i q8sums = _mm256_loadu_si256((const __m256i * )(a_ptr[b].bsums));
                __m256i q8s = _mm256_castsi128_si256(_mm_hadd_epi16(_mm256_castsi256_si128(q8sums), _mm256_extracti128_si256(q8sums, 1)));
1463 q8s = _mm256_permute2f128_si256(q8s, q8s, 0);
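                // block_q8_K keeps bsums as sixteen 16-bit sums over groups of 16 quants; the hadd above pairs them into
                // eight sums over 32 quants, one per Q4_K sub-block, so they can later be combined with the sub-block
                // mins and dmin to subtract the minimum contribution from the accumulated dot products.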
1464
1465 // Processes two sub blocks from each Q4_K in each iteration
1466 for (int sb = 0; sb < QK_K / 64; sb++) {
1467
1468 // Load the eight block_q4_K for two sub blocks quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
                    const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + sb * 256));
                    const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 32 + sb * 256));
                    const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 64 + sb * 256));
                    const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 96 + sb * 256));
                    const __m256i rhs_raw_vec_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 128 + sb * 256));
                    const __m256i rhs_raw_vec_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 160 + sb * 256));
                    const __m256i rhs_raw_vec_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 192 + sb * 256));
                    const __m256i rhs_raw_vec_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr[b].qs + 224 + sb * 256));
1477
1478 // 4-bit -> 8-bit
1479 // Values of the first sub block of eight block_q4_K structures for the sb loop
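                    // Unlike the q4_0 path, no sign-extension table is used here: Q4_K nibbles stay unsigned (0..15) and
                    // the per-sub-block minimum (scaled by dmin) is subtracted separately via the bsums/mins accumulation,
                    // so masking with m4b is sufficient.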
                    const __m256i rhs_vec_0123_00 = _mm256_and_si256(rhs_raw_vec_0123_0, m4b);
                    const __m256i rhs_vec_4567_00 = _mm256_and_si256(rhs_raw_vec_4567_0, m4b);
                    const __m256i rhs_vec_0123_01 = _mm256_and_si256(rhs_raw_vec_0123_1, m4b);
                    const __m256i rhs_vec_4567_01 = _mm256_and_si256(rhs_raw_vec_4567_1, m4b);
                    const __m256i rhs_vec_0123_02 = _mm256_and_si256(rhs_raw_vec_0123_2, m4b);
                    const __m256i rhs_vec_4567_02 = _mm256_and_si256(rhs_raw_vec_4567_2, m4b);
                    const __m256i rhs_vec_0123_03 = _mm256_and_si256(rhs_raw_vec_0123_3, m4b);
                    const __m256i rhs_vec_4567_03 = _mm256_and_si256(rhs_raw_vec_4567_3, m4b);

                    // Values of the second sub block of eight block_q4_K structures when sb = 1
                    const __m256i rhs_vec_0123_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b);
                    const __m256i rhs_vec_4567_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b);
                    const __m256i rhs_vec_0123_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b);
                    const __m256i rhs_vec_4567_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b);
                    const __m256i rhs_vec_0123_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_2, 4), m4b);
                    const __m256i rhs_vec_4567_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_2, 4), m4b);
                    const __m256i rhs_vec_0123_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_3, 4), m4b);
                    const __m256i rhs_vec_4567_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_3, 4), m4b);
1498
1499 uint32_t utmp_0[4], utmp_1[4];
1500
                    // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together
1502 // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop
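                    // This is the usual K-quant 6-bit unpack: after the mask-and-shift sequence, the first eight bytes of
                    // utmp hold the 6-bit scales and the next eight bytes hold the 6-bit mins (here for one sub-block
                    // position across the eight interleaved Q4_K blocks), ready to be widened to 16 bits below.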
                    memcpy(utmp_0, b_ptr[b].scales + 24 * sb, 12);
1504 utmp_0[3] = ((utmp_0[2] >> 4) & kmask2) | (((utmp_0[1] >> 6) & kmask3) << 4);
1505 const uint32_t uaux_0 = utmp_0[1] & kmask1;
1506 utmp_0[1] = (utmp_0[2] & kmask2) | (((utmp_0[0] >> 6) & kmask3) << 4);
1507 utmp_0[2] = uaux_0;
1508 utmp_0[0] &= kmask1;
1509
1510 // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop
                    memcpy(utmp_1, b_ptr[b].scales + 12 + sb * 24, 12);
1512 utmp_1[3] = ((utmp_1[2] >> 4) & kmask2) | (((utmp_1[1] >> 6) & kmask3) << 4);
1513 const uint32_t uaux_1 = utmp_1[1] & kmask1;
1514 utmp_1[1] = (utmp_1[2] & kmask2) | (((utmp_1[0] >> 6) & kmask3) << 4);
1515 utmp_1[2] = uaux_1;
1516 utmp_1[0] &= kmask1;
1517
1518 // Scales of first sub block in the sb loop
                    const __m128i mins_and_scales_0 = _mm_set_epi32(utmp_0[3], utmp_0[2], utmp_0[1], utmp_0[0]);
                    __m128i scales_rearrange_0 = _mm_shuffle_epi8(mins_and_scales_0, scalemask);
                    __m256i scales_0 = _mm256_cvtepu8_epi16(scales_rearrange_0);
1522
1523 // Scales of second sub block in the sb loop
                    __m128i mins_and_scales_1 = _mm_set_epi32(utmp_1[3], utmp_1[2], utmp_1[1], utmp_1[0]);
                    __m128i scales_rearrange_1 = _mm_shuffle_epi8(mins_and_scales_1, scalemask);
                    __m256i scales_1 = _mm256_cvtepu8_epi16(scales_rearrange_1);
1527
1528 // Mins of first and second sub block of Q4_K block are arranged side by side
                    __m256i mins_01 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(_mm_shuffle_epi32(mins_and_scales_0, 78), _mm_shuffle_epi32(mins_and_scales_1, 78)));
1530
1531 // Load the two sub block values corresponding to sb in block_q8_K in batches of 16 bytes and replicate the same across 256 bit vector
                    __m256i lhs_vec_00 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + sb * 64)));
                    __m256i lhs_vec_01 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16 + sb * 64)));
                    __m256i lhs_vec_10 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 32 + sb * 64)));
                    __m256i lhs_vec_11 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 48 + sb * 64)));
1536
1537 lhs_vec_00 = _mm256_permute2f128_si256(lhs_vec_00, lhs_vec_00, 0);
1538 lhs_vec_01 = _mm256_permute2f128_si256(lhs_vec_01, lhs_vec_01, 0);
1539 lhs_vec_10 = _mm256_permute2f128_si256(lhs_vec_10, lhs_vec_10, 0);
1540 lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);
1541
1542 // Dot product done within 32 bit lanes and accumulated in the same vector
1543 // First done for first sub block and thenn for second sub block in each sb
1544 // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
1545 // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
1546 // ...........................................................................
1547 // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
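                    // The blend of rhs_vec_0123_xx with the dword-swapped rhs_vec_4567_xx (shuffle imm 177 swaps adjacent
                    // 32 bit lanes, blend imm 170 keeps the odd lanes) interleaves the columns as B0 B4 B1 B5 B2 B6 B3 B7,
                    // while shuffling the lhs with imm 0/85/170/255 broadcasts one 4-byte group of A across every lane.
                    // maddubs produces 16 bit unsigned*signed pair sums per column; the later madd_epi16 with the 16 bit
                    // sub block scales widens the accumulator to 32 bit while applying the scale.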

                    __m256i iacc_0 = _mm256_setzero_si256();
                    __m256i iacc_1 = _mm256_setzero_si256();

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_00, _mm256_shuffle_epi32(rhs_vec_4567_00, 177), 170), _mm256_shuffle_epi32(lhs_vec_00, 0)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_00, 177), rhs_vec_4567_00, 170), _mm256_shuffle_epi32(lhs_vec_00, 85)));

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_01, _mm256_shuffle_epi32(rhs_vec_4567_01, 177), 170), _mm256_shuffle_epi32(lhs_vec_00, 170)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_01, 177), rhs_vec_4567_01, 170), _mm256_shuffle_epi32(lhs_vec_00, 255)));

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_02, _mm256_shuffle_epi32(rhs_vec_4567_02, 177), 170), _mm256_shuffle_epi32(lhs_vec_01, 0)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_02, 177), rhs_vec_4567_02, 170), _mm256_shuffle_epi32(lhs_vec_01, 85)));

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_03, _mm256_shuffle_epi32(rhs_vec_4567_03, 177), 170), _mm256_shuffle_epi32(lhs_vec_01, 170)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_03, 177), rhs_vec_4567_03, 170), _mm256_shuffle_epi32(lhs_vec_01, 255)));

                    iacc_0 = _mm256_madd_epi16(iacc_0, scales_0);

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_10, _mm256_shuffle_epi32(rhs_vec_4567_10, 177), 170), _mm256_shuffle_epi32(lhs_vec_10, 0)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_10, 177), rhs_vec_4567_10, 170), _mm256_shuffle_epi32(lhs_vec_10, 85)));

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_11, _mm256_shuffle_epi32(rhs_vec_4567_11, 177), 170), _mm256_shuffle_epi32(lhs_vec_10, 170)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_11, 177), rhs_vec_4567_11, 170), _mm256_shuffle_epi32(lhs_vec_10, 255)));

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_12, _mm256_shuffle_epi32(rhs_vec_4567_12, 177), 170), _mm256_shuffle_epi32(lhs_vec_11, 0)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_12, 177), rhs_vec_4567_12, 170), _mm256_shuffle_epi32(lhs_vec_11, 85)));

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_13, _mm256_shuffle_epi32(rhs_vec_4567_13, 177), 170), _mm256_shuffle_epi32(lhs_vec_11, 170)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_13, 177), rhs_vec_4567_13, 170), _mm256_shuffle_epi32(lhs_vec_11, 255)));

                    iacc_1 = _mm256_madd_epi16(iacc_1, scales_1);

                    // Accumulate the iacc value for one sb
                    __m256i iacc_sb = _mm256_add_epi32(iacc_0, iacc_1);

                    // Broadcast the bsums of the two Q8_K sub blocks of this iteration across the vector
                    // Multiply-Add the corresponding mins of Q4_Kx8 with the bsums
                    __m256i q8s_sb = _mm256_shuffle_epi32(q8s, 0);
                    __m256i iacc_min_sb = _mm256_madd_epi16(q8s_sb, mins_01);
                    q8s = _mm256_bsrli_epi128(q8s, 4);
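                    // q8s holds the Q8_K bsums (per sub block sums of the activations); multiplying them with the Q4_K mins
                    // builds the correction term that is subtracted at the end, since Q4_K dequantizes as d*sc*q - dmin*m.
                    // The 4-byte shift moves the next pair of bsums into the low dword for the next sb iteration.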

                    // Accumulate for the complete block
                    iacc_b = _mm256_add_epi32(iacc_b, iacc_sb);
                    iacc_min_b = _mm256_add_epi32(iacc_min_b, iacc_min_sb);
                }

                // Multiply-Add with scale values for the complete super block
                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_b), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
                acc_min_rows = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_min_b), _mm256_mul_ps(col_dmin_f32, row_scale_f32), acc_min_rows);
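                // acc_row accumulates the d * sc * dot(q4, q8) terms and acc_min_rows the dmin * min * bsum correction;
                // the store below subtracts the latter from the former.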

            }

            // Accumulated output values permuted so as to be stored in the appropriate order post accumulation
            acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
            _mm256_storeu_ps(s + (y * nr + x * 8), _mm256_sub_ps(acc_row, acc_min_rows));
        }
    }

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
#endif
}

void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
#if defined(__AVX2__)
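    // kvalues_iq4nl (from ggml-common.h) is the 16-entry IQ4_NL lookup table; broadcasting it across both
    // 128-bit lanes lets the shared gemv template map packed nibbles to int8 codebook values with byte shuffles.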
    __m256i signextendlut = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)kvalues_iq4nl));
    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);

    gemv_q4_b32_8x8_q8_0_lut_avx<block_iq4_nlx8>(n, s, bs, vx, vy, nr, nc, signextendlut);

    return;
#endif

    ggml_gemv_iq4_nl_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if defined(__AVX2__)
    // Lookup table to convert signed nibbles to signed bytes
    __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
    signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
    // Shuffle masks to rearrange delta values to multiply with appropriate scales
    __m128i deltamask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
    // Permute mask used for easier vector processing at later stages
    __m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);

    const __m256i m3b = _mm256_set1_epi8(3);
    const __m128i m4b_sse = _mm_set1_epi8(0xF);

    // Masks to extract the appropriate scales
    __m128i scalemask1 = _mm_set_epi8(14, 14, 6, 6, 12, 12, 4, 4, 10, 10, 2, 2, 8, 8, 0, 0);
    __m128i scalemask2 = _mm_set_epi8(15, 15, 7, 7, 13, 13, 5, 5, 11, 11, 3, 3, 9, 9, 1, 1);
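    // scalemask1/scalemask2 gather, for each of the eight interleaved columns, the scale of the first/second
    // sub block of the pair held in each 16-byte scales load, in the B0 B4 B1 B5 B2 B6 B3 B7 column order used
    // by the dot product below; each byte is duplicated so that after widening to 16 bit the madd_epi16 step
    // applies the same scale to both halves of a column's partial sum.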

    int64_t b_nb = n / QK_K;

    const block_q2_Kx8 * b_ptr_start = (const block_q2_Kx8 *)vx;
    const block_q8_K * a_ptr_start = (const block_q8_K *)vy;

    // Process Q8_K blocks one by one
    for (int64_t y = 0; y < nr; y++) {

        // Pointers to LHS blocks of block_q8_K format
        const block_q8_K * a_ptr = a_ptr_start + (y * nb);

        // Take group of eight interleaved block_q2_K structures at each pass of the loop and perform dot product operation
        for (int64_t x = 0; x < nc / 8; x++) {

            // Pointers to RHS blocks
            const block_q2_Kx8 * b_ptr = b_ptr_start + (x * b_nb);

            // Master FP accumulators
            __m256 acc_row = _mm256_setzero_ps();
            __m256 acc_min_rows = _mm256_setzero_ps();

            for (int64_t b = 0; b < nb; b++) {

                // Load the delta from block_q8_K and convert it to FP32
                const __m256 row_scale_f32 = _mm256_set1_ps((a_ptr[b].d));

                // Load the delta values for the 8 blocks interleaved in block_q2_Kx8
                // col_scale_f32 rearranged so as to multiply with appropriate quants
                const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, deltamask);
                const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);

                __m256i iacc_b = _mm256_setzero_si256();
                __m256i iacc_min_b = _mm256_setzero_si256();

                // Processes eight sub blocks from each Q2_K in each iteration
                for (int sb = 0; sb < QK_K / 128; sb++) {

                    // Load the quantized values of eight sub blocks from the eight block_q2_K structures, interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7
                    const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + sb * 256));
                    const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32 + sb * 256));
                    const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64 + sb * 256));
                    const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96 + sb * 256));
                    const __m256i rhs_raw_vec_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 128 + sb * 256));
                    const __m256i rhs_raw_vec_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 160 + sb * 256));
                    const __m256i rhs_raw_vec_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 192 + sb * 256));
                    const __m256i rhs_raw_vec_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 224 + sb * 256));

                    // 2-bit -> 8-bit
                    // Values of the 0th, 2nd, 4th, 6th sub blocks of eight block_q2_K structures for the sb loop
                    const __m256i rhs_vec_0123_00 = _mm256_and_si256(rhs_raw_vec_0123_0, m3b); //B00(0-7) B01(0-7) B02(0-7) B03(0-7)
                    const __m256i rhs_vec_0123_20 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 2), m3b); //B20(0-7) B21(0-7) B22(0-7) B23(0-7)
                    const __m256i rhs_vec_0123_40 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m3b); //B40(0-7) B41(0-7) B42(0-7) B43(0-7)
                    const __m256i rhs_vec_0123_60 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 6), m3b); //B60(0-7) B61(0-7) B62(0-7) B63(0-7)

                    const __m256i rhs_vec_4567_00 = _mm256_and_si256(rhs_raw_vec_4567_0, m3b); //B04(0-7) B05(0-7) B06(0-7) B07(0-7)
                    const __m256i rhs_vec_4567_20 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 2), m3b); //B24(0-7) B25(0-7) B26(0-7) B27(0-7)
                    const __m256i rhs_vec_4567_40 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m3b); //B44(0-7) B45(0-7) B46(0-7) B47(0-7)
                    const __m256i rhs_vec_4567_60 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 6), m3b); //B64(0-7) B65(0-7) B66(0-7) B67(0-7)

                    const __m256i rhs_vec_0123_01 = _mm256_and_si256(rhs_raw_vec_0123_1, m3b); //B00(8-15) B01(8-15) B02(8-15) B03(8-15)
                    const __m256i rhs_vec_0123_21 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 2), m3b); //B20(8-15) B21(8-15) B22(8-15) B23(8-15)
                    const __m256i rhs_vec_0123_41 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m3b); //B40(8-15) B41(8-15) B42(8-15) B43(8-15)
                    const __m256i rhs_vec_0123_61 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 6), m3b); //B60(8-15) B61(8-15) B62(8-15) B63(8-15)

                    const __m256i rhs_vec_4567_01 = _mm256_and_si256(rhs_raw_vec_4567_1, m3b); //B04(8-15) B05(8-15) B06(8-15) B07(8-15)
                    const __m256i rhs_vec_4567_21 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 2), m3b); //B24(8-15) B25(8-15) B26(8-15) B27(8-15)
                    const __m256i rhs_vec_4567_41 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m3b); //B44(8-15) B45(8-15) B46(8-15) B47(8-15)
                    const __m256i rhs_vec_4567_61 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 6), m3b); //B64(8-15) B65(8-15) B66(8-15) B67(8-15)

                    // Values of the 1st, 3rd, 5th, 7th sub blocks of eight block_q2_K structures for the sb loop
                    const __m256i rhs_vec_0123_10 = _mm256_and_si256(rhs_raw_vec_0123_2, m3b); //B10(0-7) B11(0-7) B12(0-7) B13(0-7)
                    const __m256i rhs_vec_0123_30 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_2, 2), m3b); //B30(0-7) B31(0-7) B32(0-7) B33(0-7)
                    const __m256i rhs_vec_0123_50 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_2, 4), m3b); //B50(0-7) B51(0-7) B52(0-7) B53(0-7)
                    const __m256i rhs_vec_0123_70 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_2, 6), m3b); //B70(0-7) B71(0-7) B72(0-7) B73(0-7)

                    const __m256i rhs_vec_4567_10 = _mm256_and_si256(rhs_raw_vec_4567_2, m3b); //B14(0-7) B15(0-7) B16(0-7) B17(0-7)
                    const __m256i rhs_vec_4567_30 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_2, 2), m3b); //B34(0-7) B35(0-7) B36(0-7) B37(0-7)
                    const __m256i rhs_vec_4567_50 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_2, 4), m3b); //B54(0-7) B55(0-7) B56(0-7) B57(0-7)
                    const __m256i rhs_vec_4567_70 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_2, 6), m3b); //B74(0-7) B75(0-7) B76(0-7) B77(0-7)

                    const __m256i rhs_vec_0123_11 = _mm256_and_si256(rhs_raw_vec_0123_3, m3b); //B10(8-15) B11(8-15) B12(8-15) B13(8-15)
                    const __m256i rhs_vec_0123_31 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_3, 2), m3b); //B30(8-15) B31(8-15) B32(8-15) B33(8-15)
                    const __m256i rhs_vec_0123_51 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_3, 4), m3b); //B50(8-15) B51(8-15) B52(8-15) B53(8-15)
                    const __m256i rhs_vec_0123_71 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_3, 6), m3b); //B70(8-15) B71(8-15) B72(8-15) B73(8-15)

                    const __m256i rhs_vec_4567_11 = _mm256_and_si256(rhs_raw_vec_4567_3, m3b); //B14(8-15) B15(8-15) B16(8-15) B17(8-15)
                    const __m256i rhs_vec_4567_31 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_3, 2), m3b); //B34(8-15) B35(8-15) B36(8-15) B37(8-15)
                    const __m256i rhs_vec_4567_51 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_3, 4), m3b); //B54(8-15) B55(8-15) B56(8-15) B57(8-15)
                    const __m256i rhs_vec_4567_71 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_3, 6), m3b); //B74(8-15) B75(8-15) B76(8-15) B77(8-15)

                    // Scales and mins of corresponding sub blocks from different Q2_K structures are stored together
                    // s00 m00 s01 m01 s10 m10 s11 m11 s20 m20 s21 m21 s30 m30 s31 m31 s40 m40 s41 m41 s50 m50 s51 m51 s60 m60 s61 m61 s70 m70 s71 m71

                    const __m128i mins_and_scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64));
                    const __m128i mins_and_scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64));
                    const __m128i mins_and_scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64));
                    const __m128i mins_and_scales_67 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 48 + sb * 64));

                    // Extract the scales, stored in the lower nibble of each byte of mins_and_scales
                    const __m128i scales_01 = _mm_and_si128(mins_and_scales_01, m4b_sse);
                    const __m128i scales_23 = _mm_and_si128(mins_and_scales_23, m4b_sse);
                    const __m128i scales_45 = _mm_and_si128(mins_and_scales_45, m4b_sse);
                    const __m128i scales_67 = _mm_and_si128(mins_and_scales_67, m4b_sse);

                    // Extract the mins, stored in the upper nibble of each byte of mins_and_scales
                    const __m256i mins_01 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_01, 4), m4b_sse));
                    const __m256i mins_23 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_23, 4), m4b_sse));
                    const __m256i mins_45 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_45, 4), m4b_sse));
                    const __m256i mins_67 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_67, 4), m4b_sse));
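                    // The mins are widened to 16 bit here so they can be paired directly with the 16 bit bsums via
                    // madd_epi16 further down.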

                    // Scales of sub blocks in the sb loop
                    // Scales of the 0th sub block from each super block
                    __m128i scales_rearrange_0 = _mm_shuffle_epi8(scales_01, scalemask1);
                    __m256i scales_0 = _mm256_cvtepu8_epi16(scales_rearrange_0);

                    // Scales of the 1st sub block from each super block
                    __m128i scales_rearrange_1 = _mm_shuffle_epi8(scales_01, scalemask2);
                    __m256i scales_1 = _mm256_cvtepu8_epi16(scales_rearrange_1);

                    // Scales of the 2nd sub block from each super block
                    __m128i scales_rearrange_2 = _mm_shuffle_epi8(scales_23, scalemask1);
                    __m256i scales_2 = _mm256_cvtepu8_epi16(scales_rearrange_2);

                    // Scales of the 3rd sub block from each super block
                    __m128i scales_rearrange_3 = _mm_shuffle_epi8(scales_23, scalemask2);
                    __m256i scales_3 = _mm256_cvtepu8_epi16(scales_rearrange_3);

                    // Scales of the 4th sub block from each super block
                    __m128i scales_rearrange_4 = _mm_shuffle_epi8(scales_45, scalemask1);
                    __m256i scales_4 = _mm256_cvtepu8_epi16(scales_rearrange_4);

                    // Scales of the 5th sub block from each super block
                    __m128i scales_rearrange_5 = _mm_shuffle_epi8(scales_45, scalemask2);
                    __m256i scales_5 = _mm256_cvtepu8_epi16(scales_rearrange_5);

                    // Scales of the 6th sub block from each super block
                    __m128i scales_rearrange_6 = _mm_shuffle_epi8(scales_67, scalemask1);
                    __m256i scales_6 = _mm256_cvtepu8_epi16(scales_rearrange_6);

                    // Scales of the 7th sub block from each super block
                    __m128i scales_rearrange_7 = _mm_shuffle_epi8(scales_67, scalemask2);
                    __m256i scales_7 = _mm256_cvtepu8_epi16(scales_rearrange_7);

                    // Load the sub block values corresponding to sb from block_q8_K in batches of 16 bytes and replicate each across a 256 bit vector
                    __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + sb * 128)));
                    __m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16 + sb * 128)));
                    __m256i lhs_vec_2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 32 + sb * 128)));
                    __m256i lhs_vec_3 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 48 + sb * 128)));
                    __m256i lhs_vec_4 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 64 + sb * 128)));
                    __m256i lhs_vec_5 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 80 + sb * 128)));
                    __m256i lhs_vec_6 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 96 + sb * 128)));
                    __m256i lhs_vec_7 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 112 + sb * 128)));

                    lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0);
                    lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0);
                    lhs_vec_2 = _mm256_permute2f128_si256(lhs_vec_2, lhs_vec_2, 0);
                    lhs_vec_3 = _mm256_permute2f128_si256(lhs_vec_3, lhs_vec_3, 0);
                    lhs_vec_4 = _mm256_permute2f128_si256(lhs_vec_4, lhs_vec_4, 0);
                    lhs_vec_5 = _mm256_permute2f128_si256(lhs_vec_5, lhs_vec_5, 0);
                    lhs_vec_6 = _mm256_permute2f128_si256(lhs_vec_6, lhs_vec_6, 0);
                    lhs_vec_7 = _mm256_permute2f128_si256(lhs_vec_7, lhs_vec_7, 0);

                    __m256i iacc_0 = _mm256_setzero_si256();
                    __m256i iacc_1 = _mm256_setzero_si256();
                    __m256i iacc_2 = _mm256_setzero_si256();
                    __m256i iacc_3 = _mm256_setzero_si256();
                    __m256i iacc_4 = _mm256_setzero_si256();
                    __m256i iacc_5 = _mm256_setzero_si256();
                    __m256i iacc_6 = _mm256_setzero_si256();
                    __m256i iacc_7 = _mm256_setzero_si256();

                    // Dot product done within 32 bit lanes and accumulated in the same vector
                    // First done for the 0th sub block and then for the seven other (1st - 7th) sub blocks processed for each sb (sb < QK_K/128 loop)
                    // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
                    // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
                    // B0(8-11) B4(8-11) B1(8-11) B5(8-11) B2(8-11) B6(8-11) B3(8-11) B7(8-11) with A0(8-11)
                    // B0(12-15) B4(12-15) B1(12-15) B5(12-15) B2(12-15) B6(12-15) B3(12-15) B7(12-15) with A0(12-15)
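                    // Same column-interleaving blend/shuffle scheme as the Q4_K gemv above: imm 177 swaps adjacent 32 bit
                    // lanes of the 4567 vectors and blend imm 170 merges them with the 0123 vectors, pairing
                    // B0 B4 B1 B5 B2 B6 B3 B7 against a broadcast 4-byte group of A.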

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_00, _mm256_shuffle_epi32(rhs_vec_4567_00, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_00, 177), rhs_vec_4567_00, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));

                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_01, _mm256_shuffle_epi32(rhs_vec_4567_01, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
                    iacc_0 = _mm256_add_epi16(iacc_0, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_01, 177), rhs_vec_4567_01, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));

                    iacc_0 = _mm256_madd_epi16(iacc_0, scales_0);

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_10, _mm256_shuffle_epi32(rhs_vec_4567_10, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_10, 177), rhs_vec_4567_10, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));

                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_11, _mm256_shuffle_epi32(rhs_vec_4567_11, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
                    iacc_1 = _mm256_add_epi16(iacc_1, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_11, 177), rhs_vec_4567_11, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));

                    iacc_1 = _mm256_madd_epi16(iacc_1, scales_1);

                    iacc_2 = _mm256_add_epi16(iacc_2, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_20, _mm256_shuffle_epi32(rhs_vec_4567_20, 177), 170), _mm256_shuffle_epi32(lhs_vec_2, 0)));
                    iacc_2 = _mm256_add_epi16(iacc_2, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_20, 177), rhs_vec_4567_20, 170), _mm256_shuffle_epi32(lhs_vec_2, 85)));

                    iacc_2 = _mm256_add_epi16(iacc_2, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_21, _mm256_shuffle_epi32(rhs_vec_4567_21, 177), 170), _mm256_shuffle_epi32(lhs_vec_2, 170)));
                    iacc_2 = _mm256_add_epi16(iacc_2, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_21, 177), rhs_vec_4567_21, 170), _mm256_shuffle_epi32(lhs_vec_2, 255)));

                    iacc_2 = _mm256_madd_epi16(iacc_2, scales_2);

                    iacc_3 = _mm256_add_epi16(iacc_3, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_30, _mm256_shuffle_epi32(rhs_vec_4567_30, 177), 170), _mm256_shuffle_epi32(lhs_vec_3, 0)));
                    iacc_3 = _mm256_add_epi16(iacc_3, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_30, 177), rhs_vec_4567_30, 170), _mm256_shuffle_epi32(lhs_vec_3, 85)));

                    iacc_3 = _mm256_add_epi16(iacc_3, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_31, _mm256_shuffle_epi32(rhs_vec_4567_31, 177), 170), _mm256_shuffle_epi32(lhs_vec_3, 170)));
                    iacc_3 = _mm256_add_epi16(iacc_3, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_31, 177), rhs_vec_4567_31, 170), _mm256_shuffle_epi32(lhs_vec_3, 255)));

                    iacc_3 = _mm256_madd_epi16(iacc_3, scales_3);

                    iacc_4 = _mm256_add_epi16(iacc_4, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_40, _mm256_shuffle_epi32(rhs_vec_4567_40, 177), 170), _mm256_shuffle_epi32(lhs_vec_4, 0)));
                    iacc_4 = _mm256_add_epi16(iacc_4, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_40, 177), rhs_vec_4567_40, 170), _mm256_shuffle_epi32(lhs_vec_4, 85)));

                    iacc_4 = _mm256_add_epi16(iacc_4, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_41, _mm256_shuffle_epi32(rhs_vec_4567_41, 177), 170), _mm256_shuffle_epi32(lhs_vec_4, 170)));
                    iacc_4 = _mm256_add_epi16(iacc_4, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_41, 177), rhs_vec_4567_41, 170), _mm256_shuffle_epi32(lhs_vec_4, 255)));

                    iacc_4 = _mm256_madd_epi16(iacc_4, scales_4);

                    iacc_5 = _mm256_add_epi16(iacc_5, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_50, _mm256_shuffle_epi32(rhs_vec_4567_50, 177), 170), _mm256_shuffle_epi32(lhs_vec_5, 0)));
                    iacc_5 = _mm256_add_epi16(iacc_5, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_50, 177), rhs_vec_4567_50, 170), _mm256_shuffle_epi32(lhs_vec_5, 85)));

                    iacc_5 = _mm256_add_epi16(iacc_5, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_51, _mm256_shuffle_epi32(rhs_vec_4567_51, 177), 170), _mm256_shuffle_epi32(lhs_vec_5, 170)));
                    iacc_5 = _mm256_add_epi16(iacc_5, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_51, 177), rhs_vec_4567_51, 170), _mm256_shuffle_epi32(lhs_vec_5, 255)));

                    iacc_5 = _mm256_madd_epi16(iacc_5, scales_5);

                    iacc_6 = _mm256_add_epi16(iacc_6, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_60, _mm256_shuffle_epi32(rhs_vec_4567_60, 177), 170), _mm256_shuffle_epi32(lhs_vec_6, 0)));
                    iacc_6 = _mm256_add_epi16(iacc_6, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_60, 177), rhs_vec_4567_60, 170), _mm256_shuffle_epi32(lhs_vec_6, 85)));

                    iacc_6 = _mm256_add_epi16(iacc_6, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_61, _mm256_shuffle_epi32(rhs_vec_4567_61, 177), 170), _mm256_shuffle_epi32(lhs_vec_6, 170)));
                    iacc_6 = _mm256_add_epi16(iacc_6, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_61, 177), rhs_vec_4567_61, 170), _mm256_shuffle_epi32(lhs_vec_6, 255)));

                    iacc_6 = _mm256_madd_epi16(iacc_6, scales_6);

                    iacc_7 = _mm256_add_epi16(iacc_7, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_70, _mm256_shuffle_epi32(rhs_vec_4567_70, 177), 170), _mm256_shuffle_epi32(lhs_vec_7, 0)));
                    iacc_7 = _mm256_add_epi16(iacc_7, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_70, 177), rhs_vec_4567_70, 170), _mm256_shuffle_epi32(lhs_vec_7, 85)));

                    iacc_7 = _mm256_add_epi16(iacc_7, _mm256_maddubs_epi16(_mm256_blend_epi32(rhs_vec_0123_71, _mm256_shuffle_epi32(rhs_vec_4567_71, 177), 170), _mm256_shuffle_epi32(lhs_vec_7, 170)));
                    iacc_7 = _mm256_add_epi16(iacc_7, _mm256_maddubs_epi16(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_71, 177), rhs_vec_4567_71, 170), _mm256_shuffle_epi32(lhs_vec_7, 255)));

                    iacc_7 = _mm256_madd_epi16(iacc_7, scales_7);

                    // Accumulate the iacc value for one sb
                    __m256i iacc_sb = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(iacc_0, iacc_1), _mm256_add_epi32(iacc_2, iacc_3)), _mm256_add_epi32(_mm256_add_epi32(iacc_4, iacc_5), _mm256_add_epi32(iacc_6, iacc_7)));

                    __m128i q8sums = _mm_loadu_si128((const __m128i *)(a_ptr[b].bsums + sb * 8));
                    __m256i q8s = _mm256_castsi128_si256(q8sums);
                    q8s = _mm256_permute2f128_si256(q8s, q8s, 0);

                    // Broadcast the bsums of the two corresponding sub blocks of Q8_K
                    // Multiply-Add the corresponding mins of Q2_Kx8 with the bsums
                    __m256i iacc_min_sb_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(q8s, 0), mins_01);
                    __m256i iacc_min_sb_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(q8s, 85), mins_23);
                    __m256i iacc_min_sb_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(q8s, 170), mins_45);
                    __m256i iacc_min_sb_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(q8s, 255), mins_67);
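                    // The shuffle immediates 0/85/170/255 broadcast, in turn, the 16 bit bsum pair of sub blocks (0,1),
                    // (2,3), (4,5) and (6,7); madd_epi16 multiplies each bsum with the matching sub block min and sums
                    // the pair per column.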

                    __m256i iacc_min_sb = _mm256_add_epi32(_mm256_add_epi32(iacc_min_sb_01, iacc_min_sb_23), _mm256_add_epi32(iacc_min_sb_45, iacc_min_sb_67));

                    // Accumulate for the complete block
                    iacc_b = _mm256_add_epi32(iacc_b, iacc_sb);
                    iacc_min_b = _mm256_add_epi32(iacc_min_b, iacc_min_sb);
                }

                // Multiply-Add with scale values for the complete super block
                acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_b), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
                acc_min_rows = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_min_b), _mm256_mul_ps(col_dmin_f32, row_scale_f32), acc_min_rows);
            }
            // Accumulated output values permuted so as to be stored in the appropriate order post accumulation
            acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
            _mm256_storeu_ps(s + (y * nr + x * 8), _mm256_sub_ps(acc_row, acc_min_rows));
        }
    }
#else

    ggml_gemv_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);

#endif
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
#if defined(__AVX2__) || defined(__AVX512F__)
    {
        // Lookup table to convert signed nibbles to signed bytes
        __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
        signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);

        gemm_q4_b32_8x8_q8_0_lut_avx<block_q4_0x8>(n, s, bs, vx, vy, nr, nc, signextendlut);

        return;
    }
#endif // defined(__AVX2__) || defined(__AVX512F__)

    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}

void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

#if defined(__AVX2__) || defined(__AVX512F__)
    const block_q4_Kx8 * b_ptr_start = (const block_q4_Kx8 *) vx;
    const block_q8_Kx4 * a_ptr_start = (const block_q8_Kx4 *) vy;
    int64_t b_nb = n / QK_K;
    int64_t y = 0;

    // Mask to extract nibbles from packed bytes
    const __m256i m4b = _mm256_set1_epi8(0x0F);
    // Permute mask used for easier vector processing at later stages
    __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
    int64_t xstart = 0;
    int anr = nr - nr % 16; // Used to align nr with the boundary of 16
#ifdef __AVX512F__
    int anc = nc - nc % 16; // Used to align nc with the boundary of 16
    // Mask to extract nibbles from packed bytes, expanded to 512 bit length
    const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
    // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
    for (; y < anr / 4; y += 4) {

        const block_q8_Kx4 * a_ptrs[4];

        a_ptrs[0] = a_ptr_start + (y * nb);
        for (int i = 0; i < 3; ++i) {
            a_ptrs[i + 1] = a_ptrs[i] + nb;
        }

        // Take group of eight block_q4_Kx8 structures at each pass of the loop and perform dot product operation
        for (int64_t x = 0; x < anc / 8; x += 2) {

            const block_q4_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
            const block_q4_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);

            // Master FP accumulators
            __m512 acc_rows[16];
            for (int i = 0; i < 16; i++) {
                acc_rows[i] = _mm512_setzero_ps();
            }

            __m512 acc_min_rows[16];
            for (int i = 0; i < 16; i++) {
                acc_min_rows[i] = _mm512_setzero_ps();
            }

            // For super block
            for (int64_t b = 0; b < nb; b++) {
                // Scale values - Load the sixteen scale values from two block_q4_Kx8 structures
                const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);

                // dmin values - Load the sixteen dmin values from two block_q4_Kx8 structures
                const __m512 col_dmin_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].dmin, b_ptr_1[b].dmin);

                // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration
                for (int sb = 0; sb < QK_K / 64; sb++) {

2034 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + sb * 256));
2035 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 32 + sb * 256));
2036 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 64 + sb * 256));
2037 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 96 + sb * 256));
2038 const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 128 + sb * 256));
2039 const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 160 + sb * 256));
2040 const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 192 + sb * 256));
2041 const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 224 + sb * 256));
2042
2043 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + sb * 256));
2044 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 32 + sb * 256));
2045 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 64 + sb * 256));
2046 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 96 + sb * 256));
2047 const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 128 + sb * 256));
2048 const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 160 + sb * 256));
2049 const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 192 + sb * 256));
2050 const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 224 + sb * 256));
2051
2052 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2053 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2054 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2055 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2056 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
2057 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
2058 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
2059 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
2060
2061 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
2062 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
2063 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
2064 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
2065 const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240);
2066 const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240);
2067 const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240);
2068 const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240);
2069
2070 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
2071 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
2072 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
2073 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
2074
2075 const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1);
2076 const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1);
2077 const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1);
2078 const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1);
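                    // The blends above regroup the eight interleaved columns into the pairs {0,1,4,5} / {2,3,6,7}
                    // (and {8,9,C,D} / {A,B,E,F} for the second block), and the inserti32x8 calls stack them into
                    // 512 bit vectors covering sixteen columns each.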
2079
2080 //4-bit -> 8-bit
2081 const __m512i rhs_mat_014589CD_00 = _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) B08(0-7) B09(0-7) B0C(0-7) B0D(0-7)
2082 const __m512i rhs_mat_2367ABEF_00 = _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) B0A(0-7) B0B(0-7) B0E(0-7) B0F(0-7)
2083 const __m512i rhs_mat_014589CD_01 = _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) B08(8-15) B09(8-15) B0C(8-15) B0D(8-15)
2084 const __m512i rhs_mat_2367ABEF_01 = _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) B0A(8-15) B0B(8-15) B0E(8-15) B0F(8-15)
2085
2086 const __m512i rhs_mat_014589CD_02 = _mm512_and_si512(rhs_raw_mat_014589CD_2, m4bexpanded); //B00(16-23) B01(16-23) B04(16-23) B05(16-23) B08(16-23) B09(16-23) B0C(16-23) B0D(16-23)
2087 const __m512i rhs_mat_2367ABEF_02 = _mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4bexpanded); //B02(16-23) B03(16-23) B06(16-23) B07(16-23) B0A(16-23) B0B(16-23) B0E(16-23) B0F(16-23)
2088 const __m512i rhs_mat_014589CD_03 = _mm512_and_si512(rhs_raw_mat_014589CD_3, m4bexpanded); //B00(24-31) B01(24-31) B04(24-31) B05(24-31) B08(24-31) B09(24-31) B0C(24-31) B0D(24-31)
2089 const __m512i rhs_mat_2367ABEF_03 = _mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4bexpanded); //B02(24-31) B03(24-31) B06(24-31) B07(24-31) B0A(24-31) B0B(24-31) B0E(24-31) B0F(24-31)
2090
2091 const __m512i rhs_mat_014589CD_10 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) B18(0-7) B19(0-7) B1C(0-7) B1D(0-7)
2092 const __m512i rhs_mat_2367ABEF_10 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) B1A(0-7) B1B(0-7) B1E(0-7) B1F(0-7)
2093 const __m512i rhs_mat_014589CD_11 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) B18(8-15) B19(8-15) B1C(8-15) B1D(8-15)
2094 const __m512i rhs_mat_2367ABEF_11 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) B1A(8-15) B1B(8-15) B1E(8-15) B1F(8-15)
2095
2096 const __m512i rhs_mat_014589CD_12 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4bexpanded); //B10(16-23) B11(16-23) B14(16-23) B15(16-23) B18(16-23) B19(16-23) B1C(16-23) B1D(16-23)
2097 const __m512i rhs_mat_2367ABEF_12 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4bexpanded); //B12(16-23) B13(16-23) B16(16-23) B17(16-23) B1A(16-23) B1B(16-23) B1E(16-23) B1F(16-23)
2098 const __m512i rhs_mat_014589CD_13 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4bexpanded); //B10(24-31) B11(24-31) B14(24-31) B15(24-31) B18(24-31) B19(24-31) B1C(24-31) B1D(24-31)
2099 const __m512i rhs_mat_2367ABEF_13 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4bexpanded); //B12(24-31) B13(24-31) B16(24-31) B17(24-31) B1A(24-31) B1B(24-31) B1E(24-31) B1F(24-31)
2100
2101 // Shuffle pattern one - right side input
2102 const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3)
2103 const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3)
2104 const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11)
2105 const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11)
2106 const __m512i rhs_mat_014589CD_02_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_02, (_MM_PERM_ENUM)136); //B00(16-19) B01(16-19) B00(16-19) B01(16-19) B04(16-19) B05(16-19) B04(16-19) B05(16-19) B08(16-19) B09(16-19) B08(16-19) B09(16-19) B0C(16-19) B0D(16-19) B0C(16-19) B0D(16-19)
2107 const __m512i rhs_mat_2367ABEF_02_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_02, (_MM_PERM_ENUM)136); //B02(16-19) B03(16-19) B02(16-19) B03(16-19) B06(16-19) B07(16-19) B06(16-19) B07(16-19) B0A(16-19) B0B(16-19) B0A(16-19) B0B(16-19) B0E(16-19) B0F(16-19) B0E(16-19) B0F(16-19)
2108 const __m512i rhs_mat_014589CD_03_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_03, (_MM_PERM_ENUM)136); //B00(24-27) B01(24-27) B00(24-27) B01(24-27) B04(24-27) B05(24-27) B04(24-27) B05(24-27) B08(24-27) B09(24-27) B08(24-27) B09(24-27) B0C(24-27) B0D(24-27) B0C(24-27) B0D(24-27)
2109 const __m512i rhs_mat_2367ABEF_03_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_03, (_MM_PERM_ENUM)136); //B02(24-27) B03(24-27) B02(24-27) B03(24-27) B06(24-27) B07(24-27) B06(24-27) B07(24-27) B0A(24-27) B0B(24-27) B0A(24-27) B0B(24-27) B0E(24-27) B0F(24-27) B0E(24-27) B0F(24-27)
2110
2111 const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3)
2112 const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3)
2113 const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11)
2114 const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11)
2115 const __m512i rhs_mat_014589CD_12_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_12, (_MM_PERM_ENUM)136); //B10(16-19) B11(16-19) B10(16-19) B11(16-19) B14(16-19) B15(16-19) B14(16-19) B15(16-19) B18(16-19) B19(16-19) B18(16-19) B19(16-19) B1C(16-19) B1D(16-19) B1C(16-19) B1D(16-19)
2116 const __m512i rhs_mat_2367ABEF_12_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_12, (_MM_PERM_ENUM)136); //B12(16-19) B13(16-19) B12(16-19) B13(16-19) B16(16-19) B17(16-19) B16(16-19) B17(16-19) B1A(16-19) B1B(16-19) B1A(16-19) B1B(16-19) B1E(16-19) B1F(16-19) B1E(16-19) B1F(16-19)
2117 const __m512i rhs_mat_014589CD_13_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_13, (_MM_PERM_ENUM)136); //B10(24-27) B11(24-27) B10(24-27) B11(24-27) B14(24-27) B15(24-27) B14(24-27) B15(24-27) B18(24-27) B19(24-27) B18(24-27) B19(24-27) B1C(24-27) B1D(24-27) B1C(24-27) B1D(24-27)
2118 const __m512i rhs_mat_2367ABEF_13_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_13, (_MM_PERM_ENUM)136); //B12(24-27) B13(24-27) B12(24-27) B13(24-27) B16(24-27) B17(24-27) B16(24-27) B17(24-27) B1A(24-27) B1B(24-27) B1A(24-27) B1B(24-27) B1E(24-27) B1F(24-27) B1E(24-27) B1F(24-27)
2119
2120 // Shuffle pattern two - right side input
2121 const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7)
2122 const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7)
2123 const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15)
2124 const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15)
2125 const __m512i rhs_mat_014589CD_02_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_02, (_MM_PERM_ENUM)221); //B00(20-23) B01(20-23) B00(20-23) B01(20-23) B04(20-23) B05(20-23) B04(20-23) B05(20-23) B08(20-23) B09(20-23) B08(20-23) B09(20-23) B0C(20-23) B0D(20-23) B0C(20-23) B0D(20-23)
2126 const __m512i rhs_mat_2367ABEF_02_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_02, (_MM_PERM_ENUM)221); //B02(20-23) B03(20-23) B02(20-23) B03(20-23) B06(20-23) B07(20-23) B06(20-23) B07(20-23) B0A(20-23) B0B(20-23) B0A(20-23) B0B(20-23) B0E(20-23) B0F(20-23) B0E(20-23) B0F(20-23)
2127 const __m512i rhs_mat_014589CD_03_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_03, (_MM_PERM_ENUM)221); //B00(28-31) B01(28-31) B00(28-31) B01(28-31) B04(28-31) B05(28-31) B04(28-31) B05(28-31) B08(28-31) B09(28-31) B08(28-31) B09(28-31) B0C(28-31) B0D(28-31) B0C(28-31) 0BD(28-31)
2128 const __m512i rhs_mat_2367ABEF_03_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_03, (_MM_PERM_ENUM)221); //B02(28-31) B03(28-31) B02(28-31) B03(28-31) B06(28-31) B07(28-31) B06(28-31) B07(28-31) B0A(28-31) B0B(28-31) B0A(28-31) B0B(28-31) B0E(28-31) B0F(28-31) B0E(28-31) B0F(28-31)
2129
2130 const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7)
2131 const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7)
2132 const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15)
2133 const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15)
2134 const __m512i rhs_mat_014589CD_12_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_12, (_MM_PERM_ENUM)221); //B10(20-23) B11(20-23) B10(20-23) B11(20-23) B14(20-23) B15(20-23) B14(20-23) B15(20-23) B18(20-23) B19(20-23) B18(20-23) B19(20-23) B1C(20-23) B1D(20-23) B1C(20-23) B1D(20-23)
2135 const __m512i rhs_mat_2367ABEF_12_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_12, (_MM_PERM_ENUM)221); //B12(20-23) B13(20-23) B12(20-23) B13(20-23) B16(20-23) B17(20-23) B16(20-23) B17(20-23) B1A(20-23) B1B(20-23) B1A(20-23) B1B(20-23) B1E(20-23) B1F(20-23) B1E(20-23) B1F(20-23)
2136 const __m512i rhs_mat_014589CD_13_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_13, (_MM_PERM_ENUM)221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31) B18(28-31) B19(28-31) B18(28-31) B19(28-31) B1C(28-31) B1D(28-31) B1C(28-31) B1D(28-31)
2137 const __m512i rhs_mat_2367ABEF_13_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_13, (_MM_PERM_ENUM)221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31) B1A(28-31) B1B(28-31) B1A(28-31) B1B(28-31) B1E(28-31) B1F(28-31) B1E(28-31) B1F(28-31)
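                    // Shuffle pattern one (imm 136) keeps the low 4-byte half of every 8-byte column chunk and pattern
                    // two (imm 221) the high half, each duplicated so they line up with the correspondingly shuffled
                    // LHS rows in the maddubs step below.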
2138
2139 uint32_t utmp_00[4], utmp_01[4], utmp_10[4], utmp_11[4];
2140
2141 // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together
2142 // The below block is for eg to extract first sub block's scales and mins from different Q4_K structures for the sb loop
2143 memcpy(utmp_00, b_ptr_0[b].scales + 24 * sb, 12);
2144 utmp_00[3] = ((utmp_00[2] >> 4) & kmask2) | (((utmp_00[1] >> 6) & kmask3) << 4);
2145 const uint32_t uaux_00 = utmp_00[1] & kmask1;
2146 utmp_00[1] = (utmp_00[2] & kmask2) | (((utmp_00[0] >> 6) & kmask3) << 4);
2147 utmp_00[2] = uaux_00;
2148 utmp_00[0] &= kmask1;
2149
2150 // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop
2151 memcpy(utmp_01, b_ptr_0[b].scales + 12 + sb * 24, 12);
2152 utmp_01[3] = ((utmp_01[2] >> 4) & kmask2) | (((utmp_01[1] >> 6) & kmask3) << 4);
2153 const uint32_t uaux_01 = utmp_01[1] & kmask1;
2154 utmp_01[1] = (utmp_01[2] & kmask2) | (((utmp_01[0] >> 6) & kmask3) << 4);
2155 utmp_01[2] = uaux_01;
2156 utmp_01[0] &= kmask1;
2157
2158 memcpy(utmp_10, b_ptr_1[b].scales + sb * 24, 12);
2159 utmp_10[3] = ((utmp_10[2] >> 4) & kmask2) | (((utmp_10[1] >> 6) & kmask3) << 4);
2160 const uint32_t uaux_10 = utmp_10[1] & kmask1;
2161 utmp_10[1] = (utmp_10[2] & kmask2) | (((utmp_10[0] >> 6) & kmask3) << 4);
2162 utmp_10[2] = uaux_10;
2163 utmp_10[0] &= kmask1;
2164
2165 // The below block is for eg to extract second sub block's scales and mins from different Q4_K structures for the sb loop
2166 memcpy(utmp_11, b_ptr_1[b].scales + 12 + sb * 24, 12);
2167 utmp_11[3] = ((utmp_11[2] >> 4) & kmask2) | (((utmp_11[1] >> 6) & kmask3) << 4);
2168 const uint32_t uaux_11 = utmp_11[1] & kmask1;
2169 utmp_11[1] = (utmp_11[2] & kmask2) | (((utmp_11[0] >> 6) & kmask3) << 4);
2170 utmp_11[2] = uaux_11;
2171 utmp_11[0] &= kmask1;
2172
2173 // Scales of first sub block in the sb loop
2174 const __m256i mins_and_scales_0 = _mm256_set_epi32(utmp_10[3], utmp_10[2], utmp_10[1], utmp_10[0], utmp_00[3], utmp_00[2], utmp_00[1], utmp_00[0]);
2175 const __m512i scales_0 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));
2176
2177 // Scales of second sub block in the sb loop
2178 const __m256i mins_and_scales_1 = _mm256_set_epi32(utmp_11[3], utmp_11[2], utmp_11[1], utmp_11[0], utmp_01[3], utmp_01[2], utmp_01[1], utmp_01[0]);
2179 const __m512i scales_1 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1));
2180
2181 // Mins of first and second sub block of Q4_K block are arranged side by side
2182 const __m512i mins_01 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(_mm256_shuffle_epi32(mins_and_scales_0, 78), _mm256_shuffle_epi32(mins_and_scales_1, 78)));
2183
2184 const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
2185 const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
2186
2187 const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
2188 const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
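                    // scales_0/scales_1 hold every sub block scale twice per adjacent 16 bit pair (via the byte unpack
                    // above); the dword shuffles with imm 68 and 238 then select the halves belonging to the 014589CD
                    // and 2367ABEF column groups respectively.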
2189
2190 for (int rp = 0; rp < 4; rp++) {
2191
2192 // Load the four block_q8_k quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
2193 // Loaded as set of 128 bit vectors and repeated and stored into a 256 bit vector before again repeating into 512 bit vector
2194 __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 256 * sb)));
2195 __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0);
2196 __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17);
2197 __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 32 + 256 * sb)));
2198 __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0);
2199 __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17);
2200 __m256i lhs_mat_ymm_0123_02 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 64 + 256 * sb)));
2201 __m256i lhs_mat_ymm_01_02 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_02, lhs_mat_ymm_0123_02, 0);
2202 __m256i lhs_mat_ymm_23_02 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_02, lhs_mat_ymm_0123_02, 17);
2203 __m256i lhs_mat_ymm_0123_03 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 96 + 256 * sb)));
2204 __m256i lhs_mat_ymm_01_03 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_03, lhs_mat_ymm_0123_03, 0);
2205 __m256i lhs_mat_ymm_23_03 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_03, lhs_mat_ymm_0123_03, 17);
2206 __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 128 + 256 * sb)));
2207 __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0);
2208 __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17);
2209 __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 160 + 256 * sb)));
2210 __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0);
2211 __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17);
2212 __m256i lhs_mat_ymm_0123_12 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 192 + 256 * sb)));
2213 __m256i lhs_mat_ymm_01_12 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_12, lhs_mat_ymm_0123_12, 0);
2214 __m256i lhs_mat_ymm_23_12 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_12, lhs_mat_ymm_0123_12, 17);
2215 __m256i lhs_mat_ymm_0123_13 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 224 + 256 * sb)));
2216 __m256i lhs_mat_ymm_01_13 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_13, lhs_mat_ymm_0123_13, 0);
2217 __m256i lhs_mat_ymm_23_13 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_13, lhs_mat_ymm_0123_13, 17);
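// _mm256_permute2f128_si256 with immediate 0 duplicates the lower 128 bits (the interleaved rows 0 and 1) across the whole
// 256 bit vector, and immediate 17 duplicates the upper 128 bits (rows 2 and 3); the 512 bit inserts below then repeat each
// of those halves once more so they line up with the sixteen output columns.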
2218
2219 __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1);
2220 __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1);
2221 __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1);
2222 __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1);
2223 __m512i lhs_mat_01_02 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_02), lhs_mat_ymm_01_02, 1);
2224 __m512i lhs_mat_23_02 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_02), lhs_mat_ymm_23_02, 1);
2225 __m512i lhs_mat_01_03 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_03), lhs_mat_ymm_01_03, 1);
2226 __m512i lhs_mat_23_03 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_03), lhs_mat_ymm_23_03, 1);
2227
2228 __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1);
2229 __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1);
2230 __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1);
2231 __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1);
2232 __m512i lhs_mat_01_12 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_12), lhs_mat_ymm_01_12, 1);
2233 __m512i lhs_mat_23_12 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_12), lhs_mat_ymm_23_12, 1);
2234 __m512i lhs_mat_01_13 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_13), lhs_mat_ymm_01_13, 1);
2235 __m512i lhs_mat_23_13 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_13), lhs_mat_ymm_23_13, 1);
2236
2237 // Bsums are loaded - four bsums are loaded (for two sub blocks) for the different Q8_K blocks
2238 __m256i lhs_bsums_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].bsums + 16 * sb)));
2239 __m256i lhs_bsums_hsum_ymm_0123_01 = _mm256_castsi128_si256(_mm_hadd_epi16(_mm256_castsi256_si128(lhs_bsums_0123_01), _mm256_extractf128_si256(lhs_bsums_0123_01, 1)));
2240 lhs_bsums_hsum_ymm_0123_01 = _mm256_permute2x128_si256(lhs_bsums_hsum_ymm_0123_01, lhs_bsums_hsum_ymm_0123_01, 0);
2241 __m512i lhs_bsums_hsum_0123_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_hsum_ymm_0123_01), lhs_bsums_hsum_ymm_0123_01, 1);
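// The horizontal add combines adjacent 16 value bsums, in effect leaving one sum per 32 value sub block for each Q8_K row;
// these sums feed the dmin correction further below.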
2242
2243 // Shuffle pattern one - left side input
2244 const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
2245 const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3)
2246 const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
2247 const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11)
2248 const __m512i lhs_mat_01_02_sp1 = _mm512_shuffle_epi32(lhs_mat_01_02, (_MM_PERM_ENUM)160); //A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19)
2249 const __m512i lhs_mat_23_02_sp1 = _mm512_shuffle_epi32(lhs_mat_23_02, (_MM_PERM_ENUM)160); //A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19)
2250 const __m512i lhs_mat_01_03_sp1 = _mm512_shuffle_epi32(lhs_mat_01_03, (_MM_PERM_ENUM)160); //A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27)
2251 const __m512i lhs_mat_23_03_sp1 = _mm512_shuffle_epi32(lhs_mat_23_03, (_MM_PERM_ENUM)160); //A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27)
2252
2253 const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
2254 const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3)
2255 const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
2256 const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11)
2257 const __m512i lhs_mat_01_12_sp1 = _mm512_shuffle_epi32(lhs_mat_01_12, (_MM_PERM_ENUM)160); //A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19)
2258 const __m512i lhs_mat_23_12_sp1 = _mm512_shuffle_epi32(lhs_mat_23_12, (_MM_PERM_ENUM)160); //A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19)
2259 const __m512i lhs_mat_01_13_sp1 = _mm512_shuffle_epi32(lhs_mat_01_13, (_MM_PERM_ENUM)160); //A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27)
2260 const __m512i lhs_mat_23_13_sp1 = _mm512_shuffle_epi32(lhs_mat_23_13, (_MM_PERM_ENUM)160); //A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27)
2261
2262 const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
2263 const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7)
2264 const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
2265 const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15)
2266 const __m512i lhs_mat_01_02_sp2 = _mm512_shuffle_epi32(lhs_mat_01_02, (_MM_PERM_ENUM)245); //A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23)
2267 const __m512i lhs_mat_23_02_sp2 = _mm512_shuffle_epi32(lhs_mat_23_02, (_MM_PERM_ENUM)245); //A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23)
2268 const __m512i lhs_mat_01_03_sp2 = _mm512_shuffle_epi32(lhs_mat_01_03, (_MM_PERM_ENUM)245); //A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31)
2269 const __m512i lhs_mat_23_03_sp2 = _mm512_shuffle_epi32(lhs_mat_23_03, (_MM_PERM_ENUM)245); //A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31)
2270
2271 const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
2272 const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7)
2273 const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
2274 const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15)
2275 const __m512i lhs_mat_01_12_sp2 = _mm512_shuffle_epi32(lhs_mat_01_12, (_MM_PERM_ENUM)245); //A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23)
2276 const __m512i lhs_mat_23_12_sp2 = _mm512_shuffle_epi32(lhs_mat_23_12, (_MM_PERM_ENUM)245); //A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23)
2277 const __m512i lhs_mat_01_13_sp2 = _mm512_shuffle_epi32(lhs_mat_01_13, (_MM_PERM_ENUM)245); //A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31)
2278 const __m512i lhs_mat_23_13_sp2 = _mm512_shuffle_epi32(lhs_mat_23_13, (_MM_PERM_ENUM)245); //A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31)
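// Shuffle immediate 160 is 0b10100000 and replicates dwords 0 and 2 of every 128 bit lane, while 245 (0b11110101)
// replicates dwords 1 and 3 (the right hand side uses 136 and 221 for the complementary orderings); pattern one therefore
// covers the even 4 byte groups and pattern two the odd ones, so together they touch all 32 bytes of each sub block.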
2279
2280 // Dot products within each 32 bit lane: _mm512_maddubs_epi16 multiplies the unsigned 4 bit weights with the signed Q8_K bytes and sums adjacent pairs into 16 bit integers, and the surrounding adds combine the four byte groups of the sub block
2281 __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp1, lhs_mat_01_03_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp1, lhs_mat_01_02_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1));
2282 __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp1, lhs_mat_01_03_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp1, lhs_mat_01_02_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1));
2283 __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp1, lhs_mat_23_03_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp1, lhs_mat_23_02_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1));
2284 __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp1, lhs_mat_23_03_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp1, lhs_mat_23_02_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1));
2285 __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp1, lhs_mat_01_13_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp1, lhs_mat_01_12_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_01_10_sp1));
2286 __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp1, lhs_mat_01_13_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp1, lhs_mat_01_12_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1));
2287 __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp1, lhs_mat_23_13_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp1, lhs_mat_23_12_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1));
2288 __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp1, lhs_mat_23_13_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp1, lhs_mat_23_12_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1));
2289
2290 __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp2, lhs_mat_01_03_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp2, lhs_mat_01_02_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2));
2291 __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp2, lhs_mat_01_03_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp2, lhs_mat_01_02_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2));
2292 __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp2, lhs_mat_23_03_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp2, lhs_mat_23_02_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2));
2293 __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp2, lhs_mat_23_03_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp2, lhs_mat_23_02_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2));
2294 __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp2, lhs_mat_01_13_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp2, lhs_mat_01_12_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2));
2295 __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp2, lhs_mat_01_13_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp2, lhs_mat_01_12_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2));
2296 __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp2, lhs_mat_23_13_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp2, lhs_mat_23_12_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2));
2297 __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp2, lhs_mat_23_13_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp2, lhs_mat_23_12_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2));
2298
2299 // Outputs of both shuffle patterns are added to cover the dot product of all 32 values in each sub block
2300 __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
2301 __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
2302 __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
2303 __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);
2304
2305 __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
2306 __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
2307 __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
2308 __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);
2309
2310 iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0);
2311 iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0);
2312 iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0);
2313 iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0);
2314
2315 iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1);
2316 iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1);
2317 iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1);
2318 iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1);
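// _mm512_madd_epi16 multiplies the accumulated 16 bit sums with the corresponding 16 bit sub block scales and adds adjacent
// pairs, producing 32 bit values of scale * (q4 . q8) for each row and column combination.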
2319
2320 // Straighten out to make 4 row vectors for each sub block (the two sub blocks are accumulated together in the next step)
2321 __m512i iacc_row_0_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00_0, _mm512_shuffle_epi32(iacc_mat_01_0, (_MM_PERM_ENUM)78));
2322 __m512i iacc_row_1_0 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00_0, (_MM_PERM_ENUM)78), iacc_mat_01_0);
2323 __m512i iacc_row_2_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10_0, _mm512_shuffle_epi32(iacc_mat_11_0, (_MM_PERM_ENUM)78));
2324 __m512i iacc_row_3_0 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10_0, (_MM_PERM_ENUM)78), iacc_mat_11_0);
2325 __m512i iacc_row_0_1 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00_1, _mm512_shuffle_epi32(iacc_mat_01_1, (_MM_PERM_ENUM)78));
2326 __m512i iacc_row_1_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00_1, (_MM_PERM_ENUM)78), iacc_mat_01_1);
2327 __m512i iacc_row_2_1 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10_1, _mm512_shuffle_epi32(iacc_mat_11_1, (_MM_PERM_ENUM)78));
2328 __m512i iacc_row_3_1 = _mm512_mask_blend_epi32(0xCCCC,_mm512_shuffle_epi32(iacc_mat_10_1, (_MM_PERM_ENUM)78), iacc_mat_11_1);
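// Above, blend mask 0xCCCC takes dwords 2 and 3 of every group of four from the second operand, and shuffle immediate 78
// swaps dword pairs first, so the interleaved 00/01 (and 10/11) tiles are rearranged into one vector per output row that
// spans all sixteen columns.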
2329
2330 __m512i iacc_row_0 = _mm512_add_epi32(iacc_row_0_0, iacc_row_0_1);
2331 __m512i iacc_row_1 = _mm512_add_epi32(iacc_row_1_0, iacc_row_1_1);
2332 __m512i iacc_row_2 = _mm512_add_epi32(iacc_row_2_0, iacc_row_2_1);
2333 __m512i iacc_row_3 = _mm512_add_epi32(iacc_row_3_0, iacc_row_3_1);
2334
2335 // Load the scale values for all 4 Q8_K blocks and repeat them across lanes
2336 const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
2337 const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
2338 const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
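// In the FMAs below, _mm512_shuffle_ps with immediates 0, 85, 170 and 255 (0b00000000, 0b01010101, 0b10101010, 0b11111111)
// broadcasts the d value of row 0, 1, 2 and 3 respectively within every 128 bit lane, so each accumulator row is scaled by
// its own Q8_K d times the per column Q4_K d.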
2339
2340 // Multiply with appropriate scales and accumulate (for both d and dmin) below
2341 acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
2342 acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
2343 acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
2344 acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
2345
2346 __m512i iacc_row_min_0 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)0), mins_01);
2347 __m512i iacc_row_min_1 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)85), mins_01);
2348 __m512i iacc_row_min_2 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)170), mins_01);
2349 __m512i iacc_row_min_3 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)255), mins_01);
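// The min accumulators mirror the main ones: the broadcast bsums are multiplied with the corresponding sub block mins and
// _mm512_madd_epi16 adds the two sub blocks' contributions in pairs; the FMAs below then scale the result with dmin * d(q8).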
2350
2351 acc_min_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_0), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[rp * 4]);
2352 acc_min_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_1), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[rp * 4 + 1]);
2353 acc_min_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_2), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[rp * 4 + 2]);
2354 acc_min_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[rp * 4 + 3]);
2355 }
2356 }
2357 }
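// Final result per element: acc_rows holds the sum of d(q4) * sc * (q4 . q8) * d(q8) and acc_min_rows holds the sum of
// dmin * m * bsum(q8) * d(q8); subtracting the two completes the Q4_K dequantization w = d * sc * q - dmin * m.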
2358 // Store the accumulated values
2359 for (int i = 0; i < 16; i++) {
2360 _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
2361 }
2362 }
2363 }
2364
2365 for (; y < nr / 4; y++) {
2366
2367 const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);
2368
2369 // Take a group of two block_q4_Kx8 structures at each pass of the loop and perform the dot product operation
2370 for (int64_t x = 0; x < anc / 8; x += 2) {
2371
2372 const block_q4_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
2373 const block_q4_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
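// Each block_q4_Kx8 interleaves eight Q4_K columns, so the b_ptr_0/b_ptr_1 pair covers the sixteen output columns held by
// one 512 bit accumulator row.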
2374
2375 // Master FP accumulators
2376 __m512 acc_rows[4];
2377 for (int i = 0; i < 4; i++) {
2378 acc_rows[i] = _mm512_setzero_ps();
2379 }
2380
2381 __m512 acc_min_rows[4];
2382 for (int i = 0; i < 4; i++) {
2383 acc_min_rows[i] = _mm512_setzero_ps();
2384 }
2385
2386 // For super block
2387 for (int64_t b = 0; b < nb; b++) {
2388 // Scale values - Load the sixteen scale values from two block_q4_Kx8 structures
2389 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
2390
2391 // dmin values - Load the sixteen dmin values from two block_q4_Kx8 structures
2392 const __m512 col_dmin_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].dmin, b_ptr_1[b].dmin);
2393
2394 // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration
2395 for (int sb = 0; sb < QK_K / 64; sb++) {
2396
2397 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + sb * 256));
2398 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 32 + sb * 256));
2399 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 64 + sb * 256));
2400 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 96 + sb * 256));
2401 const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 128 + sb * 256));
2402 const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 160 + sb * 256));
2403 const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 192 + sb * 256));
2404 const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 224 + sb * 256));
2405
2406 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + sb * 256));
2407 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 32 + sb * 256));
2408 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 64 + sb * 256));
2409 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 96 + sb * 256));
2410 const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 128 + sb * 256));
2411 const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 160 + sb * 256));
2412 const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 192 + sb * 256));
2413 const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 224 + sb * 256));
2414
2415 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2416 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2417 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2418 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2419 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
2420 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
2421 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
2422 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
2423
2424 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
2425 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
2426 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
2427 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
2428 const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240);
2429 const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240);
2430 const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240);
2431 const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240);
2432
2433 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
2434 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
2435 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
2436 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
2437
2438 const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1);
2439 const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1);
2440 const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1);
2441 const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1);
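// Above, blend immediate 240 (0b11110000) keeps the upper four dwords of its second operand and the permutevar with
// requiredOrder (set up earlier in the function) supplies the complementary dword order, splitting each block_q4_Kx8 into the
// {0,1,4,5} and {2,3,6,7} column groups named in the variables; the _mm512_inserti32x8 calls then pair them with the matching
// groups of b_ptr_1 to form the 014589CD and 2367ABEF vectors.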
2442
2443 //4-bit -> 8-bit
2444 const __m512i rhs_mat_014589CD_00 = _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) B08(0-7) B09(0-7) B0C(0-7) B0D(0-7)
2445 const __m512i rhs_mat_2367ABEF_00 = _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) B0A(0-7) B0B(0-7) B0E(0-7) B0F(0-7)
2446 const __m512i rhs_mat_014589CD_01 = _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) B08(8-15) B09(8-15) B0C(8-15) B0D(8-15)
2447 const __m512i rhs_mat_2367ABEF_01 = _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) B0A(8-15) B0B(8-15) B0E(8-15) B0F(8-15)
2448
2449 const __m512i rhs_mat_014589CD_02 = _mm512_and_si512(rhs_raw_mat_014589CD_2, m4bexpanded); //B00(16-23) B01(16-23) B04(16-23) B05(16-23) B08(16-23) B09(16-23) B0C(16-23) B0D(16-23)
2450 const __m512i rhs_mat_2367ABEF_02 = _mm512_and_si512(rhs_raw_mat_2367ABEF_2, m4bexpanded); //B02(16-23) B03(16-23) B06(16-23) B07(16-23) B0A(16-23) B0B(16-23) B0E(16-23) B0F(16-23)
2451 const __m512i rhs_mat_014589CD_03 = _mm512_and_si512(rhs_raw_mat_014589CD_3, m4bexpanded); //B00(24-31) B01(24-31) B04(24-31) B05(24-31) B08(24-31) B09(24-31) B0C(24-31) B0D(24-31)
2452 const __m512i rhs_mat_2367ABEF_03 = _mm512_and_si512(rhs_raw_mat_2367ABEF_3, m4bexpanded); //B02(24-31) B03(24-31) B06(24-31) B07(24-31) B0A(24-31) B0B(24-31) B0E(24-31) B0F(24-31)
2453
2454 const __m512i rhs_mat_014589CD_10 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) B18(0-7) B19(0-7) B1C(0-7) B1D(0-7)
2455 const __m512i rhs_mat_2367ABEF_10 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) B1A(0-7) B1B(0-7) B1E(0-7) B1F(0-7)
2456 const __m512i rhs_mat_014589CD_11 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) B18(8-15) B19(8-15) B1C(8-15) B1D(8-15)
2457 const __m512i rhs_mat_2367ABEF_11 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) B1A(8-15) B1B(8-15) B1E(8-15) B1F(8-15)
2458
2459 const __m512i rhs_mat_014589CD_12 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m4bexpanded); //B10(16-23) B11(16-23) B14(16-23) B15(16-23) B18(16-23) B19(16-23) B1C(16-23) B1D(16-23)
2460 const __m512i rhs_mat_2367ABEF_12 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m4bexpanded); //B12(16-23) B13(16-23) B16(16-23) B17(16-23) B1A(16-23) B1B(16-23) B1E(16-23) B1F(16-23)
2461 const __m512i rhs_mat_014589CD_13 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m4bexpanded); //B10(24-31) B11(24-31) B14(24-31) B15(24-31) B18(24-31) B19(24-31) B1C(24-31) B1D(24-31)
2462 const __m512i rhs_mat_2367ABEF_13 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m4bexpanded); //B12(24-31) B13(24-31) B16(24-31) B17(24-31) B1A(24-31) B1B(24-31) B1E(24-31) B1F(24-31)
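// The AND with m4bexpanded keeps only the low nibble of every byte, yielding the first sub block's 4 bit values as unsigned
// 0..15, while the 4 bit right shift exposes the high nibbles that belong to the second sub block. No sign correction is
// needed because Q4_K applies its offset through the dmin * min term instead.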
2463
2464 // Shuffle pattern one - right side input
2465 const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3)
2466 const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3)
2467 const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11)
2468 const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11)
2469 const __m512i rhs_mat_014589CD_02_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_02, (_MM_PERM_ENUM)136); //B00(16-19) B01(16-19) B00(16-19) B01(16-19) B04(16-19) B05(16-19) B04(16-19) B05(16-19) B08(16-19) B09(16-19) B08(16-19) B09(16-19) B0C(16-19) B0D(16-19) B0C(16-19) B0D(16-19)
2470 const __m512i rhs_mat_2367ABEF_02_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_02, (_MM_PERM_ENUM)136); //B02(16-19) B03(16-19) B02(16-19) B03(16-19) B06(16-19) B07(16-19) B06(16-19) B07(16-19) B0A(16-19) B0B(16-19) B0A(16-19) B0B(16-19) B0E(16-19) B0F(16-19) B0E(16-19) B0F(16-19)
2471 const __m512i rhs_mat_014589CD_03_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_03, (_MM_PERM_ENUM)136); //B00(24-27) B01(24-27) B00(24-27) B01(24-27) B04(24-27) B05(24-27) B04(24-27) B05(24-27) B08(24-27) B09(24-27) B08(24-27) B09(24-27) B0C(24-27) B0D(24-27) B0C(24-27) B0D(24-27)
2472 const __m512i rhs_mat_2367ABEF_03_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_03, (_MM_PERM_ENUM)136); //B02(24-27) B03(24-27) B02(24-27) B03(24-27) B06(24-27) B07(24-27) B06(24-27) B07(24-27) B0A(24-27) B0B(24-27) B0A(24-27) B0B(24-27) B0E(24-27) B0F(24-27) B0E(24-27) B0F(24-27)
2473
2474 const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3)
2475 const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3)
2476 const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11)
2477 const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11)
2478 const __m512i rhs_mat_014589CD_12_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_12, (_MM_PERM_ENUM)136); //B10(16-19) B11(16-19) B10(16-19) B11(16-19) B14(16-19) B15(16-19) B14(16-19) B15(16-19) B18(16-19) B19(16-19) B18(16-19) B19(16-19) B1C(16-19) B1D(16-19) B1C(16-19) B1D(16-19)
2479 const __m512i rhs_mat_2367ABEF_12_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_12, (_MM_PERM_ENUM)136); //B12(16-19) B13(16-19) B12(16-19) B13(16-19) B16(16-19) B17(16-19) B16(16-19) B17(16-19) B1A(16-19) B1B(16-19) B1A(16-19) B1B(16-19) B1E(16-19) B1F(16-19) B1E(16-19) B1F(16-19)
2480 const __m512i rhs_mat_014589CD_13_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_13, (_MM_PERM_ENUM)136); //B10(24-27) B11(24-27) B10(24-27) B11(24-27) B14(24-27) B15(24-27) B14(24-27) B15(24-27) B18(24-27) B19(24-27) B18(24-27) B19(24-27) B1C(24-27) B1D(24-27) B1C(24-27) B1D(24-27)
2481 const __m512i rhs_mat_2367ABEF_13_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_13, (_MM_PERM_ENUM)136); //B12(24-27) B13(24-27) B12(24-27) B13(24-27) B16(24-27) B17(24-27) B16(24-27) B17(24-27) B1A(24-27) B1B(24-27) B1A(24-27) B1B(24-27) B1E(24-27) B1F(24-27) B1E(24-27) B1F(24-27)
2482
2483 // Shuffle pattern two - right side input
2484 const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7)
2485 const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7)
2486 const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15)
2487 const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15)
2488 const __m512i rhs_mat_014589CD_02_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_02, (_MM_PERM_ENUM)221); //B00(20-23) B01(20-23) B00(20-23) B01(20-23) B04(20-23) B05(20-23) B04(20-23) B05(20-23) B08(20-23) B09(20-23) B08(20-23) B09(20-23) B0C(20-23) B0D(20-23) B0C(20-23) B0D(20-23)
2489 const __m512i rhs_mat_2367ABEF_02_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_02, (_MM_PERM_ENUM)221); //B02(20-23) B03(20-23) B02(20-23) B03(20-23) B06(20-23) B07(20-23) B06(20-23) B07(20-23) B0A(20-23) B0B(20-23) B0A(20-23) B0B(20-23) B0E(20-23) B0F(20-23) B0E(20-23) B0F(20-23)
2490 const __m512i rhs_mat_014589CD_03_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_03, (_MM_PERM_ENUM)221); //B00(28-31) B01(28-31) B00(28-31) B01(28-31) B04(28-31) B05(28-31) B04(28-31) B05(28-31) B08(28-31) B09(28-31) B08(28-31) B09(28-31) B0C(28-31) B0D(28-31) B0C(28-31) B0D(28-31)
2491 const __m512i rhs_mat_2367ABEF_03_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_03, (_MM_PERM_ENUM)221); //B02(28-31) B03(28-31) B02(28-31) B03(28-31) B06(28-31) B07(28-31) B06(28-31) B07(28-31) B0A(28-31) B0B(28-31) B0A(28-31) B0B(28-31) B0E(28-31) B0F(28-31) B0E(28-31) B0F(28-31)
2492
2493 const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7)
2494 const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7)
2495 const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15)
2496 const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15)
2497 const __m512i rhs_mat_014589CD_12_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_12, (_MM_PERM_ENUM)221); //B10(20-23) B11(20-23) B10(20-23) B11(20-23) B14(20-23) B15(20-23) B14(20-23) B15(20-23) B18(20-23) B19(20-23) B18(20-23) B19(20-23) B1C(20-23) B1D(20-23) B1C(20-23) B1D(20-23)
2498 const __m512i rhs_mat_2367ABEF_12_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_12, (_MM_PERM_ENUM)221); //B12(20-23) B13(20-23) B12(20-23) B13(20-23) B16(20-23) B17(20-23) B16(20-23) B17(20-23) B1A(20-23) B1B(20-23) B1A(20-23) B1B(20-23) B1E(20-23) B1F(20-23) B1E(20-23) B1F(20-23)
2499 const __m512i rhs_mat_014589CD_13_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_13, (_MM_PERM_ENUM)221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31) B18(28-31) B19(28-31) B18(28-31) B19(28-31) B1C(28-31) B1D(28-31) B1C(28-31) B1D(28-31)
2500 const __m512i rhs_mat_2367ABEF_13_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_13, (_MM_PERM_ENUM)221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31) B1A(28-31) B1B(28-31) B1A(28-31) B1B(28-31) B1E(28-31) B1F(28-31) B1E(28-31) B1F(28-31)
2501
2502 uint32_t utmp_00[4], utmp_01[4], utmp_10[4], utmp_11[4];
2503
2504 // Scales and mins of corresponding sub blocks from different Q4_K structures are stored together
2505 // The block below extracts the first sub block's scales and mins from the different Q4_K structures for the current sb iteration
2506 memcpy(utmp_00, b_ptr_0[b].scales + 24 * sb, 12);
2507 utmp_00[3] = ((utmp_00[2] >> 4) & kmask2) | (((utmp_00[1] >> 6) & kmask3) << 4);
2508 const uint32_t uaux_00 = utmp_00[1] & kmask1;
2509 utmp_00[1] = (utmp_00[2] & kmask2) | (((utmp_00[0] >> 6) & kmask3) << 4);
2510 utmp_00[2] = uaux_00;
2511 utmp_00[0] &= kmask1;
2512
2513 // The block below extracts the second sub block's scales and mins from the different Q4_K structures for the current sb iteration
2514 memcpy(utmp_01, b_ptr_0[b].scales + 12 + sb * 24, 12);
2515 utmp_01[3] = ((utmp_01[2] >> 4) & kmask2) | (((utmp_01[1] >> 6) & kmask3) << 4);
2516 const uint32_t uaux_01 = utmp_01[1] & kmask1;
2517 utmp_01[1] = (utmp_01[2] & kmask2) | (((utmp_01[0] >> 6) & kmask3) << 4);
2518 utmp_01[2] = uaux_01;
2519 utmp_01[0] &= kmask1;
2520
2521 // The block below extracts the first sub block's scales and mins from the different Q4_K structures for the current sb iteration
2522 memcpy(utmp_10, b_ptr_1[b].scales + sb * 24, 12);
2523 utmp_10[3] = ((utmp_10[2] >> 4) & kmask2) | (((utmp_10[1] >> 6) & kmask3) << 4);
2524 const uint32_t uaux_10 = utmp_10[1] & kmask1;
2525 utmp_10[1] = (utmp_10[2] & kmask2) | (((utmp_10[0] >> 6) & kmask3) << 4);
2526 utmp_10[2] = uaux_10;
2527 utmp_10[0] &= kmask1;
2528
2529 // The block below extracts the second sub block's scales and mins from the different Q4_K structures for the current sb iteration
2530 memcpy(utmp_11, b_ptr_1[b].scales + 12 + sb * 24, 12);
2531 utmp_11[3] = ((utmp_11[2] >> 4) & kmask2) | (((utmp_11[1] >> 6) & kmask3) << 4);
2532 const uint32_t uaux_11 = utmp_11[1] & kmask1;
2533 utmp_11[1] = (utmp_11[2] & kmask2) | (((utmp_11[0] >> 6) & kmask3) << 4);
2534 utmp_11[2] = uaux_11;
2535 utmp_11[0] &= kmask1;
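// This is the usual Q4_K scale/min unpack: each 12 byte group packs eight 6 bit scales and eight 6 bit mins, and the kmask
// shifts and ANDs spread them out so that utmp[0..1] hold the scales and utmp[2..3] hold the mins as one byte each.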
2536
2537 // Scales of first sub block in the sb loop
2538 const __m256i mins_and_scales_0 = _mm256_set_epi32(utmp_10[3], utmp_10[2], utmp_10[1], utmp_10[0], utmp_00[3], utmp_00[2], utmp_00[1], utmp_00[0]);
2539 const __m512i scales_0 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));
2540
2541 // Scales of second sub block in the sb loop
2542 const __m256i mins_and_scales_1 = _mm256_set_epi32(utmp_11[3], utmp_11[2], utmp_11[1], utmp_11[0], utmp_01[3], utmp_01[2], utmp_01[1], utmp_01[0]);
2543 const __m512i scales_1 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1));
2544
2545 // Mins of first and second sub block of Q4_K block are arranged side by side
2546 const __m512i mins_01 = _mm512_cvtepu8_epi16(_mm256_unpacklo_epi8(_mm256_shuffle_epi32(mins_and_scales_0, 78), _mm256_shuffle_epi32(mins_and_scales_1, 78)));
2547
2548 const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
2549 const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
2550
2551 const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
2552 const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
2553
2554 // Load the four block_q8_K quantized values interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
2555 // Loaded as a set of 128 bit vectors and repeated into a 256 bit vector
2556 __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 * sb)));
2557 __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0);
2558 __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17);
2559 __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 32 + 256 * sb)));
2560 __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0);
2561 __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17);
2562 __m256i lhs_mat_ymm_0123_02 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 64 + 256 * sb)));
2563 __m256i lhs_mat_ymm_01_02 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_02, lhs_mat_ymm_0123_02, 0);
2564 __m256i lhs_mat_ymm_23_02 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_02, lhs_mat_ymm_0123_02, 17);
2565 __m256i lhs_mat_ymm_0123_03 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 96 + 256 * sb)));
2566 __m256i lhs_mat_ymm_01_03 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_03, lhs_mat_ymm_0123_03, 0);
2567 __m256i lhs_mat_ymm_23_03 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_03, lhs_mat_ymm_0123_03, 17);
2568 __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 128 + 256 * sb)));
2569 __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0);
2570 __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17);
2571 __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 160 + 256 * sb)));
2572 __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0);
2573 __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17);
2574 __m256i lhs_mat_ymm_0123_12 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 192 + 256 * sb)));
2575 __m256i lhs_mat_ymm_01_12 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_12, lhs_mat_ymm_0123_12, 0);
2576 __m256i lhs_mat_ymm_23_12 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_12, lhs_mat_ymm_0123_12, 17);
2577 __m256i lhs_mat_ymm_0123_13 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 224 + 256 * sb)));
2578 __m256i lhs_mat_ymm_01_13 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_13, lhs_mat_ymm_0123_13, 0);
2579 __m256i lhs_mat_ymm_23_13 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_13, lhs_mat_ymm_0123_13, 17);
2580
2581 // Loaded as a set of 128 bit vectors, repeated into a 256 bit vector and then repeated again into a 512 bit vector
2582 __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1);
2583 __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1);
2584 __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1);
2585 __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1);
2586 __m512i lhs_mat_01_02 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_02), lhs_mat_ymm_01_02, 1);
2587 __m512i lhs_mat_23_02 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_02), lhs_mat_ymm_23_02, 1);
2588 __m512i lhs_mat_01_03 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_03), lhs_mat_ymm_01_03, 1);
2589 __m512i lhs_mat_23_03 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_03), lhs_mat_ymm_23_03, 1);
2590
2591 __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1);
2592 __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1);
2593 __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1);
2594 __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1);
2595 __m512i lhs_mat_01_12 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_12), lhs_mat_ymm_01_12, 1);
2596 __m512i lhs_mat_23_12 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_12), lhs_mat_ymm_23_12, 1);
2597 __m512i lhs_mat_01_13 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_13), lhs_mat_ymm_01_13, 1);
2598 __m512i lhs_mat_23_13 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_13), lhs_mat_ymm_23_13, 1);
2599
2600 // Bsums are loaded - four bsums are loaded (for two sub blocks) for the different Q8_K blocks
2601 __m256i lhs_bsums_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].bsums + 16 * sb)));
2602 __m256i lhs_bsums_hsum_ymm_0123_01 = _mm256_castsi128_si256(_mm_hadd_epi16(_mm256_castsi256_si128(lhs_bsums_0123_01), _mm256_extractf128_si256(lhs_bsums_0123_01, 1)));
2603 lhs_bsums_hsum_ymm_0123_01 = _mm256_permute2x128_si256(lhs_bsums_hsum_ymm_0123_01, lhs_bsums_hsum_ymm_0123_01, 0);
2604 __m512i lhs_bsums_hsum_0123_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_hsum_ymm_0123_01), lhs_bsums_hsum_ymm_0123_01, 1);
2605
2606 // Shuffle pattern one - left side input
2607 const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
2608 const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3)
2609 const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
2610 const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11)
2611 const __m512i lhs_mat_01_02_sp1 = _mm512_shuffle_epi32(lhs_mat_01_02, (_MM_PERM_ENUM)160); //A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19)
2612 const __m512i lhs_mat_23_02_sp1 = _mm512_shuffle_epi32(lhs_mat_23_02, (_MM_PERM_ENUM)160); //A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19) A02(16-19) A02(16-19) A03(16-19) A03(16-19)
2613 const __m512i lhs_mat_01_03_sp1 = _mm512_shuffle_epi32(lhs_mat_01_03, (_MM_PERM_ENUM)160); //A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27)
2614 const __m512i lhs_mat_23_03_sp1 = _mm512_shuffle_epi32(lhs_mat_23_03, (_MM_PERM_ENUM)160); //A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27) A02(24-27) A02(24-27) A03(24-27) A03(24-27)
2615
2616 const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
2617 const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3)
2618 const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
2619 const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11)
2620 const __m512i lhs_mat_01_12_sp1 = _mm512_shuffle_epi32(lhs_mat_01_12, (_MM_PERM_ENUM)160); //A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19)
2621 const __m512i lhs_mat_23_12_sp1 = _mm512_shuffle_epi32(lhs_mat_23_12, (_MM_PERM_ENUM)160); //A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19) A12(16-19) A12(16-19) A13(16-19) A13(16-19)
2622 const __m512i lhs_mat_01_13_sp1 = _mm512_shuffle_epi32(lhs_mat_01_13, (_MM_PERM_ENUM)160); //A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27)
2623 const __m512i lhs_mat_23_13_sp1 = _mm512_shuffle_epi32(lhs_mat_23_13, (_MM_PERM_ENUM)160); //A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27) A12(24-27) A12(24-27) A13(24-27) A13(24-27)
2624
2625 const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
2626 const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7)
2627 const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
2628 const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15)
2629 const __m512i lhs_mat_01_02_sp2 = _mm512_shuffle_epi32(lhs_mat_01_02, (_MM_PERM_ENUM)245); //A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23)
2630 const __m512i lhs_mat_23_02_sp2 = _mm512_shuffle_epi32(lhs_mat_23_02, (_MM_PERM_ENUM)245); //A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23) A02(20-23) A02(20-23) A03(20-23) A03(20-23)
2631 const __m512i lhs_mat_01_03_sp2 = _mm512_shuffle_epi32(lhs_mat_01_03, (_MM_PERM_ENUM)245); //A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31)
2632 const __m512i lhs_mat_23_03_sp2 = _mm512_shuffle_epi32(lhs_mat_23_03, (_MM_PERM_ENUM)245); //A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31) A02(28-31) A02(28-31) A03(28-31) A03(28-31)
2633
2634 const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
2635 const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7)
2636 const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
2637 const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15)
2638 const __m512i lhs_mat_01_12_sp2 = _mm512_shuffle_epi32(lhs_mat_01_12, (_MM_PERM_ENUM)245); //A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23)
2639 const __m512i lhs_mat_23_12_sp2 = _mm512_shuffle_epi32(lhs_mat_23_12, (_MM_PERM_ENUM)245); //A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23) A12(20-23) A12(20-23) A13(20-23) A13(20-23)
2640 const __m512i lhs_mat_01_13_sp2 = _mm512_shuffle_epi32(lhs_mat_01_13, (_MM_PERM_ENUM)245); //A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31)
2641 const __m512i lhs_mat_23_13_sp2 = _mm512_shuffle_epi32(lhs_mat_23_13, (_MM_PERM_ENUM)245); //A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31) A12(28-31) A12(28-31) A13(28-31) A13(28-31)
2642
                    // The values arranged in the shuffle patterns are combined with a dot-product style operation within each 32 bit lane: corresponding bytes are multiplied and the adjacent products are summed
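                    // Note: _mm512_maddubs_epi16 multiplies the unsigned 4-bit RHS bytes with the signed Q8 LHS bytes and sums adjacent
                    // pairs into 16 bit lanes; adding the four fragments (00..03 / 10..13) and, later, both shuffle patterns accumulates
                    // the full 32-value sub block in 16 bit precision before _mm512_madd_epi16 applies the sub block scales and widens to 32 bit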
2644 __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp1, lhs_mat_01_03_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp1, lhs_mat_01_02_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1));
2645 __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp1, lhs_mat_01_03_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp1, lhs_mat_01_02_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1));
2646 __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp1, lhs_mat_23_03_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp1, lhs_mat_23_02_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1));
2647 __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp1, lhs_mat_23_03_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp1, lhs_mat_23_02_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1));
2648 __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp1, lhs_mat_01_13_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp1, lhs_mat_01_12_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_01_10_sp1));
2649 __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp1, lhs_mat_01_13_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp1, lhs_mat_01_12_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1));
2650 __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp1, lhs_mat_23_13_sp1), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp1, lhs_mat_23_12_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1));
2651 __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp1, lhs_mat_23_13_sp1), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp1, lhs_mat_23_12_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1));
2652
2653 __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp2, lhs_mat_01_03_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp2, lhs_mat_01_02_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2));
2654 __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp2, lhs_mat_01_03_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp2, lhs_mat_01_02_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2));
2655 __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_03_sp2, lhs_mat_23_03_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_02_sp2, lhs_mat_23_02_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2));
2656 __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_03_sp2, lhs_mat_23_03_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_02_sp2, lhs_mat_23_02_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2));
2657 __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp2, lhs_mat_01_13_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp2, lhs_mat_01_12_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2));
2658 __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp2, lhs_mat_01_13_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp2, lhs_mat_01_12_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2));
2659 __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_13_sp2, lhs_mat_23_13_sp2), _mm512_maddubs_epi16(rhs_mat_014589CD_12_sp2, lhs_mat_23_12_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2)), _mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2));
2660 __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_add_epi16(_mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_13_sp2, lhs_mat_23_13_sp2), _mm512_maddubs_epi16(rhs_mat_2367ABEF_12_sp2, lhs_mat_23_12_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2)), _mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2));
2661
                    // Outputs of both shuffle patterns are added in order to sum the dot product outputs of all 32 values in the sub block
                    __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
                    __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
                    __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
                    __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);

                    __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
                    __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
                    __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
                    __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);

                    iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0);
                    iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0);
                    iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0);
                    iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0);

                    iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1);
                    iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1);
                    iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1);
                    iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1);

                    // Straighten out to make 4 row vectors (4 for each sub block which are accumulated together in the next step)
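                    // The 2x2 tile results are interleaved back into per-row vectors: shuffle immediate 78 (0b01001110) swaps the two
                    // 64 bit halves of every 128 bit lane, and blend mask 0xCCCC takes the upper two dwords of each group of four from
                    // the second operand, so each row vector collects its own columns from both tile accumulators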
                    __m512i iacc_row_0_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00_0, _mm512_shuffle_epi32(iacc_mat_01_0, (_MM_PERM_ENUM)78));
                    __m512i iacc_row_1_0 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00_0, (_MM_PERM_ENUM)78), iacc_mat_01_0);
                    __m512i iacc_row_2_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10_0, _mm512_shuffle_epi32(iacc_mat_11_0, (_MM_PERM_ENUM)78));
                    __m512i iacc_row_3_0 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10_0, (_MM_PERM_ENUM)78), iacc_mat_11_0);
                    __m512i iacc_row_0_1 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00_1, _mm512_shuffle_epi32(iacc_mat_01_1, (_MM_PERM_ENUM)78));
                    __m512i iacc_row_1_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00_1, (_MM_PERM_ENUM)78), iacc_mat_01_1);
                    __m512i iacc_row_2_1 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10_1, _mm512_shuffle_epi32(iacc_mat_11_1, (_MM_PERM_ENUM)78));
                    __m512i iacc_row_3_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10_1, (_MM_PERM_ENUM)78), iacc_mat_11_1);

                    __m512i iacc_row_0 = _mm512_add_epi32(iacc_row_0_0, iacc_row_0_1);
                    __m512i iacc_row_1 = _mm512_add_epi32(iacc_row_1_0, iacc_row_1_1);
                    __m512i iacc_row_2 = _mm512_add_epi32(iacc_row_2_0, iacc_row_2_1);
                    __m512i iacc_row_3 = _mm512_add_epi32(iacc_row_3_0, iacc_row_3_1);
2697
                    // Load the scale values of the 4 Q8_K blocks and repeat them across lanes
                    const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
                    const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                    const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);

                    // Multiply with the appropriate scales and accumulate (for both d and dmin) below
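                    // _mm512_shuffle_ps with immediates 0, 85, 170 and 255 broadcasts element 0, 1, 2 or 3 of each 128 bit chunk, i.e. the
                    // Q8_K scale d of row 0..3, so each row accumulator is scaled by col_scale (the Q4_K d values) times its own row scale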
                    acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                    acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                    acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
                    acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);

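                    // Q4_K dequantizes as d * sc * q - dmin * m, so the second term of the dot product only needs the per sub block sums
                    // of the Q8 values (bsums): each min is multiplied with the matching broadcast bsum and accumulated with col_dmin
                    // times the row scale, to be subtracted from the d-term accumulator when the tile is stored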
                    __m512i iacc_row_min_0 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)0), mins_01);
                    __m512i iacc_row_min_1 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)85), mins_01);
                    __m512i iacc_row_min_2 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)170), mins_01);
                    __m512i iacc_row_min_3 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_hsum_0123_01, (_MM_PERM_ENUM)255), mins_01);

                    acc_min_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_0), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[0]);
                    acc_min_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_1), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[1]);
                    acc_min_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_2), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[2]);
                    acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
                }
            }
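            // The final tile result is acc_rows - acc_min_rows, i.e. sum(d * sc * q * q8) * d8 minus sum(dmin * m * bsum) * d8 for every
            // row/column pair, which matches the Q4_K x Q8_K dot product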
            // Store the accumulated values
            for (int i = 0; i < 4; i++) {
                _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
            }
        }
    }
    if (anc != nc) {
        xstart = anc/8;
        y = 0;
    }
#endif //AVX512F
2731
    // Take a group of four block_q8_Kx4 structures at each pass of the loop and perform the dot product operation
    for (; y < anr / 4; y += 4) {

        const block_q8_Kx4 * a_ptrs[4];

        a_ptrs[0] = a_ptr_start + (y * nb);
        for (int i = 0; i < 3; ++i) {
            a_ptrs[i + 1] = a_ptrs[i] + nb;
        }

        // Take a group of eight block_q4_Kx8 structures at each pass of the loop and perform the dot product operation
        for (int64_t x = xstart; x < nc / 8; x++) {

            const block_q4_Kx8 * b_ptr = b_ptr_start + (x * b_nb);
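            // Each iteration of the x loop produces a 16x8 output tile: the four row groups in a_ptrs hold 4 rows each, and the
            // accumulators below keep one 8-wide float vector per output row for the d term and for the dmin correction term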
2746
            // Master FP accumulators
            __m256 acc_rows[16];
            for (int i = 0; i < 16; i++) {
                acc_rows[i] = _mm256_setzero_ps();
            }

            __m256 acc_min_rows[16];
            for (int i = 0; i < 16; i++) {
                acc_min_rows[i] = _mm256_setzero_ps();
            }

            // For super block
            for (int64_t b = 0; b < nb; b++) {

                // Scale values - Load the eight scale values of block_q4_Kx8
                const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);

                // dmin values - Load the eight dmin values of block_q4_Kx8
                const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);

                // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration
                for (int sb = 0; sb < QK_K / 64; sb++) {
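                    // Two 32-value sub blocks across all eight columns are consumed per iteration: 8 columns * 64 values * 4 bits
                    // = 256 bytes of packed nibbles, which is why the qs offsets below advance by sb * 256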
2769
                    // Load the quantized values of the eight block_q4_K columns for the two sub blocks; they are interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7
                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + sb * 256));
                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32 + sb * 256));
                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64 + sb * 256));
                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96 + sb * 256));
                    const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 128 + sb * 256));
                    const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 160 + sb * 256));
                    const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 192 + sb * 256));
                    const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 224 + sb * 256));
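                    // _mm256_blend_epi32 with mask 240 (0xF0) keeps dwords 0-3 from its first operand and takes dwords 4-7 from its
                    // second; combined with the _mm256_permutevar8x32_epi32 reorder (requiredOrder is prepared earlier in this function
                    // to swap the two 128 bit halves), this regroups the interleaved columns into the B0B1B4B5 / B2B3B6B7 layout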
2779
                    // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
                    const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
                    const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
                    const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
                    const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
                    const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
                    const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
                    const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
                    const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
2789
                    // 4-bit -> 8-bit
                    // First sub block of the two sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_00 = _mm256_and_si256(rhs_raw_mat_0145_0, m4b); //B00(0-7) B01(0-7) B04(0-7) B05(0-7)
                    const __m256i rhs_mat_2367_00 = _mm256_and_si256(rhs_raw_mat_2367_0, m4b); //B02(0-7) B03(0-7) B06(0-7) B07(0-7)

                    const __m256i rhs_mat_0145_01 = _mm256_and_si256(rhs_raw_mat_0145_1, m4b); //B00(8-15) B01(8-15) B04(8-15) B05(8-15)
                    const __m256i rhs_mat_2367_01 = _mm256_and_si256(rhs_raw_mat_2367_1, m4b); //B02(8-15) B03(8-15) B06(8-15) B07(8-15)

                    const __m256i rhs_mat_0145_02 = _mm256_and_si256(rhs_raw_mat_0145_2, m4b); //B00(16-23) B01(16-23) B04(16-23) B05(16-23)
                    const __m256i rhs_mat_2367_02 = _mm256_and_si256(rhs_raw_mat_2367_2, m4b); //B02(16-23) B03(16-23) B06(16-23) B07(16-23)

                    const __m256i rhs_mat_0145_03 = _mm256_and_si256(rhs_raw_mat_0145_3, m4b); //B00(24-31) B01(24-31) B04(24-31) B05(24-31)
                    const __m256i rhs_mat_2367_03 = _mm256_and_si256(rhs_raw_mat_2367_3, m4b); //B02(24-31) B03(24-31) B06(24-31) B07(24-31)

                    // Second sub block of the two sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b); //B10(0-7) B11(0-7) B14(0-7) B15(0-7)
                    const __m256i rhs_mat_2367_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b); //B12(0-7) B13(0-7) B16(0-7) B17(0-7)

                    const __m256i rhs_mat_0145_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b); //B10(8-15) B11(8-15) B14(8-15) B15(8-15)
                    const __m256i rhs_mat_2367_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b); //B12(8-15) B13(8-15) B16(8-15) B17(8-15)

                    const __m256i rhs_mat_0145_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4b); //B10(16-23) B11(16-23) B14(16-23) B15(16-23)
                    const __m256i rhs_mat_2367_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4b); //B12(16-23) B13(16-23) B16(16-23) B17(16-23)

                    const __m256i rhs_mat_0145_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4b); //B10(24-31) B11(24-31) B14(24-31) B15(24-31)
                    const __m256i rhs_mat_2367_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4b); //B12(24-31) B13(24-31) B16(24-31) B17(24-31)
2816
                    // Shuffle pattern one - right side input
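                    // Shuffle immediates used below: 136 (0b10001000) picks dwords {0,2,0,2} of each 128 bit lane and 221 (0b11011101)
                    // picks {1,3,1,3} for the RHS, while 160 (0b10100000) picks {0,0,2,2} and 245 (0b11110101) picks {1,1,3,3} for the
                    // LHS; pairing pattern one of both sides (and pattern two of both sides) lines up matching 4-byte groups for maddubs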
2818 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)
2819 const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3)
2820
2821 const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11)
2822 const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11)
2823
2824 const __m256i rhs_mat_0145_02_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_02, 136); //B00(16-19) B01(16-19) B00(16-19) B01(16-19) B04(16-19) B05(16-19) B04(16-19) B05(16-19)
2825 const __m256i rhs_mat_2367_02_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_02, 136); //B02(16-19) B03(16-19) B02(16-19) B03(16-19) B06(16-19) B07(16-19) B06(16-19) B07(16-19)
2826
2827 const __m256i rhs_mat_0145_03_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_03, 136); //B00(24-27) B01(24-27) B00(24-27) B01(24-27) B04(24-27) B05(24-27) B04(24-27) B05(24-27)
2828 const __m256i rhs_mat_2367_03_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_03, 136); //B02(24-27) B03(24-27) B02(24-27) B03(24-27) B06(24-27) B07(24-27) B06(24-27) B07(24-27)
2829
2830 const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3)
2831 const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3)
2832
2833 const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11)
2834 const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11)
2835
2836 const __m256i rhs_mat_0145_12_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_12, 136); //B10(16-19) B11(16-19) B10(16-19) B11(16-19) B14(16-19) B15(16-19) B14(16-19) B15(16-19)
2837 const __m256i rhs_mat_2367_12_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_12, 136); //B12(16-19) B13(16-19) B12(16-19) B13(16-19) B16(16-19) B17(16-19) B16(16-19) B17(16-19)
2838
2839 const __m256i rhs_mat_0145_13_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_13, 136); //B10(24-27) B11(24-27) B10(24-27) B11(24-27) B14(24-27) B15(24-27) B14(24-27) B15(24-27)
2840 const __m256i rhs_mat_2367_13_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_13, 136); //B12(24-27) B13(24-27) B12(24-27) B13(24-27) B16(24-27) B17(24-27) B16(24-27) B17(24-27)
2841
2842
2843 // Shuffle pattern two - right side input
2844 const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7)
2845 const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7)
2846
2847 const __m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15)
2848 const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15)
2849
2850 const __m256i rhs_mat_0145_02_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_02, 221); //B00(20-23) B01(20-23) B00(20-23) B01(20-23) B04(20-23) B05(20-23) B04(20-23) B05(20-23)
2851 const __m256i rhs_mat_2367_02_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_02, 221); //B02(20-23) B03(20-23) B02(20-23) B03(20-23) B06(20-23) B07(20-23) B06(20-23) B07(20-23)
2852
2853 const __m256i rhs_mat_0145_03_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_03, 221); //B00(28-31) B01(28-31) B00(28-31) B01(28-31) B04(28-31) B05(28-31) B04(28-31) B05(28-31)
2854 const __m256i rhs_mat_2367_03_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_03, 221); //B02(28-31) B03(28-31) B02(28-31) B03(28-31) B06(28-31) B07(28-31) B06(28-31) B07(28-31)
2855
2856 const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7)
2857 const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7)
2858
2859 const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15)
2860 const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15)
2861
2862 const __m256i rhs_mat_0145_12_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_12, 221); //B10(20-23) B11(20-23) B10(20-23) B11(20-23) B14(20-23) B15(20-23) B14(20-23) B15(20-23)
2863 const __m256i rhs_mat_2367_12_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_12, 221); //B12(20-23) B13(20-23) B12(20-23) B13(20-23) B16(20-23) B17(20-23) B16(20-23) B17(20-23)
2864
2865 const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31)
2866 const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31)
2867
                    uint32_t utmp_0[4], utmp_1[4];

                    // Scales and Mins of corresponding sub blocks from different Q4_K structures are stored together
                    // The block below extracts the scales and mins of the first of the two sub blocks from the different Q4_K structures for this sb iteration
                    memcpy(utmp_0, b_ptr[b].scales + 24 * sb, 12);
                    utmp_0[3] = ((utmp_0[2] >> 4) & kmask2) | (((utmp_0[1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_0 = utmp_0[1] & kmask1;
                    utmp_0[1] = (utmp_0[2] & kmask2) | (((utmp_0[0] >> 6) & kmask3) << 4);
                    utmp_0[2] = uaux_0;
                    utmp_0[0] &= kmask1;

                    // The block below extracts the scales and mins of the second of the two sub blocks from the different Q4_K structures for this sb iteration
                    memcpy(utmp_1, b_ptr[b].scales + 12 + sb * 24, 12);
                    utmp_1[3] = ((utmp_1[2] >> 4) & kmask2) | (((utmp_1[1] >> 6) & kmask3) << 4);
                    const uint32_t uaux_1 = utmp_1[1] & kmask1;
                    utmp_1[1] = (utmp_1[2] & kmask2) | (((utmp_1[0] >> 6) & kmask3) << 4);
                    utmp_1[2] = uaux_1;
                    utmp_1[0] &= kmask1;

                    // Scales of first sub block in the sb loop
                    const __m128i mins_and_scales_0 = _mm_set_epi32(utmp_0[3], utmp_0[2], utmp_0[1], utmp_0[0]);
                    const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));

                    // Scales of second sub block in the sb loop
                    const __m128i mins_and_scales_1 = _mm_set_epi32(utmp_1[3], utmp_1[2], utmp_1[1], utmp_1[0]);
                    const __m256i scales_1 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1));

                    // Mins of first and second sub block of Q4_K block are arranged side by side
                    const __m256i mins_01 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(_mm_shuffle_epi32(mins_and_scales_0, 78), _mm_shuffle_epi32(mins_and_scales_1, 78)));

                    const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68);
                    const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238);

                    const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68);
                    const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238);
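                    // After the unpack, bytes 0-7 of mins_and_scales_x hold the eight column scales and bytes 8-15 the eight mins; the
                    // 78 shuffle swaps the 64 bit halves so the mins come first before they are interleaved and widened to 16 bit.
                    // Shuffle immediates 68 (dwords {0,1,0,1}) and 238 (dwords {2,3,2,3}) replicate the 16 bit scales so they line up
                    // with the B0B1..B4B5 and B2B3..B6B7 column ordering used by the dot products below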
2903
                    for (int rp = 0; rp < 4; rp++) {

                        // Load the quantized values of the four block_q8_K rows, which are interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
                        __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 256 * sb)));
                        __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0);
                        __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17);
                        __m256i lhs_mat_0123_01 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32 + 256 * sb)));
                        __m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0);
                        __m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17);
                        __m256i lhs_mat_0123_02 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64 + 256 * sb)));
                        __m256i lhs_mat_01_02 = _mm256_permute2f128_si256(lhs_mat_0123_02, lhs_mat_0123_02, 0);
                        __m256i lhs_mat_23_02 = _mm256_permute2f128_si256(lhs_mat_0123_02, lhs_mat_0123_02, 17);
                        __m256i lhs_mat_0123_03 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96 + 256 * sb)));
                        __m256i lhs_mat_01_03 = _mm256_permute2f128_si256(lhs_mat_0123_03, lhs_mat_0123_03, 0);
                        __m256i lhs_mat_23_03 = _mm256_permute2f128_si256(lhs_mat_0123_03, lhs_mat_0123_03, 17);
                        __m256i lhs_mat_0123_10 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 128 + 256 * sb)));
                        __m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0);
                        __m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17);
                        __m256i lhs_mat_0123_11 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 160 + 256 * sb)));
                        __m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0);
                        __m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17);
                        __m256i lhs_mat_0123_12 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 192 + 256 * sb)));
                        __m256i lhs_mat_01_12 = _mm256_permute2f128_si256(lhs_mat_0123_12, lhs_mat_0123_12, 0);
                        __m256i lhs_mat_23_12 = _mm256_permute2f128_si256(lhs_mat_0123_12, lhs_mat_0123_12, 17);
                        __m256i lhs_mat_0123_13 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 224 + 256 * sb)));
                        __m256i lhs_mat_01_13 = _mm256_permute2f128_si256(lhs_mat_0123_13, lhs_mat_0123_13, 0);
                        __m256i lhs_mat_23_13 = _mm256_permute2f128_si256(lhs_mat_0123_13, lhs_mat_0123_13, 17);

                        // Bsums are loaded - four bsums are loaded (for two sub blocks) for the different Q8_K blocks
                        __m256i lhs_bsums_0123_01 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].bsums + 16 * sb)));
                        __m256i lhs_bsums_hsum_0123_01 = _mm256_castsi128_si256(_mm_hadd_epi16(_mm256_castsi256_si128(lhs_bsums_0123_01), _mm256_extractf128_si256(lhs_bsums_0123_01, 1)));
                        lhs_bsums_hsum_0123_01 = _mm256_permute2x128_si256(lhs_bsums_hsum_0123_01, lhs_bsums_hsum_0123_01, 0);
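                        // Q8_K stores 16 bit sums of every 16 quantized values in bsums; the horizontal add folds adjacent pairs so that
                        // each 32 bit dword ends up holding the two sub block sums of one of the four rows, and the result is broadcast
                        // to both 128 bit halves so it can be shuffled per row further below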
2937
2938 // Shuffle pattern one - left side input
2939 const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
2940 const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3)
2941
2942 const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
2943 const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11)
2944
2945 const __m256i lhs_mat_01_02_sp1 = _mm256_shuffle_epi32(lhs_mat_01_02, 160); //A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19)
2946 const __m256i lhs_mat_23_02_sp1 = _mm256_shuffle_epi32(lhs_mat_23_02, 160); //A02(16-19) A03(16-19) A02(16-19) A03(16-19) A02(16-19) A03(16-19) A02(16-19) A03(16-19)
2947
2948 const __m256i lhs_mat_01_03_sp1 = _mm256_shuffle_epi32(lhs_mat_01_03, 160); //A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27)
2949 const __m256i lhs_mat_23_03_sp1 = _mm256_shuffle_epi32(lhs_mat_23_03, 160); //A02(24-27) A03(24-27) A02(24-27) A03(24-27) A02(24-27) A03(24-27) A02(24-27) A03(24-27)
2950
2951 const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
2952 const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3)
2953
2954 const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
2955 const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11)
2956
2957 const __m256i lhs_mat_01_12_sp1 = _mm256_shuffle_epi32(lhs_mat_01_12, 160); //A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19)
2958 const __m256i lhs_mat_23_12_sp1 = _mm256_shuffle_epi32(lhs_mat_23_12, 160); //A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19)
2959
2960 const __m256i lhs_mat_01_13_sp1 = _mm256_shuffle_epi32(lhs_mat_01_13, 160); //A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27)
2961 const __m256i lhs_mat_23_13_sp1 = _mm256_shuffle_epi32(lhs_mat_23_13, 160); //A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27)
2962
2963 // Shuffle pattern two- left side input
2964 const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
2965 const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7)
2966
2967 const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
2968 const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15)
2969
2970 const __m256i lhs_mat_01_02_sp2 = _mm256_shuffle_epi32(lhs_mat_01_02, 245); //A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23)
2971 const __m256i lhs_mat_23_02_sp2 = _mm256_shuffle_epi32(lhs_mat_23_02, 245); //A02(20-23) A03(20-23) A02(20-23) A03(20-23) A02(20-23) A03(20-23) A02(20-23) A03(20-23)
2972
2973 const __m256i lhs_mat_01_03_sp2 = _mm256_shuffle_epi32(lhs_mat_01_03, 245); //A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31)
2974 const __m256i lhs_mat_23_03_sp2 = _mm256_shuffle_epi32(lhs_mat_23_03, 245); //A02(28-31) A03(28-31) A02(28-31) A03(28-31) A02(28-31) A03(28-31) A02(28-31) A03(28-31)
2975
2976 const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
2977 const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7)
2978
2979 const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
2980 const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15)
2981
2982 const __m256i lhs_mat_01_12_sp2 = _mm256_shuffle_epi32(lhs_mat_01_12, 245); //A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23)
2983 const __m256i lhs_mat_23_12_sp2 = _mm256_shuffle_epi32(lhs_mat_23_12, 245); //A12(20-23) A13(20-23) A12(20-23) A13(20-23) A12(20-23) A13(20-23) A12(20-23) A13(20-23)
2984
2985 const __m256i lhs_mat_01_13_sp2 = _mm256_shuffle_epi32(lhs_mat_01_13, 245); //A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31)
2986 const __m256i lhs_mat_23_13_sp2 = _mm256_shuffle_epi32(lhs_mat_23_13, 245); //A12(28-31) A13(28-31) A12(28-31) A13(28-31) A12(28-31) A13(28-31) A12(28-31) A13(28-31)
2987
                        // The values arranged in the shuffle patterns are combined with a dot-product style operation within each 32 bit lane: corresponding bytes are multiplied and the adjacent products are summed
                        __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp1, lhs_mat_01_03_sp1), _mm256_maddubs_epi16(rhs_mat_0145_02_sp1, lhs_mat_01_02_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1));
                        __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp1, lhs_mat_01_03_sp1), _mm256_maddubs_epi16(rhs_mat_2367_02_sp1, lhs_mat_01_02_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1));
                        __m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp1, lhs_mat_23_03_sp1), _mm256_maddubs_epi16(rhs_mat_0145_02_sp1, lhs_mat_23_02_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_23_01_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_23_00_sp1));
                        __m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp1, lhs_mat_23_03_sp1), _mm256_maddubs_epi16(rhs_mat_2367_02_sp1, lhs_mat_23_02_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_23_01_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_23_00_sp1));
                        __m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp1, lhs_mat_01_13_sp1), _mm256_maddubs_epi16(rhs_mat_0145_12_sp1, lhs_mat_01_12_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_01_11_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_01_10_sp1));
                        __m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp1, lhs_mat_01_13_sp1), _mm256_maddubs_epi16(rhs_mat_2367_12_sp1, lhs_mat_01_12_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_01_11_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_01_10_sp1));
                        __m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp1, lhs_mat_23_13_sp1), _mm256_maddubs_epi16(rhs_mat_0145_12_sp1, lhs_mat_23_12_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_23_11_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_23_10_sp1));
                        __m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp1, lhs_mat_23_13_sp1), _mm256_maddubs_epi16(rhs_mat_2367_12_sp1, lhs_mat_23_12_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_23_11_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_23_10_sp1));

                        __m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp2, lhs_mat_01_03_sp2), _mm256_maddubs_epi16(rhs_mat_0145_02_sp2, lhs_mat_01_02_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_01_01_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_01_00_sp2));
                        __m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp2, lhs_mat_01_03_sp2), _mm256_maddubs_epi16(rhs_mat_2367_02_sp2, lhs_mat_01_02_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_01_01_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_01_00_sp2));
                        __m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp2, lhs_mat_23_03_sp2), _mm256_maddubs_epi16(rhs_mat_0145_02_sp2, lhs_mat_23_02_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_23_01_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_23_00_sp2));
                        __m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp2, lhs_mat_23_03_sp2), _mm256_maddubs_epi16(rhs_mat_2367_02_sp2, lhs_mat_23_02_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_23_01_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_23_00_sp2));
                        __m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp2, lhs_mat_01_13_sp2), _mm256_maddubs_epi16(rhs_mat_0145_12_sp2, lhs_mat_01_12_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_01_11_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_01_10_sp2));
                        __m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp2, lhs_mat_01_13_sp2), _mm256_maddubs_epi16(rhs_mat_2367_12_sp2, lhs_mat_01_12_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_01_11_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_01_10_sp2));
                        __m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp2, lhs_mat_23_13_sp2), _mm256_maddubs_epi16(rhs_mat_0145_12_sp2, lhs_mat_23_12_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_23_11_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_23_10_sp2));
                        __m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp2, lhs_mat_23_13_sp2), _mm256_maddubs_epi16(rhs_mat_2367_12_sp2, lhs_mat_23_12_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_23_11_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_23_10_sp2));
3006
                        // Outputs of both shuffle patterns are added in order to sum the dot product outputs of all 32 values in the sub block
                        __m256i iacc_mat_00_0 = _mm256_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
                        __m256i iacc_mat_01_0 = _mm256_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
                        __m256i iacc_mat_10_0 = _mm256_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
                        __m256i iacc_mat_11_0 = _mm256_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);

                        __m256i iacc_mat_00_1 = _mm256_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
                        __m256i iacc_mat_01_1 = _mm256_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
                        __m256i iacc_mat_10_1 = _mm256_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
                        __m256i iacc_mat_11_1 = _mm256_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);

                        // Multiply the 16 bit sums with the interleaved sub block scales and widen the pairwise sums to 32 bit
                        iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0);
                        iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0);
                        iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0);
                        iacc_mat_11_0 = _mm256_madd_epi16(iacc_mat_11_0, scale_2367_0);

                        iacc_mat_00_1 = _mm256_madd_epi16(iacc_mat_00_1, scale_0145_1);
                        iacc_mat_01_1 = _mm256_madd_epi16(iacc_mat_01_1, scale_2367_1);
                        iacc_mat_10_1 = _mm256_madd_epi16(iacc_mat_10_1, scale_0145_1);
                        iacc_mat_11_1 = _mm256_madd_epi16(iacc_mat_11_1, scale_2367_1);
3028
                        // Straighten out to make 4 row vectors (4 for each sub block which are accumulated together in the next step)
                        __m256i iacc_row_0_0 = _mm256_blend_epi32(iacc_mat_00_0, _mm256_shuffle_epi32(iacc_mat_01_0, 78), 204);
                        __m256i iacc_row_1_0 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00_0, 78), iacc_mat_01_0, 204);
                        __m256i iacc_row_2_0 = _mm256_blend_epi32(iacc_mat_10_0, _mm256_shuffle_epi32(iacc_mat_11_0, 78), 204);
                        __m256i iacc_row_3_0 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10_0, 78), iacc_mat_11_0, 204);
                        __m256i iacc_row_0_1 = _mm256_blend_epi32(iacc_mat_00_1, _mm256_shuffle_epi32(iacc_mat_01_1, 78), 204);
                        __m256i iacc_row_1_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00_1, 78), iacc_mat_01_1, 204);
                        __m256i iacc_row_2_1 = _mm256_blend_epi32(iacc_mat_10_1, _mm256_shuffle_epi32(iacc_mat_11_1, 78), 204);
                        __m256i iacc_row_3_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10_1, 78), iacc_mat_11_1, 204);

                        __m256i iacc_row_0 = _mm256_add_epi32(iacc_row_0_0, iacc_row_0_1);
                        __m256i iacc_row_1 = _mm256_add_epi32(iacc_row_1_0, iacc_row_1_1);
                        __m256i iacc_row_2 = _mm256_add_epi32(iacc_row_2_0, iacc_row_2_1);
                        __m256i iacc_row_3 = _mm256_add_epi32(iacc_row_3_0, iacc_row_3_1);

                        // Load the scale values of the 4 Q8_K blocks and repeat them across lanes
                        const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
                        const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);

                        // Multiply with the appropriate scales and accumulate (for both d and dmin) below
                        acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                        acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                        acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
                        acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);

                        __m256i iacc_row_min_0 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 0), mins_01);
                        __m256i iacc_row_min_1 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 85), mins_01);
                        __m256i iacc_row_min_2 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 170), mins_01);
                        __m256i iacc_row_min_3 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 255), mins_01);

                        acc_min_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_0), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[rp * 4]);
                        acc_min_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_1), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[rp * 4 + 1]);
                        acc_min_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_2), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[rp * 4 + 2]);
                        acc_min_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_3), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[rp * 4 + 3]);

                    }
                }
            }
            // Store the accumulated values
            for (int i = 0; i < 16; i++) {
                _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), _mm256_sub_ps(acc_rows[i], acc_min_rows[i]));
            }
        }
    }
3073 for (; y < nr / 4; y++) {
3074
3075 const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);
3076
3077 for (int64_t x = xstart; x < nc / 8; x++) {
3078
3079 const block_q4_Kx8 * b_ptr = b_ptr_start + (x * b_nb);
3080
3081 // Master FP accumulators
3082 __m256 acc_rows[4];
3083 for (int i = 0; i < 4; i++) {
3084 acc_rows[i] = _mm256_setzero_ps();
3085 }
3086
3087 __m256 acc_min_rows[4];
3088 for (int i = 0; i < 4; i++) {
3089 acc_min_rows[i] = _mm256_setzero_ps();
3090 }
3091
3092 for (int64_t b = 0; b < nb; b++) {
3093
3094 // Scale values - Load the eight scale values of block_q4_Kx8
3095 const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
3096
3097 // dmin values - Load the eight dmin values of block_q4_Kx8
3098 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);
3099
3100 // Loop to iterate over the eight sub blocks of a super block - two sub blocks are processed per iteration
3101 for (int sb = 0; sb < QK_K / 64; sb++) {
3102
                    // Load the quantized values of the eight block_q4_K columns for the two sub blocks; they are interleaved with each other in chunks of eight bytes - B0,B1 ....B6,B7
                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + sb * 256));
                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32 + sb * 256));
                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64 + sb * 256));
                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96 + sb * 256));
                    const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 128 + sb * 256));
                    const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 160 + sb * 256));
                    const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 192 + sb * 256));
                    const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 224 + sb * 256));
3112
3113 // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
3114 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
3115 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
3116 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
3117 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
3118 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
3119 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
3120 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
3121 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
3122
                    // 4-bit -> 8-bit
                    // First sub block of the two sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_00 = _mm256_and_si256(rhs_raw_mat_0145_0, m4b); //B00(0-7) B01(0-7) B04(0-7) B05(0-7)
                    const __m256i rhs_mat_2367_00 = _mm256_and_si256(rhs_raw_mat_2367_0, m4b); //B02(0-7) B03(0-7) B06(0-7) B07(0-7)

                    const __m256i rhs_mat_0145_01 = _mm256_and_si256(rhs_raw_mat_0145_1, m4b); //B00(8-15) B01(8-15) B04(8-15) B05(8-15)
                    const __m256i rhs_mat_2367_01 = _mm256_and_si256(rhs_raw_mat_2367_1, m4b); //B02(8-15) B03(8-15) B06(8-15) B07(8-15)

                    const __m256i rhs_mat_0145_02 = _mm256_and_si256(rhs_raw_mat_0145_2, m4b); //B00(16-23) B01(16-23) B04(16-23) B05(16-23)
                    const __m256i rhs_mat_2367_02 = _mm256_and_si256(rhs_raw_mat_2367_2, m4b); //B02(16-23) B03(16-23) B06(16-23) B07(16-23)

                    const __m256i rhs_mat_0145_03 = _mm256_and_si256(rhs_raw_mat_0145_3, m4b); //B00(24-31) B01(24-31) B04(24-31) B05(24-31)
                    const __m256i rhs_mat_2367_03 = _mm256_and_si256(rhs_raw_mat_2367_3, m4b); //B02(24-31) B03(24-31) B06(24-31) B07(24-31)

                    // Second sub block of the two sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b); //B10(0-7) B11(0-7) B14(0-7) B15(0-7)
                    const __m256i rhs_mat_2367_10 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b); //B12(0-7) B13(0-7) B16(0-7) B17(0-7)

                    const __m256i rhs_mat_0145_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b); //B10(8-15) B11(8-15) B14(8-15) B15(8-15)
                    const __m256i rhs_mat_2367_11 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b); //B12(8-15) B13(8-15) B16(8-15) B17(8-15)

                    const __m256i rhs_mat_0145_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m4b); //B10(16-23) B11(16-23) B14(16-23) B15(16-23)
                    const __m256i rhs_mat_2367_12 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m4b); //B12(16-23) B13(16-23) B16(16-23) B17(16-23)

                    const __m256i rhs_mat_0145_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m4b); //B10(24-31) B11(24-31) B14(24-31) B15(24-31)
                    const __m256i rhs_mat_2367_13 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m4b); //B12(24-31) B13(24-31) B16(24-31) B17(24-31)
3149
3150 // Shuffle pattern one - right side input
3151 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)
3152 const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3)
3153
3154 const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11)
3155 const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11)
3156
3157 const __m256i rhs_mat_0145_02_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_02, 136); //B00(16-19) B01(16-19) B00(16-19) B01(16-19) B04(16-19) B05(16-19) B04(16-19) B05(16-19)
3158 const __m256i rhs_mat_2367_02_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_02, 136); //B02(16-19) B03(16-19) B02(16-19) B03(16-19) B06(16-19) B07(16-19) B06(16-19) B07(16-19)
3159
3160 const __m256i rhs_mat_0145_03_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_03, 136); //B00(24-27) B01(24-27) B00(24-27) B01(24-27) B04(24-27) B05(24-27) B04(24-27) B05(24-27)
3161 const __m256i rhs_mat_2367_03_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_03, 136); //B02(24-27) B03(24-27) B02(24-27) B03(24-27) B06(24-27) B07(24-27) B06(24-27) B07(24-27)
3162
3163 const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3)
3164 const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3)
3165
3166 const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11)
3167 const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11)
3168
3169 const __m256i rhs_mat_0145_12_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_12, 136); //B10(16-19) B11(16-19) B10(16-19) B11(16-19) B14(16-19) B15(16-19) B14(16-19) B15(16-19)
3170 const __m256i rhs_mat_2367_12_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_12, 136); //B12(16-19) B13(16-19) B12(16-19) B13(16-19) B16(16-19) B17(16-19) B16(16-19) B17(16-19)
3171
3172 const __m256i rhs_mat_0145_13_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_13, 136); //B10(24-27) B11(24-27) B10(24-27) B11(24-27) B14(24-27) B15(24-27) B14(24-27) B15(24-27)
3173 const __m256i rhs_mat_2367_13_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_13, 136); //B12(24-27) B13(24-27) B12(24-27) B13(24-27) B16(24-27) B17(24-27) B16(24-27) B17(24-27)
3174
3175 // Shuffle pattern two - right side input
3176 const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7)
3177 const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7)
3178
3179 const __m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15)
3180 const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15)
3181
3182 const __m256i rhs_mat_0145_02_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_02, 221); //B00(20-23) B01(20-23) B00(20-23) B01(20-23) B04(20-23) B05(20-23) B04(20-23) B05(20-23)
3183 const __m256i rhs_mat_2367_02_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_02, 221); //B02(20-23) B03(20-23) B02(20-23) B03(20-23) B06(20-23) B07(20-23) B06(20-23) B07(20-23)
3184
3185 const __m256i rhs_mat_0145_03_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_03, 221); //B00(28-31) B01(28-31) B00(28-31) B01(28-31) B04(28-31) B05(28-31) B04(28-31) B05(28-31)
3186 const __m256i rhs_mat_2367_03_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_03, 221); //B02(28-31) B03(28-31) B02(28-31) B03(28-31) B06(28-31) B07(28-31) B06(28-31) B07(28-31)
3187
3188 const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7)
3189 const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7)
3190
3191 const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15)
3192 const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15)
3193
3194 const __m256i rhs_mat_0145_12_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_12, 221); //B10(20-23) B11(20-23) B10(20-23) B11(20-23) B14(20-23) B15(20-23) B14(20-23) B15(20-23)
3195 const __m256i rhs_mat_2367_12_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_12, 221); //B12(20-23) B13(20-23) B12(20-23) B13(20-23) B16(20-23) B17(20-23) B16(20-23) B17(20-23)
3196
3197 const __m256i rhs_mat_0145_13_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_13, 221); //B10(28-31) B11(28-31) B10(28-31) B11(28-31) B14(28-31) B15(28-31) B14(28-31) B15(28-31)
3198 const __m256i rhs_mat_2367_13_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_13, 221); //B12(28-31) B13(28-31) B12(28-31) B13(28-31) B16(28-31) B17(28-31) B16(28-31) B17(28-31)
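// Note on the shuffle immediates used above: 136 == 0b10001000 keeps dwords (0,2,0,2) of each 128 bit lane
// and 221 == 0b11011101 keeps dwords (1,3,1,3), so the _sp1 vectors hold the even 4-value groups and the
// _sp2 vectors the odd 4-value groups of each column pair - together they cover all 32 values of a sub block.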
3199
3200 uint32_t utmp_0[4], utmp_1[4];
3201
// Scales and mins of the corresponding sub blocks from the different Q4_K structures are stored together
// Each group stores its packed 6 bit scales and mins in 12 bytes; the kmask shifts below rearrange them so that utmp[0..1] hold the scales and utmp[2..3] hold the mins as bytes
// The block below extracts, for example, the first sub block's scales and mins from the different Q4_K structures for this sb iteration
memcpy(utmp_0, b_ptr[b].scales + 24 * sb, 12);
utmp_0[3] = ((utmp_0[2] >> 4) & kmask2) | (((utmp_0[1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp_0[1] & kmask1;
utmp_0[1] = (utmp_0[2] & kmask2) | (((utmp_0[0] >> 6) & kmask3) << 4);
utmp_0[2] = uaux_0;
utmp_0[0] &= kmask1;

// The block below extracts the second sub block's scales and mins from the different Q4_K structures for this sb iteration
memcpy(utmp_1, b_ptr[b].scales + 12 + sb * 24, 12);
utmp_1[3] = ((utmp_1[2] >> 4) & kmask2) | (((utmp_1[1] >> 6) & kmask3) << 4);
const uint32_t uaux_1 = utmp_1[1] & kmask1;
utmp_1[1] = (utmp_1[2] & kmask2) | (((utmp_1[0] >> 6) & kmask3) << 4);
utmp_1[2] = uaux_1;
utmp_1[0] &= kmask1;

// Scales of first sub block in the sb loop
const __m128i mins_and_scales_0 = _mm_set_epi32(utmp_0[3], utmp_0[2], utmp_0[1], utmp_0[0]);
const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_0, mins_and_scales_0));

// Scales of second sub block in the sb loop
const __m128i mins_and_scales_1 = _mm_set_epi32(utmp_1[3], utmp_1[2], utmp_1[1], utmp_1[0]);
const __m256i scales_1 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(mins_and_scales_1, mins_and_scales_1));

// Mins of first and second sub block of the Q4_K block are arranged side by side
const __m256i mins_01 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(_mm_shuffle_epi32(mins_and_scales_0, 78), _mm_shuffle_epi32(mins_and_scales_1, 78)));
3229
3230 const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68);
3231 const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238);
3232
3233 const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68);
3234 const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238);
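// 68 == 0b01000100 broadcasts dwords (0,1) and 238 == 0b11101110 broadcasts dwords (2,3) of the 16 bit
// scale vectors, giving per-lane scales that line up with the 0145 and 2367 column groups used below.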
3235
// Load the quantized values of the four block_q8_K rows, interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
// Loaded as a set of 128 bit vectors and repeated into 256 bit vectors
__m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 256 * sb)));
__m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0);
__m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17);
__m256i lhs_mat_0123_01 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32 + 256 * sb)));
__m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0);
__m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17);
__m256i lhs_mat_0123_02 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64 + 256 * sb)));
__m256i lhs_mat_01_02 = _mm256_permute2f128_si256(lhs_mat_0123_02, lhs_mat_0123_02, 0);
__m256i lhs_mat_23_02 = _mm256_permute2f128_si256(lhs_mat_0123_02, lhs_mat_0123_02, 17);
__m256i lhs_mat_0123_03 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96 + 256 * sb)));
__m256i lhs_mat_01_03 = _mm256_permute2f128_si256(lhs_mat_0123_03, lhs_mat_0123_03, 0);
__m256i lhs_mat_23_03 = _mm256_permute2f128_si256(lhs_mat_0123_03, lhs_mat_0123_03, 17);
__m256i lhs_mat_0123_10 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 128 + 256 * sb)));
__m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0);
__m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17);
__m256i lhs_mat_0123_11 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 160 + 256 * sb)));
__m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0);
__m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17);
__m256i lhs_mat_0123_12 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 192 + 256 * sb)));
__m256i lhs_mat_01_12 = _mm256_permute2f128_si256(lhs_mat_0123_12, lhs_mat_0123_12, 0);
__m256i lhs_mat_23_12 = _mm256_permute2f128_si256(lhs_mat_0123_12, lhs_mat_0123_12, 17);
__m256i lhs_mat_0123_13 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 224 + 256 * sb)));
__m256i lhs_mat_01_13 = _mm256_permute2f128_si256(lhs_mat_0123_13, lhs_mat_0123_13, 0);
__m256i lhs_mat_23_13 = _mm256_permute2f128_si256(lhs_mat_0123_13, lhs_mat_0123_13, 17);

// Bsums are loaded - four bsums (covering the two sub blocks) are loaded for each of the different Q8_K blocks
__m256i lhs_bsums_0123_01 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].bsums + 16 * sb)));
__m256i lhs_bsums_hsum_0123_01 = _mm256_castsi128_si256(_mm_hadd_epi16(_mm256_castsi256_si128(lhs_bsums_0123_01), _mm256_extractf128_si256(lhs_bsums_0123_01, 1)));
lhs_bsums_hsum_0123_01 = _mm256_permute2x128_si256(lhs_bsums_hsum_0123_01, lhs_bsums_hsum_0123_01, 0);
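// At this point each 16 bit lane of lhs_bsums_hsum_0123_01 should hold the sum of one sub block's quantized
// values for one of the four rows (the horizontal add folds the two 16-value partial sums of a sub block
// into one); these row sums are combined with the mins and dmin further below.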
3267
3268 // Shuffle pattern one - left side input
3269 const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
3270 const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3)
3271
3272 const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
3273 const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11)
3274
3275 const __m256i lhs_mat_01_02_sp1 = _mm256_shuffle_epi32(lhs_mat_01_02, 160); //A00(16-19) A00(16-19) A01(16-19) A01(16-19) A00(16-19) A00(16-19) A01(16-19) A01(16-19)
3276 const __m256i lhs_mat_23_02_sp1 = _mm256_shuffle_epi32(lhs_mat_23_02, 160); //A02(16-19) A03(16-19) A02(16-19) A03(16-19) A02(16-19) A03(16-19) A02(16-19) A03(16-19)
3277
3278 const __m256i lhs_mat_01_03_sp1 = _mm256_shuffle_epi32(lhs_mat_01_03, 160); //A00(24-27) A00(24-27) A01(24-27) A01(24-27) A00(24-27) A00(24-27) A01(24-27) A01(24-27)
3279 const __m256i lhs_mat_23_03_sp1 = _mm256_shuffle_epi32(lhs_mat_23_03, 160); //A02(24-27) A03(24-27) A02(24-27) A03(24-27) A02(24-27) A03(24-27) A02(24-27) A03(24-27)
3280
3281 const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
3282 const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3)
3283
3284 const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
3285 const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11)
3286
3287 const __m256i lhs_mat_01_12_sp1 = _mm256_shuffle_epi32(lhs_mat_01_12, 160); //A10(16-19) A10(16-19) A11(16-19) A11(16-19) A10(16-19) A10(16-19) A11(16-19) A11(16-19)
3288 const __m256i lhs_mat_23_12_sp1 = _mm256_shuffle_epi32(lhs_mat_23_12, 160); //A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19) A12(16-19) A13(16-19)
3289
3290 const __m256i lhs_mat_01_13_sp1 = _mm256_shuffle_epi32(lhs_mat_01_13, 160); //A10(24-27) A10(24-27) A11(24-27) A11(24-27) A10(24-27) A10(24-27) A11(24-27) A11(24-27)
3291 const __m256i lhs_mat_23_13_sp1 = _mm256_shuffle_epi32(lhs_mat_23_13, 160); //A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27) A12(24-27) A13(24-27)
3292
3293 // Shuffle pattern two- left side input
3294 const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
3295 const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7)
3296
3297 const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
3298 const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15)
3299
3300 const __m256i lhs_mat_01_02_sp2 = _mm256_shuffle_epi32(lhs_mat_01_02, 245); //A00(20-23) A00(20-23) A01(20-23) A01(20-23) A00(20-23) A00(20-23) A01(20-23) A01(20-23)
3301 const __m256i lhs_mat_23_02_sp2 = _mm256_shuffle_epi32(lhs_mat_23_02, 245); //A02(20-23) A03(20-23) A02(20-23) A03(20-23) A02(20-23) A03(20-23) A02(20-23) A03(20-23)
3302
3303 const __m256i lhs_mat_01_03_sp2 = _mm256_shuffle_epi32(lhs_mat_01_03, 245); //A00(28-31) A00(28-31) A01(28-31) A01(28-31) A00(28-31) A00(28-31) A01(28-31) A01(28-31)
3304 const __m256i lhs_mat_23_03_sp2 = _mm256_shuffle_epi32(lhs_mat_23_03, 245); //A02(28-31) A03(28-31) A02(28-31) A03(28-31) A02(28-31) A03(28-31) A02(28-31) A03(28-31)
3305
3306 const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
3307 const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7)
3308
3309 const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
3310 const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15)
3311
3312 const __m256i lhs_mat_01_12_sp2 = _mm256_shuffle_epi32(lhs_mat_01_12, 245); //A10(20-23) A10(20-23) A11(20-23) A11(20-23) A10(20-23) A10(20-23) A11(20-23) A11(20-23)
3313 const __m256i lhs_mat_23_12_sp2 = _mm256_shuffle_epi32(lhs_mat_23_12, 245); //A12(20-23) A13(20-23) A12(20-23) A13(20-23) A12(20-23) A13(20-23) A12(20-23) A13(20-23)
3314
3315 const __m256i lhs_mat_01_13_sp2 = _mm256_shuffle_epi32(lhs_mat_01_13, 245); //A10(28-31) A10(28-31) A11(28-31) A11(28-31) A10(28-31) A10(28-31) A11(28-31) A11(28-31)
3316 const __m256i lhs_mat_23_13_sp2 = _mm256_shuffle_epi32(lhs_mat_23_13, 245); //A12(28-31) A13(28-31) A12(28-31) A13(28-31) A12(28-31) A13(28-31) A12(28-31) A13(28-31)
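// For the left hand side 160 == 0b10100000 keeps dwords (0,0,2,2) and 245 == 0b11110101 keeps dwords
// (1,1,3,3), mirroring the right hand side patterns so that each 32 bit lane of an A vector lines up with
// the matching 4-value group of a B vector in the maddubs step below.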
3317
// The values arranged in the shuffle patterns are combined with a dot product style operation: within each 32 bit lane the corresponding bytes are multiplied and the adjacent products are summed into 16 bit integers by _mm256_maddubs_epi16
__m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp1, lhs_mat_01_03_sp1), _mm256_maddubs_epi16(rhs_mat_0145_02_sp1, lhs_mat_01_02_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_01_01_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_01_00_sp1));
__m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp1, lhs_mat_01_03_sp1), _mm256_maddubs_epi16(rhs_mat_2367_02_sp1, lhs_mat_01_02_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_01_01_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_01_00_sp1));
__m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp1, lhs_mat_23_03_sp1), _mm256_maddubs_epi16(rhs_mat_0145_02_sp1, lhs_mat_23_02_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp1, lhs_mat_23_01_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp1, lhs_mat_23_00_sp1));
__m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp1, lhs_mat_23_03_sp1), _mm256_maddubs_epi16(rhs_mat_2367_02_sp1, lhs_mat_23_02_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp1, lhs_mat_23_01_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp1, lhs_mat_23_00_sp1));
__m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp1, lhs_mat_01_13_sp1), _mm256_maddubs_epi16(rhs_mat_0145_12_sp1, lhs_mat_01_12_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_01_11_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_01_10_sp1));
__m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp1, lhs_mat_01_13_sp1), _mm256_maddubs_epi16(rhs_mat_2367_12_sp1, lhs_mat_01_12_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_01_11_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_01_10_sp1));
__m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp1, lhs_mat_23_13_sp1), _mm256_maddubs_epi16(rhs_mat_0145_12_sp1, lhs_mat_23_12_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp1, lhs_mat_23_11_sp1)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp1, lhs_mat_23_10_sp1));
__m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp1, lhs_mat_23_13_sp1), _mm256_maddubs_epi16(rhs_mat_2367_12_sp1, lhs_mat_23_12_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp1, lhs_mat_23_11_sp1)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp1, lhs_mat_23_10_sp1));

__m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp2, lhs_mat_01_03_sp2), _mm256_maddubs_epi16(rhs_mat_0145_02_sp2, lhs_mat_01_02_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_01_01_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_01_00_sp2));
__m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp2, lhs_mat_01_03_sp2), _mm256_maddubs_epi16(rhs_mat_2367_02_sp2, lhs_mat_01_02_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_01_01_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_01_00_sp2));
__m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_03_sp2, lhs_mat_23_03_sp2), _mm256_maddubs_epi16(rhs_mat_0145_02_sp2, lhs_mat_23_02_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_01_sp2, lhs_mat_23_01_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_00_sp2, lhs_mat_23_00_sp2));
__m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_03_sp2, lhs_mat_23_03_sp2), _mm256_maddubs_epi16(rhs_mat_2367_02_sp2, lhs_mat_23_02_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_01_sp2, lhs_mat_23_01_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_00_sp2, lhs_mat_23_00_sp2));
__m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp2, lhs_mat_01_13_sp2), _mm256_maddubs_epi16(rhs_mat_0145_12_sp2, lhs_mat_01_12_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_01_11_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_01_10_sp2));
__m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp2, lhs_mat_01_13_sp2), _mm256_maddubs_epi16(rhs_mat_2367_12_sp2, lhs_mat_01_12_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_01_11_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_01_10_sp2));
__m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_0145_13_sp2, lhs_mat_23_13_sp2), _mm256_maddubs_epi16(rhs_mat_0145_12_sp2, lhs_mat_23_12_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_11_sp2, lhs_mat_23_11_sp2)), _mm256_maddubs_epi16(rhs_mat_0145_10_sp2, lhs_mat_23_10_sp2));
__m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_add_epi16(_mm256_maddubs_epi16(rhs_mat_2367_13_sp2, lhs_mat_23_13_sp2), _mm256_maddubs_epi16(rhs_mat_2367_12_sp2, lhs_mat_23_12_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_11_sp2, lhs_mat_23_11_sp2)), _mm256_maddubs_epi16(rhs_mat_2367_10_sp2, lhs_mat_23_10_sp2));
3336
// The outputs of both shuffle patterns are added together so the dot products cover all 32 values of the sub block
__m256i iacc_mat_00_0 = _mm256_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
__m256i iacc_mat_01_0 = _mm256_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
__m256i iacc_mat_10_0 = _mm256_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
__m256i iacc_mat_11_0 = _mm256_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);

__m256i iacc_mat_00_1 = _mm256_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
__m256i iacc_mat_01_1 = _mm256_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
__m256i iacc_mat_10_1 = _mm256_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
__m256i iacc_mat_11_1 = _mm256_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);

// Multiply the summed 16 bit dot products with the corresponding sub block scales and widen to 32 bit integers
iacc_mat_00_0 = _mm256_madd_epi16(iacc_mat_00_0, scale_0145_0);
iacc_mat_01_0 = _mm256_madd_epi16(iacc_mat_01_0, scale_2367_0);
iacc_mat_10_0 = _mm256_madd_epi16(iacc_mat_10_0, scale_0145_0);
iacc_mat_11_0 = _mm256_madd_epi16(iacc_mat_11_0, scale_2367_0);

iacc_mat_00_1 = _mm256_madd_epi16(iacc_mat_00_1, scale_0145_1);
iacc_mat_01_1 = _mm256_madd_epi16(iacc_mat_01_1, scale_2367_1);
iacc_mat_10_1 = _mm256_madd_epi16(iacc_mat_10_1, scale_0145_1);
iacc_mat_11_1 = _mm256_madd_epi16(iacc_mat_11_1, scale_2367_1);
3358
3359 // Straighten out to make 4 row vectors (4 for each sub block which are accumulated together in the next step)
3360 __m256i iacc_row_0_0 = _mm256_blend_epi32(iacc_mat_00_0, _mm256_shuffle_epi32(iacc_mat_01_0, 78), 204);
3361 __m256i iacc_row_1_0 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00_0, 78), iacc_mat_01_0, 204);
3362 __m256i iacc_row_2_0 = _mm256_blend_epi32(iacc_mat_10_0, _mm256_shuffle_epi32(iacc_mat_11_0, 78), 204);
3363 __m256i iacc_row_3_0 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10_0, 78), iacc_mat_11_0, 204);
3364 __m256i iacc_row_0_1 = _mm256_blend_epi32(iacc_mat_00_1, _mm256_shuffle_epi32(iacc_mat_01_1, 78), 204);
3365 __m256i iacc_row_1_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00_1, 78), iacc_mat_01_1, 204);
3366 __m256i iacc_row_2_1 = _mm256_blend_epi32(iacc_mat_10_1, _mm256_shuffle_epi32(iacc_mat_11_1, 78), 204);
3367 __m256i iacc_row_3_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10_1, 78), iacc_mat_11_1, 204);
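// The shuffle immediate 78 == 0b01001110 swaps the two dword pairs of each 128 bit lane and the blend mask
// 204 == 0b11001100 then interleaves the results of the 0145 and 2367 column groups, turning the tiled
// accumulators into plain per-row vectors of eight column sums.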
3368
__m256i iacc_row_0 = _mm256_add_epi32(iacc_row_0_0, iacc_row_0_1);
__m256i iacc_row_1 = _mm256_add_epi32(iacc_row_1_0, iacc_row_1_1);
__m256i iacc_row_2 = _mm256_add_epi32(iacc_row_2_0, iacc_row_2_1);
__m256i iacc_row_3 = _mm256_add_epi32(iacc_row_3_0, iacc_row_3_1);

// Load the scale values for all 4 Q8_K blocks and repeat them across lanes
const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);

// Multiply with the appropriate scales and accumulate (for both d and dmin) below
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);

__m256i iacc_row_min_0 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 0), mins_01);
__m256i iacc_row_min_1 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 85), mins_01);
__m256i iacc_row_min_2 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 170), mins_01);
__m256i iacc_row_min_3 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_hsum_0123_01, 255), mins_01);

acc_min_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_0), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[0]);
acc_min_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_1), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[1]);
acc_min_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_2), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[2]);
acc_min_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_min_3), _mm256_mul_ps(col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
3393 }
3394 }
3395
3396 // Store the accumulated values
3397 for (int i = 0; i < 4; i++) {
_mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), _mm256_sub_ps(acc_rows[i], acc_min_rows[i]));
3399 }
3400 }
3401 }
3402
3403#else
3404 UNUSED(kmask1);
3405 UNUSED(kmask2);
3406 UNUSED(kmask3);
3407 ggml_gemm_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
3408#endif
3409}
3410
3411void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
3412#if defined(__AVX2__) || defined(__AVX512F__)
3413 {
__m256i signextendlut = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)kvalues_iq4nl));
3415 signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
3416
3417 gemm_q4_b32_8x8_q8_0_lut_avx<block_iq4_nlx8>(n, s, bs, vx, vy, nr, nc, signextendlut);
3418
3419 return;
3420 }
3421#endif // defined(__AVX2__) || defined(__AVX512F__)
3422
3423 ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
3424}
3425
3426void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
3427 const int qk = QK_K;
3428 const int nb = n / qk;
3429 const int ncols_interleaved = 8;
3430 const int blocklen = 8;
3431
3432 assert (n % qk == 0);
3433 assert (nr % 4 == 0);
3434 assert (nc % ncols_interleaved == 0);
3435
3436 UNUSED(s);
3437 UNUSED(bs);
3438 UNUSED(vx);
3439 UNUSED(vy);
3440 UNUSED(nr);
3441 UNUSED(nc);
3442 UNUSED(nb);
3443 UNUSED(ncols_interleaved);
3444 UNUSED(blocklen);
3445
3446#if defined(__AVX2__) || defined(__AVX512F__)
3447 const block_q2_Kx8 * b_ptr_start = (const block_q2_Kx8 * ) vx;
3448 const block_q8_Kx4 * a_ptr_start = (const block_q8_Kx4 * ) vy;
3449 int64_t b_nb = n / QK_K;
3450 int64_t y = 0;
3451
3452 // Permute mask used for easier vector processing at later stages
__m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
3454 int64_t xstart = 0;
3455 int anr = nr - nr % 16; // Used to align nr with boundary of 16
3456
// Masks used to convert the packed 2 bit and 4 bit values into bytes
const __m256i m3b = _mm256_set1_epi8(3);
const __m128i m4b_sse = _mm_set1_epi8(0xF);
3460
// Masks to get the appropriate scales
__m128i scalesmask1_sse = _mm_set_epi8(14, 14, 12, 12, 10, 10, 8, 8, 6, 6, 4, 4, 2, 2, 0, 0);
__m128i scalesmask2_sse = _mm_set_epi8(15, 15, 13, 13, 11, 11, 9, 9, 7, 7, 5, 5, 3, 3, 1, 1);
3464
__m256i scalesmask1 = _mm256_castsi128_si256(scalesmask1_sse);
3466 scalesmask1 = _mm256_permute2f128_si256(scalesmask1, scalesmask1, 0);
__m256i scalesmask2 = _mm256_castsi128_si256(scalesmask2_sse);
3468 scalesmask2 = _mm256_permute2f128_si256(scalesmask2, scalesmask2, 0);
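// scalesmask1 duplicates the even indexed bytes in place (0,0,2,2,...,14,14) and scalesmask2 the odd indexed
// bytes (1,1,3,3,...,15,15); they are used below to spread each sub block's per-column scales pairwise
// before widening them to 16 bits.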
3469
3470#ifdef __AVX512F__
3471
3472 int anc = nc - nc % 16; // Used to align nc with boundary of 16
3473
3474 // Mask to mask out nibbles from packed bytes
3475 const __m256i m4b = _mm256_set1_epi8(0x0F);
// Mask to extract the 2 bit values from packed bytes, expanded to 512 bit length
3477 const __m512i m3bexpanded = _mm512_set1_epi8(3);
// Take a group of four block_q8_Kx4 structures at each pass of the loop and perform the dot product operation
3479 for (; y < anr / 4; y += 4) {
3480
3481 const block_q8_Kx4 * a_ptrs[4];
3482
3483 a_ptrs[0] = a_ptr_start + (y * nb);
3484 for (int i = 0; i < 3; ++i) {
3485 a_ptrs[i + 1] = a_ptrs[i] + nb;
3486 }
3487
// Take two block_q2_Kx8 structures at each pass of the loop and perform the dot product operation
3489 for (int64_t x = 0; x < anc / 8; x += 2) {
3490
3491 const block_q2_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
3492 const block_q2_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
3493
3494 // Master FP accumulators
3495 __m512 acc_rows[16];
3496 for (int i = 0; i < 16; i++) {
3497 acc_rows[i] = _mm512_setzero_ps();
3498 }
3499
3500 __m512 acc_min_rows[16];
3501 for (int i = 0; i < 16; i++) {
3502 acc_min_rows[i] = _mm512_setzero_ps();
3503 }
3504 // For super block
3505 for (int64_t b = 0; b < nb; b++) {
// Delta values - Load the sixteen scale values from the two block_q2_Kx8 structures
3507 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
3508
// dmin values - Load the sixteen dmin values from the two block_q2_Kx8 structures
3510 const __m512 col_dmin_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].dmin, b_ptr_1[b].dmin);
3511
3512 // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration
3513 for (int sb = 0; sb < QK_K / 128; sb++) {
3514
// Load the quantized values of eight sub blocks from the block_q2_Kx8 structure - the eight columns are interleaved with each other in chunks of eight bytes - B0,B1 ... B6,B7
3516 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + sb * 256));
3517 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 32 + sb * 256));
3518 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 64 + sb * 256));
3519 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 96 + sb * 256));
3520 const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 128 + sb * 256));
3521 const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 160 + sb * 256));
3522 const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 192 + sb * 256));
3523 const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 224 + sb * 256));
3524
3525 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + sb * 256));
3526 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 32 + sb * 256));
3527 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 64 + sb * 256));
3528 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 96 + sb * 256));
3529 const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 128 + sb * 256));
3530 const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 160 + sb * 256));
3531 const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 192 + sb * 256));
3532 const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 224 + sb * 256));
3533
3534 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
3535 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
3536 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
3537 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
3538 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
3539 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
3540 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
3541 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
3542
3543 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
3544 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
3545 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
3546 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
3547 const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240);
3548 const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240);
3549 const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240);
3550 const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240);
3551
3552 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
3553 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
3554 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
3555 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
3556
3557 const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1);
3558 const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1);
3559 const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1);
3560 const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1);
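// Each 512 bit register now stacks the corresponding 256 bit halves of the two block_q2_Kx8 structures,
// so the sixteen interleaved columns (0-7 and 8-F) are processed together from here on.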
3561
3562 //2-bit -> 8-bit
3563 const __m512i rhs_mat_014589CD_00 = _mm512_and_si512(rhs_raw_mat_014589CD_0,m3bexpanded); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) B08(0-7) B09(0-7) B0C(0-7) B0D(0-7)
3564 const __m512i rhs_mat_2367ABEF_00 = _mm512_and_si512(rhs_raw_mat_2367ABEF_0,m3bexpanded); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) B0A(0-7) B0B(0-7) B0E(0-7) B0F(0-7)
3565 const __m512i rhs_mat_014589CD_01 = _mm512_and_si512(rhs_raw_mat_014589CD_1,m3bexpanded); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) B08(8-15) B09(8-15) B0C(8-15) B0D(8-15)
3566 const __m512i rhs_mat_2367ABEF_01 = _mm512_and_si512(rhs_raw_mat_2367ABEF_1,m3bexpanded); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) B0A(8-15) B0B(8-15) B0E(8-15) B0F(8-15)
3567 const __m512i rhs_mat_014589CD_10 = _mm512_and_si512(rhs_raw_mat_014589CD_2,m3bexpanded); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) B18(0-7) B19(0-7) B1C(0-7) B1D(0-7)
3568 const __m512i rhs_mat_2367ABEF_10 = _mm512_and_si512(rhs_raw_mat_2367ABEF_2,m3bexpanded); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) B1A(0-7) B1B(0-7) B1E(0-7) B1F(0-7)
3569 const __m512i rhs_mat_014589CD_11 = _mm512_and_si512(rhs_raw_mat_014589CD_3,m3bexpanded); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) B18(8-15) B19(8-15) B1C(8-15) B1D(8-15)
3570 const __m512i rhs_mat_2367ABEF_11 = _mm512_and_si512(rhs_raw_mat_2367ABEF_3,m3bexpanded); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) B1A(8-15) B1B(8-15) B1E(8-15) B1F(8-15)
3571
3572 const __m512i rhs_mat_014589CD_20 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 2), m3bexpanded); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) B28(0-7) B29(0-7) B2C(0-7) B2D(0-7)
3573 const __m512i rhs_mat_2367ABEF_20 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 2), m3bexpanded); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) B2A(0-7) B2B(0-7) B2E(0-7) B2F(0-7)
3574
3575 const __m512i rhs_mat_014589CD_21 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 2), m3bexpanded); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) B28(8-15) B29(8-15) B2C(8-15) B2D(8-15)
3576 const __m512i rhs_mat_2367ABEF_21 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 2), m3bexpanded); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) B2A(8-15) B2B(8-15) B2E(8-15) B2F(8-15)
3577
3578 const __m512i rhs_mat_014589CD_30 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 2), m3bexpanded); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) B38(0-7) B39(0-7) B3C(0-7) B3D(0-7)
3579 const __m512i rhs_mat_2367ABEF_30 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 2), m3bexpanded); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) B3A(0-7) B3B(0-7) B3E(0-7) B3F(0-7)
3580
3581 const __m512i rhs_mat_014589CD_31 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 2), m3bexpanded); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) B38(8-15) B39(8-15) B3C(8-15) B3D(8-15)
3582 const __m512i rhs_mat_2367ABEF_31 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 2), m3bexpanded); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) B3A(8-15) B3B(8-15) B3E(8-15) B3F(8-15)
3583
3584 const __m512i rhs_mat_014589CD_40 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m3bexpanded); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) B48(0-7) B49(0-7) B4C(0-7) B4D(0-7)
3585 const __m512i rhs_mat_2367ABEF_40 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m3bexpanded); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) B4A(0-7) B4B(0-7) B4E(0-7) B4F(0-7)
3586
3587 const __m512i rhs_mat_014589CD_41 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m3bexpanded); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) B48(8-15) B49(8-15) B4C(8-15) B4D(8-15)
3588 const __m512i rhs_mat_2367ABEF_41 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m3bexpanded); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) B4A(8-15) B4B(8-15) B4E(8-15) B4F(8-15)
3589
3590 const __m512i rhs_mat_014589CD_50 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m3bexpanded); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) B58(0-7) B59(0-7) B5C(0-7) B5D(0-7)
3591 const __m512i rhs_mat_2367ABEF_50 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m3bexpanded); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) B5A(0-7) B5B(0-7) B5E(0-7) B5F(0-7)
3592
3593 const __m512i rhs_mat_014589CD_51 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m3bexpanded); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) B58(8-15) B59(8-15) B5C(8-15) B5D(8-15)
3594 const __m512i rhs_mat_2367ABEF_51 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m3bexpanded); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) B5A(8-15) B5B(8-15) B5E(8-15) B5F(8-15)
3595
3596 const __m512i rhs_mat_014589CD_60 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 6), m3bexpanded); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) B68(0-7) B69(0-7) B6C(0-7) B6D(0-7)
3597 const __m512i rhs_mat_2367ABEF_60 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 6), m3bexpanded); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) B6A(0-7) B6B(0-7) B6E(0-7) B6F(0-7)
3598
3599 const __m512i rhs_mat_014589CD_61 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 6), m3bexpanded); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) B68(8-15) B69(8-15) B6C(8-15) B6D(8-15)
3600 const __m512i rhs_mat_2367ABEF_61 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 6), m3bexpanded); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) B6A(8-15) B6B(8-15) B6E(8-15) B6F(8-15)
3601
3602 const __m512i rhs_mat_014589CD_70 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 6), m3bexpanded); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) B78(0-7) B79(0-7) B7C(0-7) B7D(0-7)
3603 const __m512i rhs_mat_2367ABEF_70 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 6), m3bexpanded); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) B7A(0-7) B7B(0-7) B7E(0-7) B7F(0-7)
3604
3605 const __m512i rhs_mat_014589CD_71 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 6), m3bexpanded); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) B78(8-15) B79(8-15) B7C(8-15) B7D(8-15)
3606 const __m512i rhs_mat_2367ABEF_71 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 6), m3bexpanded); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) B7A(8-15) B7B(8-15) B7E(8-15) B7F(8-15)
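// Every byte of the raw data packs four 2 bit values; shifting right by 0, 2, 4 and 6 and masking with
// m3bexpanded extracts the values of sub blocks 0-1, 2-3, 4-5 and 6-7 respectively.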
3607
3608 const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3)
3609 const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3)
3610
3611 const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11)
3612 const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11)
3613
3614 const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3)
3615 const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3)
3616
3617 const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11)
3618 const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11)
3619
3620 const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) B28(0-3) B29(0-3) B28(0-3) B29(0-3) B2C(0-3) B2D(0-3) B2C(0-3) B2D(0-3)
3621 const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) B2A(0-3) B2B(0-3) B2A(0-3) B2B(0-3) B2E(0-3) B2F(0-3) B2E(0-3) B2F(0-3)
3622
3623 const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) B28(8-11) B29(8-11) B28(8-11) B29(8-11) B2C(8-11) B2D(8-11) B2C(8-11) B2D(8-11)
3624 const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) B2A(8-11) B2B(8-11) B2A(8-11) B2B(8-11) B2E(8-11) B2F(8-11) B2E(8-11) B2F(8-11)
3625
3626 const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); ///B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) B38(0-3) B39(0-3) B38(0-3) B39(0-3) B3C(0-3) B3D(0-3) B3C(0-3) B3D(0-3)
3627 const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) B3A(0-3) B3B(0-3) B3A(0-3) B3B(0-3) B3E(0-3) B3F(0-3) B3E(0-3) B3F(0-3)
3628
3629 const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11) B38(8-11) B39(8-11) B38(8-11) B39(8-11) B3C(8-11) B3D(8-11) B3C(8-11) B3D(8-11)
3630 const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) B3A(8-11) B3B(8-11) B3A(8-11) B3B(8-11) B3E(8-11) B3F(8-11) B3E(8-11) B3F(8-11)
3631
3632 const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) B48(0-3) B49(0-3) B48(0-3) B49(0-3) B4C(0-3) B4D(0-3) B4C(0-3) B4D(0-3)
3633 const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) B4A(0-3) B4B(0-3) B4A(0-3) B4B(0-3) B4E(0-3) B4F(0-3) B4E(0-3) B4F(0-3)
3634
3635 const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) B48(8-11) B49(8-11) B48(8-11) B49(8-11) B4C(8-11) B4D(8-11) B4C(8-11) B4D(8-11)
3636 const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) B4A(8-11) B4B(8-11) B4A(8-11) B4B(8-11) B4E(8-11) B4F(8-11) B4E(8-11) B4F(8-11)
3637
3638 const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) B58(0-3) B59(0-3) B58(0-3) B59(0-3) B5C(0-3) B5D(0-3) B5C(0-3) B5D(0-3)
3639 const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) B5A(0-3) B5B(0-3) B5A(0-3) B5B(0-3) B5E(0-3) B5F(0-3) B5E(0-3) B5F(0-3)
3640
3641 const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) B58(8-11) B59(8-11) B58(8-11) B59(8-11) B5C(8-11) B5D(8-11) B5C(8-11) B5D(8-11)
3642 const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) B5A(8-11) B5B(8-11) B5A(8-11) B5B(8-11) B5E(8-11) B5F(8-11) B5E(8-11) B5F(8-11)
3643
3644 const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) B68(0-3) B69(0-3) B68(0-3) B69(0-3) B6C(0-3) B6D(0-3) B6C(0-3) B6D(0-3)
3645 const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) B6A(0-3) B6B(0-3) B6A(0-3) B6B(0-3) B6E(0-3) B6F(0-3) B6E(0-3) B6F(0-3)
3646
3647 const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) B68(8-11) B69(8-11) B68(8-11) B69(8-11) B6C(8-11) B6D(8-11) B6C(8-11) B6D(8-11)
3648 const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) B6A(8-11) B6B(8-11) B6A(8-11) B6B(8-11) B6E(8-11) B6F(8-11) B6E(8-11) B6F(8-11)
3649
3650 const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) B78(0-3) B79(0-3) B78(0-3) B79(0-3) B7C(0-3) B7D(0-3) B7C(0-3) B7D(0-3)
3651 const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) B7A(0-3) B7B(0-3) B7A(0-3) B7B(0-3) B7E(0-3) B7F(0-3) B7E(0-3) B7F(0-3)
3652
const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) B78(8-11) B79(8-11) B78(8-11) B79(8-11) B7C(8-11) B7D(8-11) B7C(8-11) B7D(8-11)
3654 const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) B7A(8-11) B7B(8-11) B7A(8-11) B7B(8-11) B7E(8-11) B7F(8-11) B7E(8-11) B7F(8-11)
3655
3656 const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7)
3657 const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7)
3658
3659 const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15)
3660 const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15)
3661
3662 const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7)
3663 const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7)
3664
3665 const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15)
3666 const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15)
3667
3668 const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) B28(4-7) B29(4-7) B28(4-7) B29(4-7) B2C(4-7) B2D(4-7) B2C(4-7) B2D(4-7)
3669 const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) B2A(4-7) B2B(4-7) B2A(4-7) B2B(4-7) B2E(4-7) B2F(4-7) B2E(4-7) B2F(4-7)
3670
3671 const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) B28(12-15) B29(12-15) B28(12-15) B29(12-15) B2C(12-15) B2D(12-15) B2C(12-15) B2D(12-15)
3672 const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) B2A(12-15) B2B(12-15) B2A(12-15) B2B(12-15) B2E(12-15) B2F(12-15) B2E(12-15) B2F(12-15)
3673
3674 const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) B38(4-7) B39(4-7) B38(4-7) B39(4-7) B3C(4-7) B3D(4-7) B3C(4-7) B3D(4-7)
3675 const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) B3A(4-7) B3B(4-7) B3A(4-7) B3B(4-7) B3E(4-7) B3F(4-7) B3E(4-7) B3F(4-7)
3676
3677 const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) B38(12-15) B39(12-15) B38(12-15) B39(12-15) B3C(12-15) B3D(12-15) B3C(12-15) B3D(12-15)
3678 const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) B3A(12-15) B3B(12-15) B3A(12-15) B3B(12-15) B3E(12-15) B3F(12-15) B3E(12-15) B3F(12-15)
3679
3680 const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) B48(4-7) B49(4-7) B48(4-7) B49(4-7) B4C(4-7) B4D(4-7) B4C(4-7) B4D(4-7)
3681 const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) B4A(4-7) B4B(4-7) B4A(4-7) B4B(4-7) B4E(4-7) B4F(4-7) B4E(4-7) B4F(4-7)
3682
3683 const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) B48(12-15) B49(12-15) B48(12-15) B49(12-15) B4C(12-15) B4D(12-15) B4C(12-15) B4D(12-15)
3684 const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) B4A(12-15) B4B(12-15) B4A(12-15) B4B(12-15) B4E(12-15) B4F(12-15) B4E(12-15) B4F(12-15)
3685
3686 const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) B58(4-7) B59(4-7) B58(4-7) B59(4-7) B5C(4-7) B5D(4-7) B5C(4-7) B5D(4-7)
3687 const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) B5A(4-7) B5B(4-7) B5A(4-7) B5B(4-7) B5E(4-7) B5F(4-7) B5E(4-7) B5F(4-7)
3688
3689 const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) B58(12-15) B59(12-15) B58(12-15) B59(12-15) B5C(12-15) B5D(12-15) B5C(12-15) B5D(12-15)
3690 const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) B5A(12-15) B5B(12-15) B5A(12-15) B5B(12-15) B5E(12-15) B5F(12-15) B5E(12-15) B5F(12-15)
3691
3692 const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) B68(4-7) B69(4-7) B68(4-7) B69(4-7) B6C(4-7) B6D(4-7) B6C(4-7) B6D(4-7)
3693 const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) B6A(4-7) B6B(4-7) B6A(4-7) B6B(4-7) B6E(4-7) B6F(4-7) B6E(4-7) B6F(4-7)
3694
3695 const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) B68(12-15) B69(12-15) B68(12-15) B69(12-15) B6C(12-15) B6D(12-15) B6C(12-15) B6D(12-15)
3696 const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) B6A(12-15) B6B(12-15) B6A(12-15) B6B(12-15) B6E(12-15) B6F(12-15) B6E(12-15) B6F(12-15)
3697
3698 const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) B78(4-7) B79(4-7) B78(4-7) B79(4-7) B7C(4-7) B7D(4-7) B7C(4-7) B7D(4-7)
3699 const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) B7A(4-7) B7B(4-7) B7A(4-7) B7B(4-7) B7E(4-7) B7F(4-7) B7E(4-7) B7F(4-7)
3700
3701 const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) B78(12-15) B79(12-15) B78(12-15) B79(12-15) B7C(12-15) B7D(12-15) B7C(12-15) B7D(12-15)
3702 const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) B7A(12-15) B7B(12-15) B7A(12-15) B7B(12-15) B7E(12-15) B7F(12-15) B7E(12-15) B7F(12-15)
3703
3704 // Notation: superblock subblock
3705 //s00 m00 s01 m01 s10 m10 s11 m11 s20 m20 s21 m21 s30 m30 s31 m31 s40 m40 s41 m41 s50 m50 s51 m51 s60 m60 s61 m61 s70 m70 s71 m71
3706
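// Each Q2_K sub block stores its 4-bit scale in the low nibble and its 4-bit min in the high nibble of a single byte;
// the repacked block_q2_Kx8 interleaves these bytes across the eight columns, so one sb iteration reads 64 bytes per structure (8 sub blocks x 8 columns)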
3707 const __m128i mins_and_scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64));
3708 const __m128i mins_and_scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64));
3709 const __m128i mins_and_scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64));
3710 const __m128i mins_and_scales_67_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 48 + sb * 64));
3711
3712 const __m128i mins_and_scales_01_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + sb * 64));
3713 const __m128i mins_and_scales_23_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 16 + sb * 64));
3714 const __m128i mins_and_scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64));
3715 const __m128i mins_and_scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64));
3716
3717 // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
3718 const __m256i mins_and_scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_01_0), mins_and_scales_01_1, 1);
3719 const __m256i mins_and_scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_23_0), mins_and_scales_23_1, 1);
3720 const __m256i mins_and_scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_45_0), mins_and_scales_45_1, 1);
3721 const __m256i mins_and_scales_67 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_67_0), mins_and_scales_67_1, 1);
3722
3723 // Extract the scales, which are the lower nibble of each byte of mins_and_scales
3724 const __m256i scales_01 = _mm256_and_si256(mins_and_scales_01, m4b);
3725 const __m256i scales_23 = _mm256_and_si256(mins_and_scales_23, m4b);
3726 const __m256i scales_45 = _mm256_and_si256(mins_and_scales_45, m4b);
3727 const __m256i scales_67 = _mm256_and_si256(mins_and_scales_67, m4b);
3728
3729 // Extract the mins, which are the upper nibble of each byte of mins_and_scales, and widen them to 16 bits for the later _mm512_madd_epi16 against the Q8_K bsums
3730 const __m512i mins_01 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_01, 4), m4b));
3731 const __m512i mins_23 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_23, 4), m4b));
3732 const __m512i mins_45 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_45, 4), m4b));
3733 const __m512i mins_67 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_67, 4), m4b));
3734
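// scalesmask1 / scalesmask2 (defined earlier) are assumed to separate and replicate the scale bytes of the two sub blocks in each pair,
// so each widened scales_N vector carries only sub block N's scales, laid out to match the dot-product lanes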
3735 const __m512i scales_0 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask1));
3736 const __m512i scales_1 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_01,scalesmask2));
3737 const __m512i scales_2 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask1));
3738 const __m512i scales_3 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_23,scalesmask2));
3739 const __m512i scales_4 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask1));
3740 const __m512i scales_5 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_45,scalesmask2));
3741 const __m512i scales_6 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask1));
3742 const __m512i scales_7 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_67,scalesmask2));
3743
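// Shuffle immediate 68 (0b01'00'01'00) repeats dwords 0,1 of every 128-bit lane and 238 (0b11'10'11'10) repeats dwords 2,3,
// aligning the scales with the 014589CD / 2367ABEF column interleave of the RHS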
3744 const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
3745 const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
3746
3747 const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
3748 const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
3749
3750 const __m512i scale_014589CD_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)68);
3751 const __m512i scale_2367ABEF_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)238);
3752
3753 const __m512i scale_014589CD_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)68);
3754 const __m512i scale_2367ABEF_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)238);
3755
3756 const __m512i scale_014589CD_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)68);
3757 const __m512i scale_2367ABEF_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)238);
3758
3759 const __m512i scale_014589CD_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)68);
3760 const __m512i scale_2367ABEF_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)238);
3761
3762 const __m512i scale_014589CD_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)68);
3763 const __m512i scale_2367ABEF_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)238);
3764
3765 const __m512i scale_014589CD_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)68);
3766 const __m512i scale_2367ABEF_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)238);
3767
3768
3769 for (int rp = 0; rp < 4; rp++) {
3770
3771 // Load the quantized values of the four Q8_K rows, interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
3772 // Each 128-bit half (rows 0-1 and rows 2-3) of the loaded 256-bit vector is duplicated within a 256-bit vector and then broadcast again into a 512-bit vector
3773 __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 512 * sb)));
3774 __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0);
3775 __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17);
3776 __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 32 + 512 * sb)));
3777 __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0);
3778 __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17);
3779 __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 64 + 512 * sb)));
3780 __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0);
3781 __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17);
3782 __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 96 + 512 * sb)));
3783 __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0);
3784 __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17);
3785 __m256i lhs_mat_ymm_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 128 + 512 * sb)));
3786 __m256i lhs_mat_ymm_01_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 0);
3787 __m256i lhs_mat_ymm_23_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 17);
3788 __m256i lhs_mat_ymm_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 160 + 512 * sb)));
3789 __m256i lhs_mat_ymm_01_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 0);
3790 __m256i lhs_mat_ymm_23_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 17);
3791 __m256i lhs_mat_ymm_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 192 + 512 * sb)));
3792 __m256i lhs_mat_ymm_01_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 0);
3793 __m256i lhs_mat_ymm_23_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 17);
3794 __m256i lhs_mat_ymm_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 224 + 512 * sb)));
3795 __m256i lhs_mat_ymm_01_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 0);
3796 __m256i lhs_mat_ymm_23_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 17);
3797
3798 __m256i lhs_mat_ymm_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 256 + 512 * sb)));
3799 __m256i lhs_mat_ymm_01_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 0);
3800 __m256i lhs_mat_ymm_23_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 17);
3801 __m256i lhs_mat_ymm_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 288 + 512 * sb)));
3802 __m256i lhs_mat_ymm_01_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 0);
3803 __m256i lhs_mat_ymm_23_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 17);
3804 __m256i lhs_mat_ymm_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 320 + 512 * sb)));
3805 __m256i lhs_mat_ymm_01_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 0);
3806 __m256i lhs_mat_ymm_23_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 17);
3807 __m256i lhs_mat_ymm_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 352 + 512 * sb)));
3808 __m256i lhs_mat_ymm_01_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 0);
3809 __m256i lhs_mat_ymm_23_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 17);
3810 __m256i lhs_mat_ymm_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 384 + 512 * sb)));
3811 __m256i lhs_mat_ymm_01_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 0);
3812 __m256i lhs_mat_ymm_23_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 17);
3813 __m256i lhs_mat_ymm_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 416 + 512 * sb)));
3814 __m256i lhs_mat_ymm_01_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 0);
3815 __m256i lhs_mat_ymm_23_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 17);
3816 __m256i lhs_mat_ymm_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 448 + 512 * sb)));
3817 __m256i lhs_mat_ymm_01_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 0);
3818 __m256i lhs_mat_ymm_23_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 17);
3819 __m256i lhs_mat_ymm_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 480 + 512 * sb)));
3820 __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0);
3821 __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17);
3822
3823
3824 __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1);
3825 __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1);
3826 __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1);
3827 __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1);
3828
3829 __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1);
3830 __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1);
3831 __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1);
3832 __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1);
3833
3834 __m512i lhs_mat_01_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_20), lhs_mat_ymm_01_20, 1);
3835 __m512i lhs_mat_23_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_20), lhs_mat_ymm_23_20, 1);
3836 __m512i lhs_mat_01_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_21), lhs_mat_ymm_01_21, 1);
3837 __m512i lhs_mat_23_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_21), lhs_mat_ymm_23_21, 1);
3838
3839 __m512i lhs_mat_01_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_30), lhs_mat_ymm_01_30, 1);
3840 __m512i lhs_mat_23_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_30), lhs_mat_ymm_23_30, 1);
3841 __m512i lhs_mat_01_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_31), lhs_mat_ymm_01_31, 1);
3842 __m512i lhs_mat_23_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_31), lhs_mat_ymm_23_31, 1);
3843
3844 __m512i lhs_mat_01_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_40), lhs_mat_ymm_01_40, 1);
3845 __m512i lhs_mat_23_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_40), lhs_mat_ymm_23_40, 1);
3846 __m512i lhs_mat_01_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_41), lhs_mat_ymm_01_41, 1);
3847 __m512i lhs_mat_23_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_41), lhs_mat_ymm_23_41, 1);
3848
3849 __m512i lhs_mat_01_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_50), lhs_mat_ymm_01_50, 1);
3850 __m512i lhs_mat_23_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_50), lhs_mat_ymm_23_50, 1);
3851 __m512i lhs_mat_01_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_51), lhs_mat_ymm_01_51, 1);
3852 __m512i lhs_mat_23_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_51), lhs_mat_ymm_23_51, 1);
3853
3854 __m512i lhs_mat_01_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_60), lhs_mat_ymm_01_60, 1);
3855 __m512i lhs_mat_23_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_60), lhs_mat_ymm_23_60, 1);
3856 __m512i lhs_mat_01_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_61), lhs_mat_ymm_01_61, 1);
3857 __m512i lhs_mat_23_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_61), lhs_mat_ymm_23_61, 1);
3858
3859 __m512i lhs_mat_01_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_70), lhs_mat_ymm_01_70, 1);
3860 __m512i lhs_mat_23_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_70), lhs_mat_ymm_23_70, 1);
3861 __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1);
3862 __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1);
3863
3864 // Bsums are loaded for the different Q8_K blocks
3865 __m128i lhs_raw_bsums_01_0123 = _mm_loadu_si128((const __m128i *)((a_ptrs[rp][b].bsums + 32 * sb)));
3866 __m128i lhs_raw_bsums_23_0123 = _mm_loadu_si128((const __m128i *)(a_ptrs[rp][b].bsums + 8 + 32 * sb));
3867 __m128i lhs_raw_bsums_01_4567 = _mm_loadu_si128((const __m128i *)((a_ptrs[rp][b].bsums + 16 + 32 * sb)));
3868 __m128i lhs_raw_bsums_23_4567 = _mm_loadu_si128((const __m128i *)(a_ptrs[rp][b].bsums + 24 + 32 * sb));
3869
3870 __m256i lhs_bsums_ymm_01_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_0123), lhs_raw_bsums_01_0123, 1);
3871 __m512i lhs_bsums_01_0123 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_01_0123), lhs_bsums_ymm_01_0123, 1);
3872 __m256i lhs_bsums_ymm_23_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_0123), lhs_raw_bsums_23_0123, 1);
3873 __m512i lhs_bsums_23_0123 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_23_0123), lhs_bsums_ymm_23_0123, 1);
__m256i lhs_bsums_ymm_01_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_4567), lhs_raw_bsums_01_4567, 1);
3874 __m512i lhs_bsums_01_4567 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_01_4567), lhs_bsums_ymm_01_4567, 1);
3875 __m256i lhs_bsums_ymm_23_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_4567), lhs_raw_bsums_23_4567, 1);
3876 __m512i lhs_bsums_23_4567 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_23_4567), lhs_bsums_ymm_23_4567, 1);
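// The Q8_K bsums are the per-16-element sums of the activations; broadcasting them across the 512-bit vector lets them be
// multiplied with the per-column mins below to build the (min * activation sum) correction term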
3877
3878 // Shuffle pattern one - left side input
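// Immediate 160 (0b10'10'00'00) picks dwords 0,0,2,2 of every 128-bit lane, duplicating bytes 0-3 of each of the two rows in the lane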
3879 const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
3880 const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3)
3881
3882 const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
3883 const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11)
3884
3885 const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
3886 const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3)
3887
3888 const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
3889 const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11)
3890
3891 const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3)
3892 const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3)
3893
3894 const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11)
3895 const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11)
3896
3897 const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3)
3898 const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3)
3899
3900 const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11)
3901 const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11)
3902
3903 const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3)
3904 const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3)
3905
3906 const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11)
3907 const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11)
3908
3909 const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3)
3910 const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3)
3911
3912 const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11)
3913 const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11)
3914
3915 const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3)
3916 const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3)
3917
3918 const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11)
3919 const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11)
3920
3921 const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3)
3922 const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3)
3923
3924 const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11)
3925 const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11)
3926
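// Shuffle pattern two - left side input
// Immediate 245 (0b11'11'01'01) picks dwords 1,1,3,3 of every 128-bit lane, duplicating bytes 4-7 of each of the two rows in the lane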
3927 const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
3928 const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7)
3929
3930 const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
3931 const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15)
3932
3933 const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
3934 const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7)
3935
3936 const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
3937 const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15)
3938
3939 const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7)
3940 const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7)
3941
3942 const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15)
3943 const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15)
3944
3945 const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7)
3946 const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7)
3947
3948 const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15)
3949 const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15)
3950
3951 const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7)
3952 const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7)
3953
3954 const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15)
3955 const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15)
3956
3957 const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7)
3958 const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7)
3959
3960 const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15)
3961 const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15)
3962
3963 const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7)
3964 const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7)
3965
3966 const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15)
3967 const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15)
3968
3969 const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7)
3970 const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7)
3971
3972 const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15)
3973 const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15)
3974
3975 // The values arranged in the shuffle patterns are combined in a dot-product style operation: corresponding bytes are multiplied and accumulated pairwise into 16-bit intermediates via _mm512_maddubs_epi16, and are only widened to 32 bits later when multiplied by the scales
3976 __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1));
3977 __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1));
3978
3979 __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1));
3980 __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1));
3981
3982 __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_01_10_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1));
3983 __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1));
3984
3985 __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1));
3986 __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1));
3987
3988 __m512i iacc_mat_00_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_01_20_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_01_21_sp1));
3989 __m512i iacc_mat_01_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_01_20_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_01_21_sp1));
3990
3991 __m512i iacc_mat_10_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_23_20_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_23_21_sp1));
3992 __m512i iacc_mat_11_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_23_20_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_23_21_sp1));
3993
3994 __m512i iacc_mat_00_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_01_30_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_01_31_sp1));
3995 __m512i iacc_mat_01_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_01_30_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_01_31_sp1));
3996
3997 __m512i iacc_mat_10_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_23_30_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_23_31_sp1));
3998 __m512i iacc_mat_11_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_23_30_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_23_31_sp1));
3999
4000 __m512i iacc_mat_00_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_01_40_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_01_41_sp1));
4001 __m512i iacc_mat_01_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_01_40_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_01_41_sp1));
4002
4003 __m512i iacc_mat_10_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_23_40_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_23_41_sp1));
4004 __m512i iacc_mat_11_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_23_40_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_23_41_sp1));
4005
4006 __m512i iacc_mat_00_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_01_50_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_01_51_sp1));
4007 __m512i iacc_mat_01_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_01_50_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_01_51_sp1));
4008
4009 __m512i iacc_mat_10_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_23_50_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_23_51_sp1));
4010 __m512i iacc_mat_11_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_23_50_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_23_51_sp1));
4011
4012 __m512i iacc_mat_00_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_01_60_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_01_61_sp1));
4013 __m512i iacc_mat_01_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_01_60_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_01_61_sp1));
4014
4015 __m512i iacc_mat_10_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_23_60_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_23_61_sp1));
4016 __m512i iacc_mat_11_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_23_60_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_23_61_sp1));
4017
4018 __m512i iacc_mat_00_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_01_70_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_01_71_sp1));
4019 __m512i iacc_mat_01_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_01_70_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_01_71_sp1));
4020
4021 __m512i iacc_mat_10_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_23_70_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_23_71_sp1));
4022 __m512i iacc_mat_11_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_23_70_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_23_71_sp1));
4023
4024
4025 __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2));
4026 __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2));
4027
4028 __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2));
4029 __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2));
4030
4031 __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2));
4032 __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2));
4033
4034 __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2));
4035 __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2));
4036
4037 __m512i iacc_mat_00_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_01_20_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_01_21_sp2));
4038 __m512i iacc_mat_01_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_01_20_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_01_21_sp2));
4039
4040 __m512i iacc_mat_10_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_23_20_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_23_21_sp2));
4041 __m512i iacc_mat_11_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_23_20_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_23_21_sp2));
4042
4043 __m512i iacc_mat_00_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_01_30_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_01_31_sp2));
4044 __m512i iacc_mat_01_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_01_30_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_01_31_sp2));
4045
4046 __m512i iacc_mat_10_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_23_30_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_23_31_sp2));
4047 __m512i iacc_mat_11_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_23_30_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_23_31_sp2));
4048
4049 __m512i iacc_mat_00_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_01_40_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_01_41_sp2));
4050 __m512i iacc_mat_01_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_01_40_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_01_41_sp2));
4051
4052 __m512i iacc_mat_10_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_23_40_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_23_41_sp2));
4053 __m512i iacc_mat_11_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_23_40_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_23_41_sp2));
4054
4055 __m512i iacc_mat_00_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_01_50_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_01_51_sp2));
4056 __m512i iacc_mat_01_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_01_50_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_01_51_sp2));
4057
4058 __m512i iacc_mat_10_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_23_50_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_23_51_sp2));
4059 __m512i iacc_mat_11_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_23_50_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_23_51_sp2));
4060
4061 __m512i iacc_mat_00_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_01_60_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_01_61_sp2));
4062 __m512i iacc_mat_01_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_01_60_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_01_61_sp2));
4063
4064 __m512i iacc_mat_10_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_23_60_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_23_61_sp2));
4065 __m512i iacc_mat_11_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_23_60_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_23_61_sp2));
4066
4067 __m512i iacc_mat_00_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_01_70_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_01_71_sp2));
4068 __m512i iacc_mat_01_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_01_70_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_01_71_sp2));
4069
4070 __m512i iacc_mat_10_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_23_70_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_23_71_sp2));
4071 __m512i iacc_mat_11_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_23_70_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_23_71_sp2));
4072
4073 // Combine results from both shuffle patterns for each output block
4074 __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
4075 __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
4076 __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
4077 __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);
4078
4079 __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
4080 __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
4081 __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
4082 __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);
4083
4084 __m512i iacc_mat_00_2 = _mm512_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2);
4085 __m512i iacc_mat_01_2 = _mm512_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2);
4086 __m512i iacc_mat_10_2 = _mm512_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2);
4087 __m512i iacc_mat_11_2 = _mm512_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2);
4088
4089 __m512i iacc_mat_00_3 = _mm512_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2);
4090 __m512i iacc_mat_01_3 = _mm512_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2);
4091 __m512i iacc_mat_10_3 = _mm512_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2);
4092 __m512i iacc_mat_11_3 = _mm512_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2);
4093
4094 __m512i iacc_mat_00_4 = _mm512_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2);
4095 __m512i iacc_mat_01_4 = _mm512_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2);
4096 __m512i iacc_mat_10_4 = _mm512_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2);
4097 __m512i iacc_mat_11_4 = _mm512_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2);
4098
4099 __m512i iacc_mat_00_5 = _mm512_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2);
4100 __m512i iacc_mat_01_5 = _mm512_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2);
4101 __m512i iacc_mat_10_5 = _mm512_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2);
4102 __m512i iacc_mat_11_5 = _mm512_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2);
4103
4104 __m512i iacc_mat_00_6 = _mm512_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2);
4105 __m512i iacc_mat_01_6 = _mm512_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2);
4106 __m512i iacc_mat_10_6 = _mm512_add_epi16(iacc_mat_10_6_sp1, iacc_mat_10_6_sp2);
4107 __m512i iacc_mat_11_6 = _mm512_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2);
4108
4109 __m512i iacc_mat_00_7 = _mm512_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2);
4110 __m512i iacc_mat_01_7 = _mm512_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2);
4111 __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2);
4112 __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2);
4113
4114 // Multiply the combined 16-bit dot products with the corresponding sub block scales and horizontally add pairs into 32-bit accumulators
4115 iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0);
4116 iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0);
4117 iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0);
4118 iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0);
4119
4120 iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1);
4121 iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1);
4122 iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1);
4123 iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1);
4124
4125 iacc_mat_00_2 = _mm512_madd_epi16(iacc_mat_00_2, scale_014589CD_2);
4126 iacc_mat_01_2 = _mm512_madd_epi16(iacc_mat_01_2, scale_2367ABEF_2);
4127 iacc_mat_10_2 = _mm512_madd_epi16(iacc_mat_10_2, scale_014589CD_2);
4128 iacc_mat_11_2 = _mm512_madd_epi16(iacc_mat_11_2, scale_2367ABEF_2);
4129
4130 iacc_mat_00_3 = _mm512_madd_epi16(iacc_mat_00_3, scale_014589CD_3);
4131 iacc_mat_01_3 = _mm512_madd_epi16(iacc_mat_01_3, scale_2367ABEF_3);
4132 iacc_mat_10_3 = _mm512_madd_epi16(iacc_mat_10_3, scale_014589CD_3);
4133 iacc_mat_11_3 = _mm512_madd_epi16(iacc_mat_11_3, scale_2367ABEF_3);
4134
4135 iacc_mat_00_4 = _mm512_madd_epi16(iacc_mat_00_4, scale_014589CD_4);
4136 iacc_mat_01_4 = _mm512_madd_epi16(iacc_mat_01_4, scale_2367ABEF_4);
4137 iacc_mat_10_4 = _mm512_madd_epi16(iacc_mat_10_4, scale_014589CD_4);
4138 iacc_mat_11_4 = _mm512_madd_epi16(iacc_mat_11_4, scale_2367ABEF_4);
4139
4140 iacc_mat_00_5 = _mm512_madd_epi16(iacc_mat_00_5, scale_014589CD_5);
4141 iacc_mat_01_5 = _mm512_madd_epi16(iacc_mat_01_5, scale_2367ABEF_5);
4142 iacc_mat_10_5 = _mm512_madd_epi16(iacc_mat_10_5, scale_014589CD_5);
4143 iacc_mat_11_5 = _mm512_madd_epi16(iacc_mat_11_5, scale_2367ABEF_5);
4144
4145 iacc_mat_00_6 = _mm512_madd_epi16(iacc_mat_00_6, scale_014589CD_6);
4146 iacc_mat_01_6 = _mm512_madd_epi16(iacc_mat_01_6, scale_2367ABEF_6);
4147 iacc_mat_10_6 = _mm512_madd_epi16(iacc_mat_10_6, scale_014589CD_6);
4148 iacc_mat_11_6 = _mm512_madd_epi16(iacc_mat_11_6, scale_2367ABEF_6);
4149
4150 iacc_mat_00_7 = _mm512_madd_epi16(iacc_mat_00_7, scale_014589CD_7);
4151 iacc_mat_01_7 = _mm512_madd_epi16(iacc_mat_01_7, scale_2367ABEF_7);
4152 iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7);
4153 iacc_mat_11_7 = _mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7);
4154
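// Sum the scaled 32-bit accumulators of all eight sub blocks into a single accumulator per output tile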
4155 __m512i iacc_mat_00 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7)));
4156 __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7)));
4157 __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7)));
4158 __m512i iacc_mat_11 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm512_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm512_add_epi32(iacc_mat_11_6, iacc_mat_11_7)));
4159
4160 // Straighten out to make 4 row vectors
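// Mask 0xCCCC takes dwords 2,3 of every 4-dword group from the second operand, and immediate 78 (0b01'00'11'10) swaps the two
// 64-bit halves of each 128-bit lane, so the 2x2 output tiles are interleaved back into full row vectors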
4161 __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
4162 __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
4163 __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
4164 __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
4165
4166 // Load the scale (d) values of all four Q8_K rows and repeat them across lanes
4167 const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
4168 const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
4169 const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
4170
4171 // Multiply with the appropriate scales and accumulate (for both d and dmin) below
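// (_mm512_shuffle_ps immediates 0, 85, 170 and 255 broadcast row scale 0, 1, 2 and 3 respectively into every lane)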
4172 acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
4173 acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
4174 acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
4175 acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
4176
4177 // Take the bsums of two sub blocks at a time from the Q8_K rows and multiply them with the corresponding min values from each Q2_K
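// Each 32-bit lane of the bsums vectors holds one row's bsum pair for two consecutive sub blocks, so broadcasting a lane and
// using _mm512_madd_epi16 with the widened mins yields sum(bsum * min) over those two sub blocks for every column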
4178 __m512i iacc_row_min_0_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)0), mins_01);
4179 __m512i iacc_row_min_1_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)170), mins_01);
4180 __m512i iacc_row_min_2_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)0), mins_01);
4181 __m512i iacc_row_min_3_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)170), mins_01);
4182
4183 __m512i iacc_row_min_0_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)85), mins_23);
4184 __m512i iacc_row_min_1_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)255), mins_23);
4185 __m512i iacc_row_min_2_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)85), mins_23);
4186 __m512i iacc_row_min_3_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)255), mins_23);
4187
4188 __m512i iacc_row_min_0_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)0), mins_45);
4189 __m512i iacc_row_min_1_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)170), mins_45);
4190 __m512i iacc_row_min_2_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)0), mins_45);
4191 __m512i iacc_row_min_3_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)170), mins_45);
4192
4193 __m512i iacc_row_min_0_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)85), mins_67);
4194 __m512i iacc_row_min_1_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)255), mins_67);
4195 __m512i iacc_row_min_2_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)85), mins_67);
4196 __m512i iacc_row_min_3_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)255), mins_67);
4197
4198 __m512i iacc_row_min_0 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_0_01, iacc_row_min_0_23), _mm512_add_epi32(iacc_row_min_0_45,iacc_row_min_0_67));
4199 __m512i iacc_row_min_1 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_1_01, iacc_row_min_1_23), _mm512_add_epi32(iacc_row_min_1_45,iacc_row_min_1_67));
4200 __m512i iacc_row_min_2 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_2_01, iacc_row_min_2_23), _mm512_add_epi32(iacc_row_min_2_45,iacc_row_min_2_67));
4201 __m512i iacc_row_min_3 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_3_01, iacc_row_min_3_23), _mm512_add_epi32(iacc_row_min_3_45,iacc_row_min_3_67));
4202
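// The min sums are scaled by dmin and accumulated separately; they are subtracted from the main accumulators when the results are stored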
4203 acc_min_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_0), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[rp * 4]);
4204 acc_min_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_1), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[rp * 4 + 1]);
4205 acc_min_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_2), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[rp * 4 + 2]);
4206 acc_min_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[rp * 4 + 3]);
4207 }
4208 }
4209 }
4210 // Store the accumulated values
4211 for (int i = 0; i < 16; i++) {
4212 _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
4213 }
4214 }
4215 }
4216
4217 for (; y < nr / 4; y ++) {
4218
4219 const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);
4220
4221 // Take a group of two block_q2_Kx8 structures at each pass of the loop and perform the dot product operation
4222 for (int64_t x = 0; x < anc / 8; x += 2) {
4223
4224 const block_q2_Kx8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
4225 const block_q2_Kx8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
4226
4227 // Master FP accumulators
4228 __m512 acc_rows[4];
4229 for (int i = 0; i < 4; i++) {
4230 acc_rows[i] = _mm512_setzero_ps();
4231 }
4232
4233 __m512 acc_min_rows[4];
4234 for (int i = 0; i < 4; i++) {
4235 acc_min_rows[i] = _mm512_setzero_ps();
4236 }
4237 // For super block
4238 for (int64_t b = 0; b < nb; b++) {
4239 // Delta values - Load the sixteen scale values from two block_q2_Kx8 structures
4240 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
4241
4242 // dmin values - Load the sixteen dmin values from two block_q2_Kx8 structures
4243 const __m512 col_dmin_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].dmin, b_ptr_1[b].dmin);
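// d and dmin are the fp16 per-super-block scales of Q2_K; the load helper converts eight half values from each structure into sixteen fp32 lanes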
4244
4245 // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration
4246 for (int sb = 0; sb < QK_K / 128; sb++) {
4247
4248                        // Load the quantized values of eight sub blocks from the eight block_q2_K columns - the columns are interleaved with each other in chunks of eight bytes: B0,B1 .... B6,B7
4249 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + sb * 256));
4250 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 32 + sb * 256));
4251 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 64 + sb * 256));
4252 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 96 + sb * 256));
4253 const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 128 + sb * 256));
4254 const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 160 + sb * 256));
4255 const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 192 + sb * 256));
4256 const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_0[b].qs + 224 + sb * 256));
4257
4258 const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + sb * 256));
4259 const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 32 + sb * 256));
4260 const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 64 + sb * 256));
4261 const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 96 + sb * 256));
4262 const __m256i rhs_raw_mat_89AB_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 128 + sb * 256));
4263 const __m256i rhs_raw_mat_CDEF_2 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 160 + sb * 256));
4264 const __m256i rhs_raw_mat_89AB_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 192 + sb * 256));
4265 const __m256i rhs_raw_mat_CDEF_3 = _mm256_loadu_si256((const __m256i * )(b_ptr_1[b].qs + 224 + sb * 256));
4266
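                        // Rearrange the columns so that each 256-bit vector holds the column pairs (0,1,4,5), (2,3,6,7), (8,9,C,D) and (A,B,E,F)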
4267 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
4268 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
4269 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
4270 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
4271 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
4272 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
4273 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
4274 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
4275
4276 const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
4277 const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
4278 const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
4279 const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
4280 const __m256i rhs_raw_mat_89CD_2 = _mm256_blend_epi32(rhs_raw_mat_89AB_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_2, requiredOrder), 240);
4281 const __m256i rhs_raw_mat_ABEF_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_2, requiredOrder), rhs_raw_mat_CDEF_2, 240);
4282 const __m256i rhs_raw_mat_89CD_3 = _mm256_blend_epi32(rhs_raw_mat_89AB_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_3, requiredOrder), 240);
4283 const __m256i rhs_raw_mat_ABEF_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_3, requiredOrder), rhs_raw_mat_CDEF_3, 240);
4284
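                        // Stack the 256-bit column groups into 512-bit vectors: the lower half holds columns 0,1,4,5 (resp. 2,3,6,7) and the upper half holds columns 8,9,C,D (resp. A,B,E,F)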
4285 const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
4286 const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
4287 const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
4288 const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
4289
4290 const __m512i rhs_raw_mat_014589CD_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_2), rhs_raw_mat_89CD_2, 1);
4291 const __m512i rhs_raw_mat_2367ABEF_2 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_2), rhs_raw_mat_ABEF_2, 1);
4292 const __m512i rhs_raw_mat_014589CD_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_3), rhs_raw_mat_89CD_3, 1);
4293 const __m512i rhs_raw_mat_2367ABEF_3 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_3), rhs_raw_mat_ABEF_3, 1);
4294
4295 //2-bit -> 8-bit
4296 const __m512i rhs_mat_014589CD_00 = _mm512_and_si512(rhs_raw_mat_014589CD_0,m3bexpanded); //B00(0-7) B01(0-7) B04(0-7) B05(0-7) B08(0-7) B09(0-7) B0C(0-7) B0D(0-7)
4297 const __m512i rhs_mat_2367ABEF_00 = _mm512_and_si512(rhs_raw_mat_2367ABEF_0,m3bexpanded); //B02(0-7) B03(0-7) B06(0-7) B07(0-7) B0A(0-7) B0B(0-7) B0E(0-7) B0F(0-7)
4298 const __m512i rhs_mat_014589CD_01 = _mm512_and_si512(rhs_raw_mat_014589CD_1,m3bexpanded); //B00(8-15) B01(8-15) B04(8-15) B05(8-15) B08(8-15) B09(8-15) B0C(8-15) B0D(8-15)
4299 const __m512i rhs_mat_2367ABEF_01 = _mm512_and_si512(rhs_raw_mat_2367ABEF_1,m3bexpanded); //B02(8-15) B03(8-15) B06(8-15) B07(8-15) B0A(8-15) B0B(8-15) B0E(8-15) B0F(8-15)
4300 const __m512i rhs_mat_014589CD_10 = _mm512_and_si512(rhs_raw_mat_014589CD_2,m3bexpanded); //B10(0-7) B11(0-7) B14(0-7) B15(0-7) B18(0-7) B19(0-7) B1C(0-7) B1D(0-7)
4301 const __m512i rhs_mat_2367ABEF_10 = _mm512_and_si512(rhs_raw_mat_2367ABEF_2,m3bexpanded); //B12(0-7) B13(0-7) B16(0-7) B17(0-7) B1A(0-7) B1B(0-7) B1E(0-7) B1F(0-7)
4302 const __m512i rhs_mat_014589CD_11 = _mm512_and_si512(rhs_raw_mat_014589CD_3,m3bexpanded); //B10(8-15) B11(8-15) B14(8-15) B15(8-15) B18(8-15) B19(8-15) B1C(8-15) B1D(8-15)
4303 const __m512i rhs_mat_2367ABEF_11 = _mm512_and_si512(rhs_raw_mat_2367ABEF_3,m3bexpanded); //B12(8-15) B13(8-15) B16(8-15) B17(8-15) B1A(8-15) B1B(8-15) B1E(8-15) B1F(8-15)
4304
4305 const __m512i rhs_mat_014589CD_20 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 2), m3bexpanded); //B20(0-7) B21(0-7) B24(0-7) B25(0-7) B28(0-7) B29(0-7) B2C(0-7) B2D(0-7)
4306 const __m512i rhs_mat_2367ABEF_20 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 2), m3bexpanded); //B22(0-7) B23(0-7) B26(0-7) B27(0-7) B2A(0-7) B2B(0-7) B2E(0-7) B2F(0-7)
4307
4308 const __m512i rhs_mat_014589CD_21 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 2), m3bexpanded); //B20(8-15) B21(8-15) B24(8-15) B25(8-15) B28(8-15) B29(8-15) B2C(8-15) B2D(8-15)
4309 const __m512i rhs_mat_2367ABEF_21 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 2), m3bexpanded); //B22(8-15) B23(8-15) B26(8-15) B27(8-15) B2A(8-15) B2B(8-15) B2E(8-15) B2F(8-15)
4310
4311 const __m512i rhs_mat_014589CD_30 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 2), m3bexpanded); //B30(0-7) B31(0-7) B34(0-7) B35(0-7) B38(0-7) B39(0-7) B3C(0-7) B3D(0-7)
4312 const __m512i rhs_mat_2367ABEF_30 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 2), m3bexpanded); //B32(0-7) B33(0-7) B36(0-7) B37(0-7) B3A(0-7) B3B(0-7) B3E(0-7) B3F(0-7)
4313
4314 const __m512i rhs_mat_014589CD_31 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 2), m3bexpanded); //B30(8-15) B31(8-15) B34(8-15) B35(8-15) B38(8-15) B39(8-15) B3C(8-15) B3D(8-15)
4315 const __m512i rhs_mat_2367ABEF_31 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 2), m3bexpanded); //B32(8-15) B33(8-15) B36(8-15) B37(8-15) B3A(8-15) B3B(8-15) B3E(8-15) B3F(8-15)
4316
4317 const __m512i rhs_mat_014589CD_40 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m3bexpanded); //B40(0-7) B41(0-7) B44(0-7) B45(0-7) B48(0-7) B49(0-7) B4C(0-7) B4D(0-7)
4318 const __m512i rhs_mat_2367ABEF_40 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m3bexpanded); //B42(0-7) B43(0-7) B46(0-7) B47(0-7) B4A(0-7) B4B(0-7) B4E(0-7) B4F(0-7)
4319
4320 const __m512i rhs_mat_014589CD_41 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m3bexpanded); //B40(8-15) B41(8-15) B44(8-15) B45(8-15) B48(8-15) B49(8-15) B4C(8-15) B4D(8-15)
4321 const __m512i rhs_mat_2367ABEF_41 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m3bexpanded); //B42(8-15) B43(8-15) B46(8-15) B47(8-15) B4A(8-15) B4B(8-15) B4E(8-15) B4F(8-15)
4322
4323 const __m512i rhs_mat_014589CD_50 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 4), m3bexpanded); //B50(0-7) B51(0-7) B54(0-7) B55(0-7) B58(0-7) B59(0-7) B5C(0-7) B5D(0-7)
4324 const __m512i rhs_mat_2367ABEF_50 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 4), m3bexpanded); //B52(0-7) B53(0-7) B56(0-7) B57(0-7) B5A(0-7) B5B(0-7) B5E(0-7) B5F(0-7)
4325
4326 const __m512i rhs_mat_014589CD_51 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 4), m3bexpanded); //B50(8-15) B51(8-15) B54(8-15) B55(8-15) B58(8-15) B59(8-15) B5C(8-15) B5D(8-15)
4327 const __m512i rhs_mat_2367ABEF_51 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 4), m3bexpanded); //B52(8-15) B53(8-15) B56(8-15) B57(8-15) B5A(8-15) B5B(8-15) B5E(8-15) B5F(8-15)
4328
4329 const __m512i rhs_mat_014589CD_60 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 6), m3bexpanded); //B60(0-7) B61(0-7) B64(0-7) B65(0-7) B68(0-7) B69(0-7) B6C(0-7) B6D(0-7)
4330 const __m512i rhs_mat_2367ABEF_60 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 6), m3bexpanded); //B62(0-7) B63(0-7) B66(0-7) B67(0-7) B6A(0-7) B6B(0-7) B6E(0-7) B6F(0-7)
4331
4332 const __m512i rhs_mat_014589CD_61 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 6), m3bexpanded); //B60(8-15) B61(8-15) B64(8-15) B65(8-15) B68(8-15) B69(8-15) B6C(8-15) B6D(8-15)
4333 const __m512i rhs_mat_2367ABEF_61 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 6), m3bexpanded); //B62(8-15) B63(8-15) B66(8-15) B67(8-15) B6A(8-15) B6B(8-15) B6E(8-15) B6F(8-15)
4334
4335 const __m512i rhs_mat_014589CD_70 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_2, 6), m3bexpanded); //B70(0-7) B71(0-7) B74(0-7) B75(0-7) B78(0-7) B79(0-7) B7C(0-7) B7D(0-7)
4336 const __m512i rhs_mat_2367ABEF_70 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_2, 6), m3bexpanded); //B72(0-7) B73(0-7) B76(0-7) B77(0-7) B7A(0-7) B7B(0-7) B7E(0-7) B7F(0-7)
4337
4338 const __m512i rhs_mat_014589CD_71 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_3, 6), m3bexpanded); //B70(8-15) B71(8-15) B74(8-15) B75(8-15) B78(8-15) B79(8-15) B7C(8-15) B7D(8-15)
4339 const __m512i rhs_mat_2367ABEF_71 = _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_3, 6), m3bexpanded); //B72(8-15) B73(8-15) B76(8-15) B77(8-15) B7A(8-15) B7B(8-15) B7E(8-15) B7F(8-15)
4340
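                        // Shuffle pattern one - right side input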
4341 const __m512i rhs_mat_014589CD_00_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3) B08(0-3) B09(0-3) B08(0-3) B09(0-3) B0C(0-3) B0D(0-3) B0C(0-3) B0D(0-3)
4342 const __m512i rhs_mat_2367ABEF_00_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3) B0A(0-3) B0B(0-3) B0A(0-3) B0B(0-3) B0E(0-3) B0F(0-3) B0E(0-3) B0F(0-3)
4343
4344 const __m512i rhs_mat_014589CD_01_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11) B08(8-11) B09(8-11) B08(8-11) B09(8-11) B0C(8-11) B0D(8-11) B0C(8-11) B0D(8-11)
4345 const __m512i rhs_mat_2367ABEF_01_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11) B0A(8-11) B0B(8-11) B0A(8-11) B0B(8-11) B0E(8-11) B0F(8-11) B0E(8-11) B0F(8-11)
4346
4347 const __m512i rhs_mat_014589CD_10_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3) B18(0-3) B19(0-3) B18(0-3) B19(0-3) B1C(0-3) B1D(0-3) B1C(0-3) B1D(0-3)
4348 const __m512i rhs_mat_2367ABEF_10_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3) B1A(0-3) B1B(0-3) B1A(0-3) B1B(0-3) B1E(0-3) B1F(0-3) B1E(0-3) B1F(0-3)
4349
4350 const __m512i rhs_mat_014589CD_11_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11) B18(8-11) B19(8-11) B18(8-11) B19(8-11) B1C(8-11) B1D(8-11) B1C(8-11) B1D(8-11)
4351 const __m512i rhs_mat_2367ABEF_11_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11) B1A(8-11) B1B(8-11) B1A(8-11) B1B(8-11) B1E(8-11) B1F(8-11) B1E(8-11) B1F(8-11)
4352
4353 const __m512i rhs_mat_014589CD_20_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3) B28(0-3) B29(0-3) B28(0-3) B29(0-3) B2C(0-3) B2D(0-3) B2C(0-3) B2D(0-3)
4354 const __m512i rhs_mat_2367ABEF_20_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3) B2A(0-3) B2B(0-3) B2A(0-3) B2B(0-3) B2E(0-3) B2F(0-3) B2E(0-3) B2F(0-3)
4355
4356 const __m512i rhs_mat_014589CD_21_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11) B28(8-11) B29(8-11) B28(8-11) B29(8-11) B2C(8-11) B2D(8-11) B2C(8-11) B2D(8-11)
4357 const __m512i rhs_mat_2367ABEF_21_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11) B2A(8-11) B2B(8-11) B2A(8-11) B2B(8-11) B2E(8-11) B2F(8-11) B2E(8-11) B2F(8-11)

4358                        const __m512i rhs_mat_014589CD_30_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3) B38(0-3) B39(0-3) B38(0-3) B39(0-3) B3C(0-3) B3D(0-3) B3C(0-3) B3D(0-3)
4359 const __m512i rhs_mat_2367ABEF_30_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3) B3A(0-3) B3B(0-3) B3A(0-3) B3B(0-3) B3E(0-3) B3F(0-3) B3E(0-3) B3F(0-3)
4360
4361 const __m512i rhs_mat_014589CD_31_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11) B38(8-11) B39(8-11) B38(8-11) B39(8-11) B3C(8-11) B3D(8-11) B3C(8-11) B3D(8-11)
4362 const __m512i rhs_mat_2367ABEF_31_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11) B3A(8-11) B3B(8-11) B3A(8-11) B3B(8-11) B3E(8-11) B3F(8-11) B3E(8-11) B3F(8-11)
4363
4364 const __m512i rhs_mat_014589CD_40_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3) B48(0-3) B49(0-3) B48(0-3) B49(0-3) B4C(0-3) B4D(0-3) B4C(0-3) B4D(0-3)
4365 const __m512i rhs_mat_2367ABEF_40_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3) B4A(0-3) B4B(0-3) B4A(0-3) B4B(0-3) B4E(0-3) B4F(0-3) B4E(0-3) B4F(0-3)
4366
4367 const __m512i rhs_mat_014589CD_41_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11) B48(8-11) B49(8-11) B48(8-11) B49(8-11) B4C(8-11) B4D(8-11) B4C(8-11) B4D(8-11)
4368 const __m512i rhs_mat_2367ABEF_41_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11) B4A(8-11) B4B(8-11) B4A(8-11) B4B(8-11) B4E(8-11) B4F(8-11) B4E(8-11) B4F(8-11)
4369
4370 const __m512i rhs_mat_014589CD_50_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3) B58(0-3) B59(0-3) B58(0-3) B59(0-3) B5C(0-3) B5D(0-3) B5C(0-3) B5D(0-3)
4371 const __m512i rhs_mat_2367ABEF_50_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3) B5A(0-3) B5B(0-3) B5A(0-3) B5B(0-3) B5E(0-3) B5F(0-3) B5E(0-3) B5F(0-3)
4372
4373 const __m512i rhs_mat_014589CD_51_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11) B58(8-11) B59(8-11) B58(8-11) B59(8-11) B5C(8-11) B5D(8-11) B5C(8-11) B5D(8-11)
4374 const __m512i rhs_mat_2367ABEF_51_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11) B5A(8-11) B5B(8-11) B5A(8-11) B5B(8-11) B5E(8-11) B5F(8-11) B5E(8-11) B5F(8-11)
4375
4376 const __m512i rhs_mat_014589CD_60_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3) B68(0-3) B69(0-3) B68(0-3) B69(0-3) B6C(0-3) B6D(0-3) B6C(0-3) B6D(0-3)
4377 const __m512i rhs_mat_2367ABEF_60_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3) B6A(0-3) B6B(0-3) B6A(0-3) B6B(0-3) B6E(0-3) B6F(0-3) B6E(0-3) B6F(0-3)
4378
4379 const __m512i rhs_mat_014589CD_61_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11) B68(8-11) B69(8-11) B68(8-11) B69(8-11) B6C(8-11) B6D(8-11) B6C(8-11) B6D(8-11)
4380 const __m512i rhs_mat_2367ABEF_61_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11) B6A(8-11) B6B(8-11) B6A(8-11) B6B(8-11) B6E(8-11) B6F(8-11) B6E(8-11) B6F(8-11)
4381
4382 const __m512i rhs_mat_014589CD_70_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3) B78(0-3) B79(0-3) B78(0-3) B79(0-3) B7C(0-3) B7D(0-3) B7C(0-3) B7D(0-3)
4383 const __m512i rhs_mat_2367ABEF_70_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3) B7A(0-3) B7B(0-3) B7A(0-3) B7B(0-3) B7E(0-3) B7F(0-3) B7E(0-3) B7F(0-3)
4384
4385                        const __m512i rhs_mat_014589CD_71_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11) B78(8-11) B79(8-11) B78(8-11) B79(8-11) B7C(8-11) B7D(8-11) B7C(8-11) B7D(8-11)
4386 const __m512i rhs_mat_2367ABEF_71_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11) B7A(8-11) B7B(8-11) B7A(8-11) B7B(8-11) B7E(8-11) B7F(8-11) B7E(8-11) B7F(8-11)
4387
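                        // Shuffle pattern two - right side input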
4388 const __m512i rhs_mat_014589CD_00_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_00, (_MM_PERM_ENUM)221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7) B08(4-7) B09(4-7) B08(4-7) B09(4-7) B0C(4-7) B0D(4-7) B0C(4-7) B0D(4-7)
4389 const __m512i rhs_mat_2367ABEF_00_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_00, (_MM_PERM_ENUM)221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7) B0A(4-7) B0B(4-7) B0A(4-7) B0B(4-7) B0E(4-7) B0F(4-7) B0E(4-7) B0F(4-7)
4390
4391 const __m512i rhs_mat_014589CD_01_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_01, (_MM_PERM_ENUM)221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15) B08(12-15) B09(12-15) B08(12-15) B09(12-15) B0C(12-15) B0D(12-15) B0C(12-15) B0D(12-15)
4392 const __m512i rhs_mat_2367ABEF_01_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_01, (_MM_PERM_ENUM)221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15) B0A(12-15) B0B(12-15) B0A(12-15) B0B(12-15) B0E(12-15) B0F(12-15) B0E(12-15) B0F(12-15)
4393
4394 const __m512i rhs_mat_014589CD_10_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_10, (_MM_PERM_ENUM)221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7) B18(4-7) B19(4-7) B18(4-7) B19(4-7) B1C(4-7) B1D(4-7) B1C(4-7) B1D(4-7)
4395 const __m512i rhs_mat_2367ABEF_10_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_10, (_MM_PERM_ENUM)221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7) B1A(4-7) B1B(4-7) B1A(4-7) B1B(4-7) B1E(4-7) B1F(4-7) B1E(4-7) B1F(4-7)
4396
4397 const __m512i rhs_mat_014589CD_11_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_11, (_MM_PERM_ENUM)221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15) B18(12-15) B19(12-15) B18(12-15) B19(12-15) B1C(12-15) B1D(12-15) B1C(12-15) B1D(12-15)
4398 const __m512i rhs_mat_2367ABEF_11_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_11, (_MM_PERM_ENUM)221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15) B1A(12-15) B1B(12-15) B1A(12-15) B1B(12-15) B1E(12-15) B1F(12-15) B1E(12-15) B1F(12-15)
4399
4400 const __m512i rhs_mat_014589CD_20_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_20, (_MM_PERM_ENUM)221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7) B28(4-7) B29(4-7) B28(4-7) B29(4-7) B2C(4-7) B2D(4-7) B2C(4-7) B2D(4-7)
4401 const __m512i rhs_mat_2367ABEF_20_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_20, (_MM_PERM_ENUM)221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7) B2A(4-7) B2B(4-7) B2A(4-7) B2B(4-7) B2E(4-7) B2F(4-7) B2E(4-7) B2F(4-7)
4402
4403 const __m512i rhs_mat_014589CD_21_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_21, (_MM_PERM_ENUM)221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15) B28(12-15) B29(12-15) B28(12-15) B29(12-15) B2C(12-15) B2D(12-15) B2C(12-15) B2D(12-15)
4404 const __m512i rhs_mat_2367ABEF_21_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_21, (_MM_PERM_ENUM)221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15) B2A(12-15) B2B(12-15) B2A(12-15) B2B(12-15) B2E(12-15) B2F(12-15) B2E(12-15) B2F(12-15)
4405
4406 const __m512i rhs_mat_014589CD_30_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_30, (_MM_PERM_ENUM)221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7) B38(4-7) B39(4-7) B38(4-7) B39(4-7) B3C(4-7) B3D(4-7) B3C(4-7) B3D(4-7)
4407 const __m512i rhs_mat_2367ABEF_30_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_30, (_MM_PERM_ENUM)221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7) B3A(4-7) B3B(4-7) B3A(4-7) B3B(4-7) B3E(4-7) B3F(4-7) B3E(4-7) B3F(4-7)
4408
4409 const __m512i rhs_mat_014589CD_31_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_31, (_MM_PERM_ENUM)221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15) B38(12-15) B39(12-15) B38(12-15) B39(12-15) B3C(12-15) B3D(12-15) B3C(12-15) B3D(12-15)
4410 const __m512i rhs_mat_2367ABEF_31_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_31, (_MM_PERM_ENUM)221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15) B3A(12-15) B3B(12-15) B3A(12-15) B3B(12-15) B3E(12-15) B3F(12-15) B3E(12-15) B3F(12-15)
4411
4412 const __m512i rhs_mat_014589CD_40_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_40, (_MM_PERM_ENUM)221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7) B48(4-7) B49(4-7) B48(4-7) B49(4-7) B4C(4-7) B4D(4-7) B4C(4-7) B4D(4-7)
4413 const __m512i rhs_mat_2367ABEF_40_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_40, (_MM_PERM_ENUM)221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7) B4A(4-7) B4B(4-7) B4A(4-7) B4B(4-7) B4E(4-7) B4F(4-7) B4E(4-7) B4F(4-7)
4414
4415 const __m512i rhs_mat_014589CD_41_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_41, (_MM_PERM_ENUM)221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15) B48(12-15) B49(12-15) B48(12-15) B49(12-15) B4C(12-15) B4D(12-15) B4C(12-15) B4D(12-15)
4416 const __m512i rhs_mat_2367ABEF_41_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_41, (_MM_PERM_ENUM)221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15) B4A(12-15) B4B(12-15) B4A(12-15) B4B(12-15) B4E(12-15) B4F(12-15) B4E(12-15) B4F(12-15)
4417
4418 const __m512i rhs_mat_014589CD_50_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_50, (_MM_PERM_ENUM)221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7) B58(4-7) B59(4-7) B58(4-7) B59(4-7) B5C(4-7) B5D(4-7) B5C(4-7) B5D(4-7)
4419 const __m512i rhs_mat_2367ABEF_50_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_50, (_MM_PERM_ENUM)221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7) B5A(4-7) B5B(4-7) B5A(4-7) B5B(4-7) B5E(4-7) B5F(4-7) B5E(4-7) B5F(4-7)
4420
4421 const __m512i rhs_mat_014589CD_51_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_51, (_MM_PERM_ENUM)221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15) B58(12-15) B59(12-15) B58(12-15) B59(12-15) B5C(12-15) B5D(12-15) B5C(12-15) B5D(12-15)
4422 const __m512i rhs_mat_2367ABEF_51_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_51, (_MM_PERM_ENUM)221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15) B5A(12-15) B5B(12-15) B5A(12-15) B5B(12-15) B5E(12-15) B5F(12-15) B5E(12-15) B5F(12-15)
4423
4424 const __m512i rhs_mat_014589CD_60_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_60, (_MM_PERM_ENUM)221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7) B68(4-7) B69(4-7) B68(4-7) B69(4-7) B6C(4-7) B6D(4-7) B6C(4-7) B6D(4-7)
4425 const __m512i rhs_mat_2367ABEF_60_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_60, (_MM_PERM_ENUM)221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7) B6A(4-7) B6B(4-7) B6A(4-7) B6B(4-7) B6E(4-7) B6F(4-7) B6E(4-7) B6F(4-7)
4426
4427 const __m512i rhs_mat_014589CD_61_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_61, (_MM_PERM_ENUM)221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15) B68(12-15) B69(12-15) B68(12-15) B69(12-15) B6C(12-15) B6D(12-15) B6C(12-15) B6D(12-15)
4428 const __m512i rhs_mat_2367ABEF_61_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_61, (_MM_PERM_ENUM)221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15) B6A(12-15) B6B(12-15) B6A(12-15) B6B(12-15) B6E(12-15) B6F(12-15) B6E(12-15) B6F(12-15)
4429
4430 const __m512i rhs_mat_014589CD_70_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_70, (_MM_PERM_ENUM)221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7) B78(4-7) B79(4-7) B78(4-7) B79(4-7) B7C(4-7) B7D(4-7) B7C(4-7) B7D(4-7)
4431 const __m512i rhs_mat_2367ABEF_70_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_70, (_MM_PERM_ENUM)221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7) B7A(4-7) B7B(4-7) B7A(4-7) B7B(4-7) B7E(4-7) B7F(4-7) B7E(4-7) B7F(4-7)
4432
4433 const __m512i rhs_mat_014589CD_71_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_71, (_MM_PERM_ENUM)221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15) B78(12-15) B79(12-15) B78(12-15) B79(12-15) B7C(12-15) B7D(12-15) B7C(12-15) B7D(12-15)
4434 const __m512i rhs_mat_2367ABEF_71_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_71, (_MM_PERM_ENUM)221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15) B7A(12-15) B7B(12-15) B7A(12-15) B7B(12-15) B7E(12-15) B7F(12-15) B7E(12-15) B7F(12-15)
4435
4436                        // notation: superblock subblock
4437                        // s00 m00 s01 m01 s10 m10 s11 m11 s20 m20 s21 m21 s30 m30 s31 m31 s40 m40 s41 m41 s50 m50 s51 m51 s60 m60 s61 m61 s70 m70 s71 m71
4438
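                        // Each scales byte packs the 4-bit sub block scale in the lower nibble and the 4-bit min in the upper nibble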
4439 const __m128i mins_and_scales_01_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + sb * 64));
4440 const __m128i mins_and_scales_23_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 16 + sb * 64));
4441 const __m128i mins_and_scales_45_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 32 + sb * 64));
4442 const __m128i mins_and_scales_67_0 = _mm_loadu_si128((const __m128i *)(b_ptr_0[b].scales + 48 + sb * 64));
4443
4444 const __m128i mins_and_scales_01_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + sb * 64));
4445 const __m128i mins_and_scales_23_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 16 + sb * 64));
4446 const __m128i mins_and_scales_45_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 32 + sb * 64));
4447 const __m128i mins_and_scales_67_1 = _mm_loadu_si128((const __m128i *)(b_ptr_1[b].scales + 48 + sb * 64));
4448
4449 // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
4450 const __m256i mins_and_scales_01 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_01_0), mins_and_scales_01_1, 1);
4451 const __m256i mins_and_scales_23 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_23_0), mins_and_scales_23_1, 1);
4452 const __m256i mins_and_scales_45 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_45_0), mins_and_scales_45_1, 1);
4453 const __m256i mins_and_scales_67 = _mm256_insertf128_si256(_mm256_castsi128_si256(mins_and_scales_67_0), mins_and_scales_67_1, 1);
4454
4455                        // Extract the scales, which are stored in the lower nibble of each mins_and_scales byte
4456 const __m256i scales_01 = _mm256_and_si256(mins_and_scales_01, m4b);
4457 const __m256i scales_23 = _mm256_and_si256(mins_and_scales_23, m4b);
4458 const __m256i scales_45 = _mm256_and_si256(mins_and_scales_45, m4b);
4459 const __m256i scales_67 = _mm256_and_si256(mins_and_scales_67, m4b);
4460
4461                        // Extract the mins, which are stored in the upper nibble of each mins_and_scales byte
4462 const __m512i mins_01 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_01, 4), m4b));
4463 const __m512i mins_23 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_23, 4), m4b));
4464 const __m512i mins_45 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_45, 4), m4b));
4465 const __m512i mins_67 = _mm512_cvtepu8_epi16(_mm256_and_si256(_mm256_srli_epi16(mins_and_scales_67, 4), m4b));
4466
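                        // Widen the scales to 16-bit integers (scalesmask1/scalesmask2 gather the per sub block scales)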
4467 const __m512i scales_0 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_01, scalesmask1));
4468 const __m512i scales_1 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_01, scalesmask2));
4469 const __m512i scales_2 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_23, scalesmask1));
4470 const __m512i scales_3 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_23, scalesmask2));
4471 const __m512i scales_4 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_45, scalesmask1));
4472 const __m512i scales_5 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_45, scalesmask2));
4473 const __m512i scales_6 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_67, scalesmask1));
4474 const __m512i scales_7 = _mm512_cvtepu8_epi16(_mm256_shuffle_epi8(scales_67, scalesmask2));
4475
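                        // Duplicate the dword pairs of each scale vector so the scales line up with the 014589CD and 2367ABEF column orderings used below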
4476 const __m512i scale_014589CD_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)68);
4477 const __m512i scale_2367ABEF_0 = _mm512_shuffle_epi32(scales_0, (_MM_PERM_ENUM)238);
4478
4479 const __m512i scale_014589CD_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)68);
4480 const __m512i scale_2367ABEF_1 = _mm512_shuffle_epi32(scales_1, (_MM_PERM_ENUM)238);
4481
4482 const __m512i scale_014589CD_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)68);
4483 const __m512i scale_2367ABEF_2 = _mm512_shuffle_epi32(scales_2, (_MM_PERM_ENUM)238);
4484
4485 const __m512i scale_014589CD_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)68);
4486 const __m512i scale_2367ABEF_3 = _mm512_shuffle_epi32(scales_3, (_MM_PERM_ENUM)238);
4487
4488 const __m512i scale_014589CD_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)68);
4489 const __m512i scale_2367ABEF_4 = _mm512_shuffle_epi32(scales_4, (_MM_PERM_ENUM)238);
4490
4491 const __m512i scale_014589CD_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)68);
4492 const __m512i scale_2367ABEF_5 = _mm512_shuffle_epi32(scales_5, (_MM_PERM_ENUM)238);
4493
4494 const __m512i scale_014589CD_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)68);
4495 const __m512i scale_2367ABEF_6 = _mm512_shuffle_epi32(scales_6, (_MM_PERM_ENUM)238);
4496
4497 const __m512i scale_014589CD_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)68);
4498 const __m512i scale_2367ABEF_7 = _mm512_shuffle_epi32(scales_7, (_MM_PERM_ENUM)238);
4499
4500                        // Load the quantized values of the four interleaved block_q8_K rows - A0,A1,A2,A3 - in chunks of eight bytes
4501                        // Each 256-bit load is split into its two 128-bit halves, and each half is duplicated across a 256-bit vector
4502 __m256i lhs_mat_ymm_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 512 * sb)));
4503 __m256i lhs_mat_ymm_01_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 0);
4504 __m256i lhs_mat_ymm_23_00 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_00, lhs_mat_ymm_0123_00, 17);
4505 __m256i lhs_mat_ymm_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 32 + 512 * sb)));
4506 __m256i lhs_mat_ymm_01_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 0);
4507 __m256i lhs_mat_ymm_23_01 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_01, lhs_mat_ymm_0123_01, 17);
4508 __m256i lhs_mat_ymm_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 64 + 512 * sb)));
4509 __m256i lhs_mat_ymm_01_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 0);
4510 __m256i lhs_mat_ymm_23_10 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_10, lhs_mat_ymm_0123_10, 17);
4511 __m256i lhs_mat_ymm_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 96 + 512 * sb)));
4512 __m256i lhs_mat_ymm_01_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 0);
4513 __m256i lhs_mat_ymm_23_11 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_11, lhs_mat_ymm_0123_11, 17);
4514 __m256i lhs_mat_ymm_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 128 + 512 * sb)));
4515 __m256i lhs_mat_ymm_01_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 0);
4516 __m256i lhs_mat_ymm_23_20 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_20, lhs_mat_ymm_0123_20, 17);
4517 __m256i lhs_mat_ymm_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 160 + 512 * sb)));
4518 __m256i lhs_mat_ymm_01_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 0);
4519 __m256i lhs_mat_ymm_23_21 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_21, lhs_mat_ymm_0123_21, 17);
4520 __m256i lhs_mat_ymm_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 192 + 512 * sb)));
4521 __m256i lhs_mat_ymm_01_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 0);
4522 __m256i lhs_mat_ymm_23_30 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_30, lhs_mat_ymm_0123_30, 17);
4523 __m256i lhs_mat_ymm_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 224 + 512 * sb)));
4524 __m256i lhs_mat_ymm_01_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 0);
4525 __m256i lhs_mat_ymm_23_31 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_31, lhs_mat_ymm_0123_31, 17);
4526
4527 __m256i lhs_mat_ymm_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 256 + 512 * sb)));
4528 __m256i lhs_mat_ymm_01_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 0);
4529 __m256i lhs_mat_ymm_23_40 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_40, lhs_mat_ymm_0123_40, 17);
4530 __m256i lhs_mat_ymm_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 288 + 512 * sb)));
4531 __m256i lhs_mat_ymm_01_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 0);
4532 __m256i lhs_mat_ymm_23_41 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_41, lhs_mat_ymm_0123_41, 17);
4533 __m256i lhs_mat_ymm_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 320 + 512 * sb)));
4534 __m256i lhs_mat_ymm_01_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 0);
4535 __m256i lhs_mat_ymm_23_50 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_50, lhs_mat_ymm_0123_50, 17);
4536 __m256i lhs_mat_ymm_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 352 + 512 * sb)));
4537 __m256i lhs_mat_ymm_01_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 0);
4538 __m256i lhs_mat_ymm_23_51 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_51, lhs_mat_ymm_0123_51, 17);
4539 __m256i lhs_mat_ymm_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 384 + 512 * sb)));
4540 __m256i lhs_mat_ymm_01_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 0);
4541 __m256i lhs_mat_ymm_23_60 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_60, lhs_mat_ymm_0123_60, 17);
4542 __m256i lhs_mat_ymm_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 416 + 512 * sb)));
4543 __m256i lhs_mat_ymm_01_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 0);
4544 __m256i lhs_mat_ymm_23_61 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_61, lhs_mat_ymm_0123_61, 17);
4545 __m256i lhs_mat_ymm_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 448 + 512 * sb)));
4546 __m256i lhs_mat_ymm_01_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 0);
4547 __m256i lhs_mat_ymm_23_70 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_70, lhs_mat_ymm_0123_70, 17);
4548 __m256i lhs_mat_ymm_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptr[b].qs + 480 + 512 * sb)));
4549 __m256i lhs_mat_ymm_01_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 0);
4550 __m256i lhs_mat_ymm_23_71 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_71, lhs_mat_ymm_0123_71, 17);
4551
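                        // Broadcast each 256-bit LHS vector into both halves of a 512-bit vector so the four rows are multiplied against all sixteen columns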
4552 __m512i lhs_mat_01_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_00), lhs_mat_ymm_01_00, 1);
4553 __m512i lhs_mat_23_00 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_00), lhs_mat_ymm_23_00, 1);
4554 __m512i lhs_mat_01_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_01), lhs_mat_ymm_01_01, 1);
4555 __m512i lhs_mat_23_01 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_01), lhs_mat_ymm_23_01, 1);
4556
4557 __m512i lhs_mat_01_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_10), lhs_mat_ymm_01_10, 1);
4558 __m512i lhs_mat_23_10 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_10), lhs_mat_ymm_23_10, 1);
4559 __m512i lhs_mat_01_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_11), lhs_mat_ymm_01_11, 1);
4560 __m512i lhs_mat_23_11 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_11), lhs_mat_ymm_23_11, 1);
4561
4562 __m512i lhs_mat_01_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_20), lhs_mat_ymm_01_20, 1);
4563 __m512i lhs_mat_23_20 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_20), lhs_mat_ymm_23_20, 1);
4564 __m512i lhs_mat_01_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_21), lhs_mat_ymm_01_21, 1);
4565 __m512i lhs_mat_23_21 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_21), lhs_mat_ymm_23_21, 1);
4566
4567 __m512i lhs_mat_01_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_30), lhs_mat_ymm_01_30, 1);
4568 __m512i lhs_mat_23_30 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_30), lhs_mat_ymm_23_30, 1);
4569 __m512i lhs_mat_01_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_31), lhs_mat_ymm_01_31, 1);
4570 __m512i lhs_mat_23_31 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_31), lhs_mat_ymm_23_31, 1);
4571
4572 __m512i lhs_mat_01_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_40), lhs_mat_ymm_01_40, 1);
4573 __m512i lhs_mat_23_40 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_40), lhs_mat_ymm_23_40, 1);
4574 __m512i lhs_mat_01_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_41), lhs_mat_ymm_01_41, 1);
4575 __m512i lhs_mat_23_41 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_41), lhs_mat_ymm_23_41, 1);
4576
4577 __m512i lhs_mat_01_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_50), lhs_mat_ymm_01_50, 1);
4578 __m512i lhs_mat_23_50 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_50), lhs_mat_ymm_23_50, 1);
4579 __m512i lhs_mat_01_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_51), lhs_mat_ymm_01_51, 1);
4580 __m512i lhs_mat_23_51 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_51), lhs_mat_ymm_23_51, 1);
4581
4582 __m512i lhs_mat_01_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_60), lhs_mat_ymm_01_60, 1);
4583 __m512i lhs_mat_23_60 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_60), lhs_mat_ymm_23_60, 1);
4584 __m512i lhs_mat_01_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_61), lhs_mat_ymm_01_61, 1);
4585 __m512i lhs_mat_23_61 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_61), lhs_mat_ymm_23_61, 1);
4586
4587 __m512i lhs_mat_01_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_70), lhs_mat_ymm_01_70, 1);
4588 __m512i lhs_mat_23_70 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_70), lhs_mat_ymm_23_70, 1);
4589 __m512i lhs_mat_01_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_71), lhs_mat_ymm_01_71, 1);
4590 __m512i lhs_mat_23_71 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_71), lhs_mat_ymm_23_71, 1);
4591
4592 // Bsums are loaded for the different Q8_K blocks
4593 __m128i lhs_raw_bsums_01_0123 = _mm_loadu_si128((const __m128i *)((a_ptr[b].bsums + 32 * sb)));
4594 __m128i lhs_raw_bsums_23_0123 = _mm_loadu_si128((const __m128i *)(a_ptr[b].bsums + 8 + 32 * sb));
4595 __m128i lhs_raw_bsums_01_4567 = _mm_loadu_si128((const __m128i *)((a_ptr[b].bsums + 16 + 32 * sb)));
4596 __m128i lhs_raw_bsums_23_4567 = _mm_loadu_si128((const __m128i *)(a_ptr[b].bsums + 24 + 32 * sb));
4597
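                        // Replicate each 128-bit bsum vector across the full 512-bit width so it can be combined with the mins of all sixteen columns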
4598 __m256i lhs_bsums_ymm_01_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_0123), lhs_raw_bsums_01_0123, 1);
4599 __m512i lhs_bsums_01_0123 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_01_0123), lhs_bsums_ymm_01_0123, 1);
4600 __m256i lhs_bsums_ymm_23_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_0123), lhs_raw_bsums_23_0123, 1);
4601 __m512i lhs_bsums_23_0123 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_23_0123), lhs_bsums_ymm_23_0123, 1);
4602 __m256i lhs_bsums_ymm_01_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_4567), lhs_raw_bsums_01_4567, 1);
4603 __m512i lhs_bsums_01_4567 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_01_4567), lhs_bsums_ymm_01_4567, 1);
4604 __m256i lhs_bsums_ymm_23_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_4567), lhs_raw_bsums_23_4567, 1);
4605 __m512i lhs_bsums_23_4567 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_bsums_ymm_23_4567), lhs_bsums_ymm_23_4567, 1);
4606
4607 // Shuffle pattern one - left side input
4608 const __m512i lhs_mat_01_00_sp1 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
4609 const __m512i lhs_mat_23_00_sp1 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)160); //A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3) A02(0-3) A02(0-3) A03(0-3) A03(0-3)
4610
4611 const __m512i lhs_mat_01_01_sp1 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
4612 const __m512i lhs_mat_23_01_sp1 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)160); //A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11) A02(8-11) A02(8-11) A03(8-11) A03(8-11)
4613
4614 const __m512i lhs_mat_01_10_sp1 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
4615 const __m512i lhs_mat_23_10_sp1 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)160); //A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3) A12(0-3) A12(0-3) A13(0-3) A13(0-3)
4616
4617 const __m512i lhs_mat_01_11_sp1 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
4618 const __m512i lhs_mat_23_11_sp1 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)160); //A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11) A12(8-11) A12(8-11) A13(8-11) A13(8-11)
4619
4620 const __m512i lhs_mat_01_20_sp1 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3)
4621 const __m512i lhs_mat_23_20_sp1 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)160); //A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3) A22(0-3) A22(0-3) A23(0-3) A23(0-3)
4622
4623 const __m512i lhs_mat_01_21_sp1 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11)
4624 const __m512i lhs_mat_23_21_sp1 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)160); //A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11) A22(8-11) A22(8-11) A23(8-11) A23(8-11)
4625
4626 const __m512i lhs_mat_01_30_sp1 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3)
4627 const __m512i lhs_mat_23_30_sp1 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)160); //A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3) A32(0-3) A32(0-3) A33(0-3) A33(0-3)
4628
4629 const __m512i lhs_mat_01_31_sp1 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11)
4630 const __m512i lhs_mat_23_31_sp1 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)160); //A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11) A32(8-11) A32(8-11) A33(8-11) A33(8-11)
4631
4632 const __m512i lhs_mat_01_40_sp1 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3)
4633 const __m512i lhs_mat_23_40_sp1 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)160); //A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3) A42(0-3) A42(0-3) A43(0-3) A43(0-3)
4634
4635 const __m512i lhs_mat_01_41_sp1 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11)
4636 const __m512i lhs_mat_23_41_sp1 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)160); //A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11) A42(8-11) A42(8-11) A43(8-11) A43(8-11)
4637
4638 const __m512i lhs_mat_01_50_sp1 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3)
4639 const __m512i lhs_mat_23_50_sp1 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)160); //A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3) A52(0-3) A52(0-3) A53(0-3) A53(0-3)
4640
4641 const __m512i lhs_mat_01_51_sp1 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11)
4642 const __m512i lhs_mat_23_51_sp1 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)160); //A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11) A52(8-11) A52(8-11) A53(8-11) A53(8-11)
4643
4644 const __m512i lhs_mat_01_60_sp1 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3)
4645 const __m512i lhs_mat_23_60_sp1 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)160); //A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3) A62(0-3) A62(0-3) A63(0-3) A63(0-3)
4646
4647 const __m512i lhs_mat_01_61_sp1 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11)
4648 const __m512i lhs_mat_23_61_sp1 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)160); //A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11) A62(8-11) A62(8-11) A63(8-11) A63(8-11)
4649
4650 const __m512i lhs_mat_01_70_sp1 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3)
4651 const __m512i lhs_mat_23_70_sp1 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)160); //A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3) A72(0-3) A72(0-3) A73(0-3) A73(0-3)
4652
4653 const __m512i lhs_mat_01_71_sp1 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11)
4654 const __m512i lhs_mat_23_71_sp1 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)160); //A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11) A72(8-11) A72(8-11) A73(8-11) A73(8-11)
4655
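                        // Shuffle pattern two - left side input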
4656 const __m512i lhs_mat_01_00_sp2 = _mm512_shuffle_epi32(lhs_mat_01_00, (_MM_PERM_ENUM)245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
4657 const __m512i lhs_mat_23_00_sp2 = _mm512_shuffle_epi32(lhs_mat_23_00, (_MM_PERM_ENUM)245); //A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7) A02(4-7) A02(4-7) A03(4-7) A03(4-7)
4658
4659 const __m512i lhs_mat_01_01_sp2 = _mm512_shuffle_epi32(lhs_mat_01_01, (_MM_PERM_ENUM)245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
4660 const __m512i lhs_mat_23_01_sp2 = _mm512_shuffle_epi32(lhs_mat_23_01, (_MM_PERM_ENUM)245); //A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15) A02(12-15) A02(12-15) A03(12-15) A03(12-15)
4661
4662 const __m512i lhs_mat_01_10_sp2 = _mm512_shuffle_epi32(lhs_mat_01_10, (_MM_PERM_ENUM)245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
4663 const __m512i lhs_mat_23_10_sp2 = _mm512_shuffle_epi32(lhs_mat_23_10, (_MM_PERM_ENUM)245); //A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7) A12(4-7) A12(4-7) A13(4-7) A13(4-7)
4664
4665 const __m512i lhs_mat_01_11_sp2 = _mm512_shuffle_epi32(lhs_mat_01_11, (_MM_PERM_ENUM)245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
4666 const __m512i lhs_mat_23_11_sp2 = _mm512_shuffle_epi32(lhs_mat_23_11, (_MM_PERM_ENUM)245); //A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15) A12(12-15) A12(12-15) A13(12-15) A13(12-15)
4667
4668 const __m512i lhs_mat_01_20_sp2 = _mm512_shuffle_epi32(lhs_mat_01_20, (_MM_PERM_ENUM)245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7)
4669 const __m512i lhs_mat_23_20_sp2 = _mm512_shuffle_epi32(lhs_mat_23_20, (_MM_PERM_ENUM)245); //A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7) A22(4-7) A22(4-7) A23(4-7) A23(4-7)
4670
4671 const __m512i lhs_mat_01_21_sp2 = _mm512_shuffle_epi32(lhs_mat_01_21, (_MM_PERM_ENUM)245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15)
4672 const __m512i lhs_mat_23_21_sp2 = _mm512_shuffle_epi32(lhs_mat_23_21, (_MM_PERM_ENUM)245); //A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15) A22(12-15) A22(12-15) A23(12-15) A23(12-15)
4673
4674 const __m512i lhs_mat_01_30_sp2 = _mm512_shuffle_epi32(lhs_mat_01_30, (_MM_PERM_ENUM)245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7)
4675 const __m512i lhs_mat_23_30_sp2 = _mm512_shuffle_epi32(lhs_mat_23_30, (_MM_PERM_ENUM)245); //A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7) A32(4-7) A32(4-7) A33(4-7) A33(4-7)
4676
4677 const __m512i lhs_mat_01_31_sp2 = _mm512_shuffle_epi32(lhs_mat_01_31, (_MM_PERM_ENUM)245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15)
4678 const __m512i lhs_mat_23_31_sp2 = _mm512_shuffle_epi32(lhs_mat_23_31, (_MM_PERM_ENUM)245); //A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15) A32(12-15) A32(12-15) A33(12-15) A33(12-15)
4679
4680 const __m512i lhs_mat_01_40_sp2 = _mm512_shuffle_epi32(lhs_mat_01_40, (_MM_PERM_ENUM)245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7)
4681 const __m512i lhs_mat_23_40_sp2 = _mm512_shuffle_epi32(lhs_mat_23_40, (_MM_PERM_ENUM)245); //A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7) A42(4-7) A42(4-7) A43(4-7) A43(4-7)
4682
4683 const __m512i lhs_mat_01_41_sp2 = _mm512_shuffle_epi32(lhs_mat_01_41, (_MM_PERM_ENUM)245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15)
4684 const __m512i lhs_mat_23_41_sp2 = _mm512_shuffle_epi32(lhs_mat_23_41, (_MM_PERM_ENUM)245); //A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15) A42(12-15) A42(12-15) A43(12-15) A43(12-15)
4685
4686 const __m512i lhs_mat_01_50_sp2 = _mm512_shuffle_epi32(lhs_mat_01_50, (_MM_PERM_ENUM)245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7)
4687 const __m512i lhs_mat_23_50_sp2 = _mm512_shuffle_epi32(lhs_mat_23_50, (_MM_PERM_ENUM)245); //A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7) A52(4-7) A52(4-7) A53(4-7) A53(4-7)
4688
4689 const __m512i lhs_mat_01_51_sp2 = _mm512_shuffle_epi32(lhs_mat_01_51, (_MM_PERM_ENUM)245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15)
4690 const __m512i lhs_mat_23_51_sp2 = _mm512_shuffle_epi32(lhs_mat_23_51, (_MM_PERM_ENUM)245); //A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15) A52(12-15) A52(12-15) A53(12-15) A53(12-15)
4691
4692 const __m512i lhs_mat_01_60_sp2 = _mm512_shuffle_epi32(lhs_mat_01_60, (_MM_PERM_ENUM)245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7)
4693 const __m512i lhs_mat_23_60_sp2 = _mm512_shuffle_epi32(lhs_mat_23_60, (_MM_PERM_ENUM)245); //A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7) A62(4-7) A62(4-7) A63(4-7) A63(4-7)
4694
4695 const __m512i lhs_mat_01_61_sp2 = _mm512_shuffle_epi32(lhs_mat_01_61, (_MM_PERM_ENUM)245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15)
4696 const __m512i lhs_mat_23_61_sp2 = _mm512_shuffle_epi32(lhs_mat_23_61, (_MM_PERM_ENUM)245); //A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15) A62(12-15) A62(12-15) A63(12-15) A63(12-15)
4697
4698 const __m512i lhs_mat_01_70_sp2 = _mm512_shuffle_epi32(lhs_mat_01_70, (_MM_PERM_ENUM)245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7)
4699 const __m512i lhs_mat_23_70_sp2 = _mm512_shuffle_epi32(lhs_mat_23_70, (_MM_PERM_ENUM)245); //A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7) A72(4-7) A72(4-7) A73(4-7) A73(4-7)
4700
4701 const __m512i lhs_mat_01_71_sp2 = _mm512_shuffle_epi32(lhs_mat_01_71, (_MM_PERM_ENUM)245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15)
4702 const __m512i lhs_mat_23_71_sp2 = _mm512_shuffle_epi32(lhs_mat_23_71, (_MM_PERM_ENUM)245); //A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15) A72(12-15) A72(12-15) A73(12-15) A73(12-15)
4703
            // The values arranged in shuffle patterns are dot-multiplied within each 32 bit lane: corresponding unsigned and signed bytes are multiplied and the adjacent products are accumulated into 16 bit integers within the lane
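            // Rough scalar sketch of what one maddubs/add pair computes per 32 bit lane (illustrative
            // only; u = unsigned 2 bit rhs bytes, s = signed int8 lhs bytes):
            //   t[0] = (int16_t)(u[0]*s[0] + u[1]*s[1]);   // _mm512_maddubs_epi16 (saturating)
            //   t[1] = (int16_t)(u[2]*s[2] + u[3]*s[3]);
            //   acc16[k] = t_lo[k] + t_hi[k];              // _mm512_add_epi16 over the *_x0 / *_x1 halves of the sub block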
4705 __m512i iacc_mat_00_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_01_00_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_01_01_sp1));
4706 __m512i iacc_mat_01_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_01_00_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_01_01_sp1));
4707
4708 __m512i iacc_mat_10_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp1, lhs_mat_23_00_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp1, lhs_mat_23_01_sp1));
4709 __m512i iacc_mat_11_0_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp1, lhs_mat_23_00_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp1, lhs_mat_23_01_sp1));
4710
4711 __m512i iacc_mat_00_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_01_10_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_01_11_sp1));
4712 __m512i iacc_mat_01_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_01_10_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_01_11_sp1));
4713
4714 __m512i iacc_mat_10_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp1, lhs_mat_23_10_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp1, lhs_mat_23_11_sp1));
4715 __m512i iacc_mat_11_1_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp1, lhs_mat_23_10_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp1, lhs_mat_23_11_sp1));
4716
4717 __m512i iacc_mat_00_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_01_20_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_01_21_sp1));
4718 __m512i iacc_mat_01_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_01_20_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_01_21_sp1));
4719
4720 __m512i iacc_mat_10_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp1, lhs_mat_23_20_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp1, lhs_mat_23_21_sp1));
4721 __m512i iacc_mat_11_2_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp1, lhs_mat_23_20_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp1, lhs_mat_23_21_sp1));
4722
4723 __m512i iacc_mat_00_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_01_30_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_01_31_sp1));
4724 __m512i iacc_mat_01_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_01_30_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_01_31_sp1));
4725
4726 __m512i iacc_mat_10_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp1, lhs_mat_23_30_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp1, lhs_mat_23_31_sp1));
4727 __m512i iacc_mat_11_3_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp1, lhs_mat_23_30_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp1, lhs_mat_23_31_sp1));
4728
4729 __m512i iacc_mat_00_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_01_40_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_01_41_sp1));
4730 __m512i iacc_mat_01_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_01_40_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_01_41_sp1));
4731
4732 __m512i iacc_mat_10_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp1, lhs_mat_23_40_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp1, lhs_mat_23_41_sp1));
4733 __m512i iacc_mat_11_4_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp1, lhs_mat_23_40_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp1, lhs_mat_23_41_sp1));
4734
4735 __m512i iacc_mat_00_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_01_50_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_01_51_sp1));
4736 __m512i iacc_mat_01_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_01_50_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_01_51_sp1));
4737
4738 __m512i iacc_mat_10_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp1, lhs_mat_23_50_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp1, lhs_mat_23_51_sp1));
4739 __m512i iacc_mat_11_5_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp1, lhs_mat_23_50_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp1, lhs_mat_23_51_sp1));
4740
4741 __m512i iacc_mat_00_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_01_60_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_01_61_sp1));
4742 __m512i iacc_mat_01_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_01_60_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_01_61_sp1));
4743
4744 __m512i iacc_mat_10_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp1, lhs_mat_23_60_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp1, lhs_mat_23_61_sp1));
4745 __m512i iacc_mat_11_6_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp1, lhs_mat_23_60_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp1, lhs_mat_23_61_sp1));
4746
4747 __m512i iacc_mat_00_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_01_70_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_01_71_sp1));
4748 __m512i iacc_mat_01_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_01_70_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_01_71_sp1));
4749
4750 __m512i iacc_mat_10_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp1, lhs_mat_23_70_sp1),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp1, lhs_mat_23_71_sp1));
4751 __m512i iacc_mat_11_7_sp1 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp1, lhs_mat_23_70_sp1),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp1, lhs_mat_23_71_sp1));
4752
4753
4754 __m512i iacc_mat_00_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_01_00_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_01_01_sp2));
4755 __m512i iacc_mat_01_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_01_00_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_01_01_sp2));
4756
4757 __m512i iacc_mat_10_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_00_sp2, lhs_mat_23_00_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_01_sp2, lhs_mat_23_01_sp2));
4758 __m512i iacc_mat_11_0_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_00_sp2, lhs_mat_23_00_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_01_sp2, lhs_mat_23_01_sp2));
4759
4760 __m512i iacc_mat_00_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_01_10_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_01_11_sp2));
4761 __m512i iacc_mat_01_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_01_10_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_01_11_sp2));
4762
4763 __m512i iacc_mat_10_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_10_sp2, lhs_mat_23_10_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_11_sp2, lhs_mat_23_11_sp2));
4764 __m512i iacc_mat_11_1_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_10_sp2, lhs_mat_23_10_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_11_sp2, lhs_mat_23_11_sp2));
4765
4766 __m512i iacc_mat_00_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_01_20_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_01_21_sp2));
4767 __m512i iacc_mat_01_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_01_20_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_01_21_sp2));
4768
4769 __m512i iacc_mat_10_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_20_sp2, lhs_mat_23_20_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_21_sp2, lhs_mat_23_21_sp2));
4770 __m512i iacc_mat_11_2_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_20_sp2, lhs_mat_23_20_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_21_sp2, lhs_mat_23_21_sp2));
4771
4772 __m512i iacc_mat_00_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_01_30_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_01_31_sp2));
4773 __m512i iacc_mat_01_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_01_30_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_01_31_sp2));
4774
4775 __m512i iacc_mat_10_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_30_sp2, lhs_mat_23_30_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_31_sp2, lhs_mat_23_31_sp2));
4776 __m512i iacc_mat_11_3_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_30_sp2, lhs_mat_23_30_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_31_sp2, lhs_mat_23_31_sp2));
4777
4778 __m512i iacc_mat_00_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_01_40_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_01_41_sp2));
4779 __m512i iacc_mat_01_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_01_40_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_01_41_sp2));
4780
4781 __m512i iacc_mat_10_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_40_sp2, lhs_mat_23_40_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_41_sp2, lhs_mat_23_41_sp2));
4782 __m512i iacc_mat_11_4_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_40_sp2, lhs_mat_23_40_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_41_sp2, lhs_mat_23_41_sp2));
4783
4784 __m512i iacc_mat_00_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_01_50_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_01_51_sp2));
4785 __m512i iacc_mat_01_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_01_50_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_01_51_sp2));
4786
4787 __m512i iacc_mat_10_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_50_sp2, lhs_mat_23_50_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_51_sp2, lhs_mat_23_51_sp2));
4788 __m512i iacc_mat_11_5_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_50_sp2, lhs_mat_23_50_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_51_sp2, lhs_mat_23_51_sp2));
4789
4790 __m512i iacc_mat_00_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_01_60_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_01_61_sp2));
4791 __m512i iacc_mat_01_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_01_60_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_01_61_sp2));
4792
4793 __m512i iacc_mat_10_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_60_sp2, lhs_mat_23_60_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_61_sp2, lhs_mat_23_61_sp2));
4794 __m512i iacc_mat_11_6_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_60_sp2, lhs_mat_23_60_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_61_sp2, lhs_mat_23_61_sp2));
4795
4796 __m512i iacc_mat_00_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_01_70_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_01_71_sp2));
4797 __m512i iacc_mat_01_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_01_70_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_01_71_sp2));
4798
4799 __m512i iacc_mat_10_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_014589CD_70_sp2, lhs_mat_23_70_sp2),_mm512_maddubs_epi16(rhs_mat_014589CD_71_sp2, lhs_mat_23_71_sp2));
4800 __m512i iacc_mat_11_7_sp2 = _mm512_add_epi16(_mm512_maddubs_epi16(rhs_mat_2367ABEF_70_sp2, lhs_mat_23_70_sp2),_mm512_maddubs_epi16(rhs_mat_2367ABEF_71_sp2, lhs_mat_23_71_sp2));
4801
4802 // Combine results from both shuffle patterns for each output block
4803 __m512i iacc_mat_00_0 = _mm512_add_epi16(iacc_mat_00_0_sp1, iacc_mat_00_0_sp2);
4804 __m512i iacc_mat_01_0 = _mm512_add_epi16(iacc_mat_01_0_sp1, iacc_mat_01_0_sp2);
4805 __m512i iacc_mat_10_0 = _mm512_add_epi16(iacc_mat_10_0_sp1, iacc_mat_10_0_sp2);
4806 __m512i iacc_mat_11_0 = _mm512_add_epi16(iacc_mat_11_0_sp1, iacc_mat_11_0_sp2);
4807
4808 __m512i iacc_mat_00_1 = _mm512_add_epi16(iacc_mat_00_1_sp1, iacc_mat_00_1_sp2);
4809 __m512i iacc_mat_01_1 = _mm512_add_epi16(iacc_mat_01_1_sp1, iacc_mat_01_1_sp2);
4810 __m512i iacc_mat_10_1 = _mm512_add_epi16(iacc_mat_10_1_sp1, iacc_mat_10_1_sp2);
4811 __m512i iacc_mat_11_1 = _mm512_add_epi16(iacc_mat_11_1_sp1, iacc_mat_11_1_sp2);
4812
4813 __m512i iacc_mat_00_2 = _mm512_add_epi16(iacc_mat_00_2_sp1, iacc_mat_00_2_sp2);
4814 __m512i iacc_mat_01_2 = _mm512_add_epi16(iacc_mat_01_2_sp1, iacc_mat_01_2_sp2);
4815 __m512i iacc_mat_10_2 = _mm512_add_epi16(iacc_mat_10_2_sp1, iacc_mat_10_2_sp2);
4816 __m512i iacc_mat_11_2 = _mm512_add_epi16(iacc_mat_11_2_sp1, iacc_mat_11_2_sp2);
4817
4818 __m512i iacc_mat_00_3 = _mm512_add_epi16(iacc_mat_00_3_sp1, iacc_mat_00_3_sp2);
4819 __m512i iacc_mat_01_3 = _mm512_add_epi16(iacc_mat_01_3_sp1, iacc_mat_01_3_sp2);
4820 __m512i iacc_mat_10_3 = _mm512_add_epi16(iacc_mat_10_3_sp1, iacc_mat_10_3_sp2);
4821 __m512i iacc_mat_11_3 = _mm512_add_epi16(iacc_mat_11_3_sp1, iacc_mat_11_3_sp2);
4822
4823 __m512i iacc_mat_00_4 = _mm512_add_epi16(iacc_mat_00_4_sp1, iacc_mat_00_4_sp2);
4824 __m512i iacc_mat_01_4 = _mm512_add_epi16(iacc_mat_01_4_sp1, iacc_mat_01_4_sp2);
4825 __m512i iacc_mat_10_4 = _mm512_add_epi16(iacc_mat_10_4_sp1, iacc_mat_10_4_sp2);
4826 __m512i iacc_mat_11_4 = _mm512_add_epi16(iacc_mat_11_4_sp1, iacc_mat_11_4_sp2);
4827
4828 __m512i iacc_mat_00_5 = _mm512_add_epi16(iacc_mat_00_5_sp1, iacc_mat_00_5_sp2);
4829 __m512i iacc_mat_01_5 = _mm512_add_epi16(iacc_mat_01_5_sp1, iacc_mat_01_5_sp2);
4830 __m512i iacc_mat_10_5 = _mm512_add_epi16(iacc_mat_10_5_sp1, iacc_mat_10_5_sp2);
4831 __m512i iacc_mat_11_5 = _mm512_add_epi16(iacc_mat_11_5_sp1, iacc_mat_11_5_sp2);
4832
4833 __m512i iacc_mat_00_6 = _mm512_add_epi16(iacc_mat_00_6_sp1, iacc_mat_00_6_sp2);
4834 __m512i iacc_mat_01_6 = _mm512_add_epi16(iacc_mat_01_6_sp1, iacc_mat_01_6_sp2);
4835 __m512i iacc_mat_10_6 = _mm512_add_epi16(iacc_mat_10_6_sp1, iacc_mat_10_6_sp2);
4836 __m512i iacc_mat_11_6 = _mm512_add_epi16(iacc_mat_11_6_sp1, iacc_mat_11_6_sp2);
4837
4838 __m512i iacc_mat_00_7 = _mm512_add_epi16(iacc_mat_00_7_sp1, iacc_mat_00_7_sp2);
4839 __m512i iacc_mat_01_7 = _mm512_add_epi16(iacc_mat_01_7_sp1, iacc_mat_01_7_sp2);
4840 __m512i iacc_mat_10_7 = _mm512_add_epi16(iacc_mat_10_7_sp1, iacc_mat_10_7_sp2);
4841 __m512i iacc_mat_11_7 = _mm512_add_epi16(iacc_mat_11_7_sp1, iacc_mat_11_7_sp2);
4842
            // The combined outputs of both shuffle patterns now cover the dot products of all 32 values of each sub block; multiply them with the corresponding sub block scales (madd widens the products to 32 bit integers)
4844 iacc_mat_00_0 = _mm512_madd_epi16(iacc_mat_00_0, scale_014589CD_0);
4845 iacc_mat_01_0 = _mm512_madd_epi16(iacc_mat_01_0, scale_2367ABEF_0);
4846 iacc_mat_10_0 = _mm512_madd_epi16(iacc_mat_10_0, scale_014589CD_0);
4847 iacc_mat_11_0 = _mm512_madd_epi16(iacc_mat_11_0, scale_2367ABEF_0);
4848
4849 iacc_mat_00_1 = _mm512_madd_epi16(iacc_mat_00_1, scale_014589CD_1);
4850 iacc_mat_01_1 = _mm512_madd_epi16(iacc_mat_01_1, scale_2367ABEF_1);
4851 iacc_mat_10_1 = _mm512_madd_epi16(iacc_mat_10_1, scale_014589CD_1);
4852 iacc_mat_11_1 = _mm512_madd_epi16(iacc_mat_11_1, scale_2367ABEF_1);
4853
4854 iacc_mat_00_2 = _mm512_madd_epi16(iacc_mat_00_2, scale_014589CD_2);
4855 iacc_mat_01_2 = _mm512_madd_epi16(iacc_mat_01_2, scale_2367ABEF_2);
4856 iacc_mat_10_2 = _mm512_madd_epi16(iacc_mat_10_2, scale_014589CD_2);
4857 iacc_mat_11_2 = _mm512_madd_epi16(iacc_mat_11_2, scale_2367ABEF_2);
4858
4859 iacc_mat_00_3 = _mm512_madd_epi16(iacc_mat_00_3, scale_014589CD_3);
4860 iacc_mat_01_3 = _mm512_madd_epi16(iacc_mat_01_3, scale_2367ABEF_3);
4861 iacc_mat_10_3 = _mm512_madd_epi16(iacc_mat_10_3, scale_014589CD_3);
4862 iacc_mat_11_3 = _mm512_madd_epi16(iacc_mat_11_3, scale_2367ABEF_3);
4863
4864 iacc_mat_00_4 = _mm512_madd_epi16(iacc_mat_00_4, scale_014589CD_4);
4865 iacc_mat_01_4 = _mm512_madd_epi16(iacc_mat_01_4, scale_2367ABEF_4);
4866 iacc_mat_10_4 = _mm512_madd_epi16(iacc_mat_10_4, scale_014589CD_4);
4867 iacc_mat_11_4 = _mm512_madd_epi16(iacc_mat_11_4, scale_2367ABEF_4);
4868
4869 iacc_mat_00_5 = _mm512_madd_epi16(iacc_mat_00_5, scale_014589CD_5);
4870 iacc_mat_01_5 = _mm512_madd_epi16(iacc_mat_01_5, scale_2367ABEF_5);
4871 iacc_mat_10_5 = _mm512_madd_epi16(iacc_mat_10_5, scale_014589CD_5);
4872 iacc_mat_11_5 = _mm512_madd_epi16(iacc_mat_11_5, scale_2367ABEF_5);
4873
4874 iacc_mat_00_6 = _mm512_madd_epi16(iacc_mat_00_6, scale_014589CD_6);
4875 iacc_mat_01_6 = _mm512_madd_epi16(iacc_mat_01_6, scale_2367ABEF_6);
4876 iacc_mat_10_6 = _mm512_madd_epi16(iacc_mat_10_6, scale_014589CD_6);
4877 iacc_mat_11_6 = _mm512_madd_epi16(iacc_mat_11_6, scale_2367ABEF_6);
4878
4879 iacc_mat_00_7 = _mm512_madd_epi16(iacc_mat_00_7, scale_014589CD_7);
4880 iacc_mat_01_7 = _mm512_madd_epi16(iacc_mat_01_7, scale_2367ABEF_7);
4881 iacc_mat_10_7 = _mm512_madd_epi16(iacc_mat_10_7, scale_014589CD_7);
4882 iacc_mat_11_7 = _mm512_madd_epi16(iacc_mat_11_7, scale_2367ABEF_7);
4883
4884 __m512i iacc_mat_00 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_0, iacc_mat_00_1), _mm512_add_epi32(iacc_mat_00_2, iacc_mat_00_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_00_4, iacc_mat_00_5), _mm512_add_epi32(iacc_mat_00_6, iacc_mat_00_7)));
4885 __m512i iacc_mat_01 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_0, iacc_mat_01_1), _mm512_add_epi32(iacc_mat_01_2, iacc_mat_01_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_01_4, iacc_mat_01_5), _mm512_add_epi32(iacc_mat_01_6, iacc_mat_01_7)));
4886 __m512i iacc_mat_10 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_0, iacc_mat_10_1), _mm512_add_epi32(iacc_mat_10_2, iacc_mat_10_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_10_4, iacc_mat_10_5), _mm512_add_epi32(iacc_mat_10_6, iacc_mat_10_7)));
4887 __m512i iacc_mat_11 = _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_0, iacc_mat_11_1), _mm512_add_epi32(iacc_mat_11_2, iacc_mat_11_3)), _mm512_add_epi32(_mm512_add_epi32(iacc_mat_11_4, iacc_mat_11_5), _mm512_add_epi32(iacc_mat_11_6, iacc_mat_11_7)));
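            // Each 512 bit accumulator now holds the 32 bit sums for one pair of rows and one eight
            // column group, accumulated over the eight sub blocks of this iteration (sub block scales
            // already applied by the madd above).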
4888
4889 // Straighten out to make 4 row vectors
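            // The blend mask 0xCCCC keeps the lower two dwords of every group of four from the first
            // operand and takes the upper two from the second; imm 78 (_MM_SHUFFLE(1,0,3,2)) swaps the
            // 64 bit halves of each 128 bit lane, so the tile results are interleaved back into row order.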
4890 __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
4891 __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
4892 __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
4893 __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);
4894
            // Load the scale(d) values for all the 4 Q8_K blocks and repeat them across lanes
4896 const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
4897 const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
4898 const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
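            // The four row scales are replicated into every 128 bit lane, so the _mm512_shuffle_ps
            // calls below (imm 0/85/170/255 broadcast element 0/1/2/3 of each lane) produce a full
            // register holding the scale of row 0/1/2/3 respectively.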
4899
            // Multiply with appropriate scales and accumulate (for both d and dmin) below
4901 acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
4902 acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
4903 acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
4904 acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
4905
            // Take two bsums from two Q8_Ks at a time and multiply with the corresponding min values from each Q2_K
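            // Q2_K dequantization is w = d*scale*q - dmin*min, so each dot product needs a correction
            // term: the per sub block sums of the Q8_K activations (bsums) multiplied by the Q2_K mins
            // and scaled by dmin and the row scale. That term is accumulated in acc_min_rows and
            // subtracted from acc_rows at the final store.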
4907 __m512i iacc_row_min_0_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)0), mins_01);
4908 __m512i iacc_row_min_1_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)170), mins_01);
4909 __m512i iacc_row_min_2_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)0), mins_01);
4910 __m512i iacc_row_min_3_01 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)170), mins_01);
4911
4912 __m512i iacc_row_min_0_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)85), mins_23);
4913 __m512i iacc_row_min_1_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_0123, (_MM_PERM_ENUM)255), mins_23);
4914 __m512i iacc_row_min_2_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)85), mins_23);
4915 __m512i iacc_row_min_3_23 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_0123, (_MM_PERM_ENUM)255), mins_23);
4916
4917 __m512i iacc_row_min_0_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)0), mins_45);
4918 __m512i iacc_row_min_1_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)170), mins_45);
4919 __m512i iacc_row_min_2_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)0), mins_45);
4920 __m512i iacc_row_min_3_45 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)170), mins_45);
4921
4922 __m512i iacc_row_min_0_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)85), mins_67);
4923 __m512i iacc_row_min_1_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_01_4567, (_MM_PERM_ENUM)255), mins_67);
4924 __m512i iacc_row_min_2_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)85), mins_67);
4925 __m512i iacc_row_min_3_67 = _mm512_madd_epi16(_mm512_shuffle_epi32(lhs_bsums_23_4567, (_MM_PERM_ENUM)255), mins_67);
4926
4927 __m512i iacc_row_min_0 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_0_01, iacc_row_min_0_23), _mm512_add_epi32(iacc_row_min_0_45,iacc_row_min_0_67));
4928 __m512i iacc_row_min_1 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_1_01, iacc_row_min_1_23), _mm512_add_epi32(iacc_row_min_1_45,iacc_row_min_1_67));
4929 __m512i iacc_row_min_2 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_2_01, iacc_row_min_2_23), _mm512_add_epi32(iacc_row_min_2_45,iacc_row_min_2_67));
4930 __m512i iacc_row_min_3 = _mm512_add_epi32(_mm512_add_epi32(iacc_row_min_3_01, iacc_row_min_3_23), _mm512_add_epi32(iacc_row_min_3_45,iacc_row_min_3_67));
4931
4932 acc_min_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_0), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_min_rows[0]);
4933 acc_min_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_1), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_min_rows[1]);
4934 acc_min_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_2), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_min_rows[2]);
4935 acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
4936 }
4937 }
        // Store accumulated values
4939 for (int i = 0; i < 4; i++) {
4940 _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
4941 }
4942 }
4943 }
4944
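    // When anc != nc there are leftover column blocks; reset y so the 256 bit path below re-walks
    // all rows for the columns from xstart onwards (anc here is presumably the column count already
    // covered by the AVX512 path above).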
4945 if (anc != nc) {
4946 xstart = anc/8;
4947 y = 0;
4948 }
4949
4950#endif //AVX512F
4951
4952 // Take group of four block_q8_Kx4 structures at each pass of the loop and perform dot product operation
4953 for (; y < anr / 4; y += 4) {
4954
4955 const block_q8_Kx4 * a_ptrs[4];
4956
4957 a_ptrs[0] = a_ptr_start + (y * nb);
4958 for (int i = 0; i < 3; ++i) {
4959 a_ptrs[i + 1] = a_ptrs[i] + nb;
4960 }
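        // Each block_q8_Kx4 packs four rows, so the four pointers set up above cover the sixteen rows
        // that the sixteen accumulators in the x loop below correspond to.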
4961
        // Take group of eight block_q2_Kx8 structures at each pass of the loop and perform dot product operation
4963 for (int64_t x = xstart; x < nc / 8; x++) {
4964
4965 const block_q2_Kx8 * b_ptr = b_ptr_start + (x * b_nb);
4966
4967 // Master FP accumulators
4968 __m256 acc_rows[16];
4969 for (int i = 0; i < 16; i++) {
4970 acc_rows[i] = _mm256_setzero_ps();
4971 }
4972
4973 __m256 acc_min_rows[16];
4974 for (int i = 0; i < 16; i++) {
4975 acc_min_rows[i] = _mm256_setzero_ps();
4976 }
4977
4978 // For super block
4979 for (int64_t b = 0; b < nb; b++) {
                // Delta values - Load the eight scale values of block_q2_Kx8
4981 const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
4982
                // dmin values - Load the eight dmin values of block_q2_Kx8
4984 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);
4985
4986 // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration
4987 for (int sb = 0; sb < QK_K / 128; sb++) {
4988
                    // Load the quantized values of the eight block_q2_K structures for the eight sub blocks, interleaved with each other in chunks of eight bytes - B0,B1 .... B6,B7
                    const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + sb * 256));
                    const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32 + sb * 256));
                    const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64 + sb * 256));
                    const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96 + sb * 256));
                    const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 128 + sb * 256));
                    const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 160 + sb * 256));
                    const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 192 + sb * 256));
                    const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 224 + sb * 256));
4998
4999 // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
5000 //superblock sub block which part of sub block
5001 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
5002 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
5003
5004 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
5005 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
5006
5007 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
5008 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
5009
5010 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
5011 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
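                    // Blend imm 240 (0xF0) takes the lower four dwords from the first operand and the
                    // upper four from the second; requiredOrder (defined earlier) reorders the 128 bit
                    // halves so the rows end up grouped as B0B1B4B5 / B2B3B6B7 as noted above.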
5012
                    // 2-bit -> 8-bit
                    // First sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_00 = _mm256_and_si256(rhs_raw_mat_0145_0, m3b); //B00(0-7) B01(0-7) B04(0-7) B05(0-7)
                    const __m256i rhs_mat_2367_00 = _mm256_and_si256(rhs_raw_mat_2367_0, m3b); //B02(0-7) B03(0-7) B06(0-7) B07(0-7)

                    const __m256i rhs_mat_0145_01 = _mm256_and_si256(rhs_raw_mat_0145_1, m3b); //B00(8-15) B01(8-15) B04(8-15) B05(8-15)
                    const __m256i rhs_mat_2367_01 = _mm256_and_si256(rhs_raw_mat_2367_1, m3b); //B02(8-15) B03(8-15) B06(8-15) B07(8-15)

                    // Second sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_10 = _mm256_and_si256(rhs_raw_mat_0145_2, m3b); //B10(0-7) B11(0-7) B14(0-7) B15(0-7)
                    const __m256i rhs_mat_2367_10 = _mm256_and_si256(rhs_raw_mat_2367_2, m3b); //B12(0-7) B13(0-7) B16(0-7) B17(0-7)

                    const __m256i rhs_mat_0145_11 = _mm256_and_si256(rhs_raw_mat_0145_3, m3b); //B10(8-15) B11(8-15) B14(8-15) B15(8-15)
                    const __m256i rhs_mat_2367_11 = _mm256_and_si256(rhs_raw_mat_2367_3, m3b); //B12(8-15) B13(8-15) B16(8-15) B17(8-15)

                    // Third sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_20 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 2), m3b); //B20(0-7) B21(0-7) B24(0-7) B25(0-7)
                    const __m256i rhs_mat_2367_20 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 2), m3b); //B22(0-7) B23(0-7) B26(0-7) B27(0-7)

                    const __m256i rhs_mat_0145_21 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 2), m3b); //B20(8-15) B21(8-15) B24(8-15) B25(8-15)
                    const __m256i rhs_mat_2367_21 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 2), m3b); //B22(8-15) B23(8-15) B26(8-15) B27(8-15)

                    // Fourth sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_30 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 2), m3b); //B30(0-7) B31(0-7) B34(0-7) B35(0-7)
                    const __m256i rhs_mat_2367_30 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 2), m3b); //B32(0-7) B33(0-7) B36(0-7) B37(0-7)

                    const __m256i rhs_mat_0145_31 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 2), m3b); //B30(8-15) B31(8-15) B34(8-15) B35(8-15)
                    const __m256i rhs_mat_2367_31 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 2), m3b); //B32(8-15) B33(8-15) B36(8-15) B37(8-15)

                    // Fifth sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_40 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m3b); //B40(0-7) B41(0-7) B44(0-7) B45(0-7)
                    const __m256i rhs_mat_2367_40 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m3b); //B42(0-7) B43(0-7) B46(0-7) B47(0-7)

                    const __m256i rhs_mat_0145_41 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m3b); //B40(8-15) B41(8-15) B44(8-15) B45(8-15)
                    const __m256i rhs_mat_2367_41 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m3b); //B42(8-15) B43(8-15) B46(8-15) B47(8-15)

                    // Sixth sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_50 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 4), m3b); //B50(0-7) B51(0-7) B54(0-7) B55(0-7)
                    const __m256i rhs_mat_2367_50 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 4), m3b); //B52(0-7) B53(0-7) B56(0-7) B57(0-7)

                    const __m256i rhs_mat_0145_51 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 4), m3b); //B50(8-15) B51(8-15) B54(8-15) B55(8-15)
                    const __m256i rhs_mat_2367_51 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 4), m3b); //B52(8-15) B53(8-15) B56(8-15) B57(8-15)

                    // Seventh sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_60 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 6), m3b); //B60(0-7) B61(0-7) B64(0-7) B65(0-7)
                    const __m256i rhs_mat_2367_60 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 6), m3b); //B62(0-7) B63(0-7) B66(0-7) B67(0-7)

                    const __m256i rhs_mat_0145_61 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 6), m3b); //B60(8-15) B61(8-15) B64(8-15) B65(8-15)
                    const __m256i rhs_mat_2367_61 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 6), m3b); //B62(8-15) B63(8-15) B66(8-15) B67(8-15)

                    // Eighth sub block of the eight sub blocks processed in the iteration
                    const __m256i rhs_mat_0145_70 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_2, 6), m3b); //B70(0-7) B71(0-7) B74(0-7) B75(0-7)
                    const __m256i rhs_mat_2367_70 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_2, 6), m3b); //B72(0-7) B73(0-7) B76(0-7) B77(0-7)

                    const __m256i rhs_mat_0145_71 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_3, 6), m3b); //B70(8-15) B71(8-15) B74(8-15) B75(8-15)
                    const __m256i rhs_mat_2367_71 = _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_3, 6), m3b); //B72(8-15) B73(8-15) B76(8-15) B77(8-15)
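                    // Each qs byte packs four 2 bit weights whose bit planes belong to four different
                    // sub blocks; shifting by 0/2/4/6 and masking with m3b (presumably the per-byte 0x03
                    // mask) peels them apart. Rough scalar equivalent for one byte q:
                    //   w0 = q & 3;  w1 = (q >> 2) & 3;  w2 = (q >> 4) & 3;  w3 = (q >> 6) & 3;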
5069
5070 // Shuffle pattern one - right side input
5071 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)
5072 const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3)
5073
5074 const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11)
5075 const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11)
5076
5077 const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3)
5078 const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3)
5079
5080 const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11)
5081 const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11)
5082
5083 const __m256i rhs_mat_0145_20_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_20, 136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3)
5084 const __m256i rhs_mat_2367_20_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_20, 136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3)
5085
5086 const __m256i rhs_mat_0145_21_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_21, 136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11)
5087 const __m256i rhs_mat_2367_21_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_21, 136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11)
5088
5089 const __m256i rhs_mat_0145_30_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_30, 136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3)
5090 const __m256i rhs_mat_2367_30_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_30, 136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3)
5091
                    const __m256i rhs_mat_0145_31_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_31, 136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11)
5093 const __m256i rhs_mat_2367_31_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_31, 136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11)
5094
5095 const __m256i rhs_mat_0145_40_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_40, 136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3)
5096 const __m256i rhs_mat_2367_40_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_40, 136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3)
5097
5098 const __m256i rhs_mat_0145_41_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_41, 136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11)
5099 const __m256i rhs_mat_2367_41_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_41, 136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11)
5100
5101 const __m256i rhs_mat_0145_50_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_50, 136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3)
5102 const __m256i rhs_mat_2367_50_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_50, 136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3)
5103
5104 const __m256i rhs_mat_0145_51_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_51, 136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11)
5105 const __m256i rhs_mat_2367_51_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_51, 136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11)
5106
5107 const __m256i rhs_mat_0145_60_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_60, 136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3)
5108 const __m256i rhs_mat_2367_60_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_60, 136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3)
5109
5110 const __m256i rhs_mat_0145_61_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_61, 136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11)
5111 const __m256i rhs_mat_2367_61_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_61, 136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11)
5112
5113 const __m256i rhs_mat_0145_70_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_70, 136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3)
5114 const __m256i rhs_mat_2367_70_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_70, 136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3)
5115
5116 const __m256i rhs_mat_0145_71_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_71, 136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11)
5117 const __m256i rhs_mat_2367_71_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_71, 136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11)
5118
5119
5120 // Shuffle pattern two - right side input
5121 const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7)
5122 const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7)
5123
5124 const __m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15)
5125 const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15)
5126
5127 const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7)
5128 const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7)
5129
5130 const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15)
5131 const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15)
5132
5133 const __m256i rhs_mat_0145_20_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_20, 221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7)
5134 const __m256i rhs_mat_2367_20_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_20, 221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7)
5135
5136 const __m256i rhs_mat_0145_21_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_21, 221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15)
5137 const __m256i rhs_mat_2367_21_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_21, 221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15)
5138
5139 const __m256i rhs_mat_0145_30_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_30, 221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7)
5140 const __m256i rhs_mat_2367_30_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_30, 221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7)
5141
5142 const __m256i rhs_mat_0145_31_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_31, 221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15)
5143 const __m256i rhs_mat_2367_31_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_31, 221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15)
5144
5145 const __m256i rhs_mat_0145_40_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_40, 221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7)
5146 const __m256i rhs_mat_2367_40_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_40, 221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7)
5147
5148 const __m256i rhs_mat_0145_41_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_41, 221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15)
5149 const __m256i rhs_mat_2367_41_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_41, 221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15)
5150
5151 const __m256i rhs_mat_0145_50_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_50, 221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7)
5152 const __m256i rhs_mat_2367_50_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_50, 221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7)
5153
5154 const __m256i rhs_mat_0145_51_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_51, 221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15)
5155 const __m256i rhs_mat_2367_51_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_51, 221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15)
5156
5157 const __m256i rhs_mat_0145_60_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_60, 221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7)
5158 const __m256i rhs_mat_2367_60_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_60, 221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7)
5159
5160 const __m256i rhs_mat_0145_61_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_61, 221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15)
5161 const __m256i rhs_mat_2367_61_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_61, 221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15)
5162
5163 const __m256i rhs_mat_0145_70_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_70, 221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7)
5164 const __m256i rhs_mat_2367_70_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_70, 221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7)
5165
5166 const __m256i rhs_mat_0145_71_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_71, 221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15)
5167 const __m256i rhs_mat_2367_71_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_71, 221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15)
5168
5169 //Scales and Mins of corresponding sub blocks from different Q2_K structures are stored together
5170 //s00 m00 s01 m01 s10 m10 s11 m11 s20 m20 s21 m21 s30 m30 s31 m31 s40 m40 s41 m41 s50 m50 s51 m51 s60 m60 s61 m61 s70 m70 s71 m71
5171
5172 // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
                    const __m128i mins_and_scales_01 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + sb * 64));
                    const __m128i mins_and_scales_23 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 16 + sb * 64));
                    const __m128i mins_and_scales_45 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 32 + sb * 64));
                    const __m128i mins_and_scales_67 = _mm_loadu_si128((const __m128i *)(b_ptr[b].scales + 48 + sb * 64));
5177
                    // Extract the scales, stored in the lower 4 bits of each mins_and_scales byte
                    const __m128i scales_01 = _mm_and_si128(mins_and_scales_01, m4b_sse);
                    const __m128i scales_23 = _mm_and_si128(mins_and_scales_23, m4b_sse);
                    const __m128i scales_45 = _mm_and_si128(mins_and_scales_45, m4b_sse);
                    const __m128i scales_67 = _mm_and_si128(mins_and_scales_67, m4b_sse);

                    // Extract the mins, stored in the upper 4 bits of each mins_and_scales byte
                    const __m256i mins_01 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_01, 4), m4b_sse));
                    const __m256i mins_23 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_23, 4), m4b_sse));
                    const __m256i mins_45 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_45, 4), m4b_sse));
                    const __m256i mins_67 = _mm256_cvtepu8_epi16(_mm_and_si128(_mm_srli_epi16(mins_and_scales_67, 4), m4b_sse));
5189
                    const __m256i scales_0 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_01, scalesmask1_sse));
                    const __m256i scales_1 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_01, scalesmask2_sse));

                    const __m256i scales_2 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_23, scalesmask1_sse));
                    const __m256i scales_3 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_23, scalesmask2_sse));

                    const __m256i scales_4 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_45, scalesmask1_sse));
                    const __m256i scales_5 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_45, scalesmask2_sse));

                    const __m256i scales_6 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_67, scalesmask1_sse));
                    const __m256i scales_7 = _mm256_cvtepu8_epi16(_mm_shuffle_epi8(scales_67, scalesmask2_sse));
5201
5202 const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68);
5203 const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238);
5204
5205 const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68);
5206 const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238);
5207
5208 const __m256i scale_0145_2 = _mm256_shuffle_epi32(scales_2, 68);
5209 const __m256i scale_2367_2 = _mm256_shuffle_epi32(scales_2, 238);
5210
5211 const __m256i scale_0145_3 = _mm256_shuffle_epi32(scales_3, 68);
5212 const __m256i scale_2367_3 = _mm256_shuffle_epi32(scales_3, 238);
5213
5214 const __m256i scale_0145_4 = _mm256_shuffle_epi32(scales_4, 68);
5215 const __m256i scale_2367_4 = _mm256_shuffle_epi32(scales_4, 238);
5216
5217 const __m256i scale_0145_5 = _mm256_shuffle_epi32(scales_5, 68);
5218 const __m256i scale_2367_5 = _mm256_shuffle_epi32(scales_5, 238);
5219
5220 const __m256i scale_0145_6 = _mm256_shuffle_epi32(scales_6, 68);
5221 const __m256i scale_2367_6 = _mm256_shuffle_epi32(scales_6, 238);
5222
5223 const __m256i scale_0145_7 = _mm256_shuffle_epi32(scales_7, 68);
5224 const __m256i scale_2367_7 = _mm256_shuffle_epi32(scales_7, 238);
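                    // _mm256_shuffle_epi32 with imm 68 duplicates the low 64 bits of each 128 bit lane
                    // and imm 238 duplicates the high 64 bits, matching each scale vector to the
                    // 0145 / 2367 column groupings used by the dot products below.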
5225
5226
5227 for (int rp = 0; rp < 4; rp++) {
5228
                        // Load the quantized values of the four block_q8_K blocks, interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
                        // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
                        __m256i lhs_mat_0123_00 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 512 * sb)));
                        __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0);
                        __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17);
                        __m256i lhs_mat_0123_01 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 32 + 512 * sb)));
                        __m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0);
                        __m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17);
                        __m256i lhs_mat_0123_10 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 64 + 512 * sb)));
                        __m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0);
                        __m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17);
                        __m256i lhs_mat_0123_11 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 96 + 512 * sb)));
                        __m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0);
                        __m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17);
                        __m256i lhs_mat_0123_20 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 128 + 512 * sb)));
                        __m256i lhs_mat_01_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 0);
                        __m256i lhs_mat_23_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 17);
                        __m256i lhs_mat_0123_21 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 160 + 512 * sb)));
                        __m256i lhs_mat_01_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 0);
                        __m256i lhs_mat_23_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 17);
                        __m256i lhs_mat_0123_30 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 192 + 512 * sb)));
                        __m256i lhs_mat_01_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 0);
                        __m256i lhs_mat_23_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 17);
                        __m256i lhs_mat_0123_31 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 224 + 512 * sb)));
                        __m256i lhs_mat_01_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 0);
                        __m256i lhs_mat_23_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 17);

                        __m256i lhs_mat_0123_40 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 256 + 512 * sb)));
                        __m256i lhs_mat_01_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 0);
                        __m256i lhs_mat_23_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 17);
                        __m256i lhs_mat_0123_41 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 288 + 512 * sb)));
                        __m256i lhs_mat_01_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 0);
                        __m256i lhs_mat_23_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 17);
                        __m256i lhs_mat_0123_50 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 320 + 512 * sb)));
                        __m256i lhs_mat_01_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 0);
                        __m256i lhs_mat_23_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 17);
                        __m256i lhs_mat_0123_51 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 352 + 512 * sb)));
                        __m256i lhs_mat_01_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 0);
                        __m256i lhs_mat_23_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 17);
                        __m256i lhs_mat_0123_60 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 384 + 512 * sb)));
                        __m256i lhs_mat_01_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 0);
                        __m256i lhs_mat_23_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 17);
                        __m256i lhs_mat_0123_61 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 416 + 512 * sb)));
                        __m256i lhs_mat_01_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 0);
                        __m256i lhs_mat_23_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 17);
                        __m256i lhs_mat_0123_70 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 448 + 512 * sb)));
                        __m256i lhs_mat_01_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 0);
                        __m256i lhs_mat_23_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 17);
                        __m256i lhs_mat_0123_71 = _mm256_loadu_si256((const __m256i * )((a_ptrs[rp][b].qs + 480 + 512 * sb)));
                        __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0);
                        __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17);

                        // Bsums are loaded for the different Q8_K blocks
                        __m128i lhs_raw_bsums_01_0123 = _mm_loadu_si128((const __m128i *)((a_ptrs[rp][b].bsums + 32 * sb)));
                        __m128i lhs_raw_bsums_23_0123 = _mm_loadu_si128((const __m128i *)(a_ptrs[rp][b].bsums + 8 + 32 * sb));
                        __m128i lhs_raw_bsums_01_4567 = _mm_loadu_si128((const __m128i *)((a_ptrs[rp][b].bsums + 16 + 32 * sb)));
                        __m128i lhs_raw_bsums_23_4567 = _mm_loadu_si128((const __m128i *)(a_ptrs[rp][b].bsums + 24 + 32 * sb));
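                        // bsums hold the per sub block sums of the int8 activations of each row; they
                        // are combined with the Q2_K mins further down to form the dmin correction term.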
5286
5287 // Shuffle pattern one - left side input
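                        // A rough reading of the shuffle immediates used for the left side
                        // (derived from _mm256_shuffle_epi32 semantics, not authoritative):
                        //   160 = 0b10'10'00'00 -> dwords {0,0,2,2} of each 128-bit lane, i.e.
                        //         bytes 0-3 of each of the two rows in the lane, repeated twice;
                        //   245 = 0b11'11'01'01 -> dwords {1,1,3,3}, i.e. bytes 4-7 (pattern two).
                        // Together the two patterns expose all bytes of a row chunk to the
                        // maddubs-based dot products further below.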
5288 const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
5289 const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3)
5290
5291 const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
5292 const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11)
5293
5294 const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
5295 const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3)
5296
5297 const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
5298 const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11)
5299
5300 const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3)
5301 const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3)
5302
5303 const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11)
5304 const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11)
5305
5306 const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3)
5307 const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3)
5308
5309 const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11)
5310 const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11)
5311
5312 const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3)
5313 const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3)
5314
5315 const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11)
5316 const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11)
5317
5318 const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3)
5319 const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3)
5320
5321 const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11)
5322 const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11)
5323
5324 const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3)
5325 const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3)
5326
5327 const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11)
5328 const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11)
5329
5330 const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3)
5331 const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3)
5332
5333 const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11)
5334 const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11)
5335
5336                         // Shuffle pattern two - left side input
5337 const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
5338 const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7)
5339
5340 const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
5341 const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15)
5342
5343 const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
5344 const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7)
5345
5346 const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
5347 const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15)
5348
5349 const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7)
5350 const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7)
5351
5352 const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15)
5353 const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15)
5354
5355 const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7)
5356 const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7)
5357
5358 const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15)
5359 const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15)
5360
5361 const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7)
5362 const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7)
5363
5364 const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15)
5365 const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15)
5366
5367 const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7)
5368 const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7)
5369
5370 const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15)
5371 const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15)
5372
5373 const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7)
5374 const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7)
5375
5376 const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15)
5377 const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15)
5378
5379 const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7)
5380 const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7)
5381
5382 const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15)
5383 const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15)
5384
5385                         // The values arranged in the shuffle patterns are combined with byte-wise dot products: within each 32-bit lane the corresponding bytes are multiplied and the adjacent products summed into 16-bit integers (they are widened to 32 bits when the scales are applied below)
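                        // Scalar sketch of what the maddubs/add pairs below accumulate
                        // (for reference only; lhs/rhs/row/col/k are descriptive names, not
                        // variables of this function):
                        //
                        //   for (int row = 0; row < 4; row++)
                        //       for (int col = 0; col < 8; col++)
                        //           for (int k = 0; k < 16; k++)          // quants per sub block
                        //               iacc[row][col] += lhs[row][k] * rhs[col][k];
                        //
                        // _mm256_maddubs_epi16 multiplies the unsigned rhs bytes (0..3 after the
                        // 2-bit unpack) with the signed lhs bytes and sums adjacent pairs into
                        // int16; with these value ranges the 16-bit sums cannot saturate before
                        // the widening madd with the scales.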
5386 __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp1, b: lhs_mat_01_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp1, b: lhs_mat_01_01_sp1));
5387 __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp1, b: lhs_mat_01_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp1, b: lhs_mat_01_01_sp1));
5388
5389 __m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp1, b: lhs_mat_23_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp1, b: lhs_mat_23_01_sp1));
5390 __m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp1, b: lhs_mat_23_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp1, b: lhs_mat_23_01_sp1));
5391
5392 __m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp1, b: lhs_mat_01_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp1, b: lhs_mat_01_11_sp1));
5393 __m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp1, b: lhs_mat_01_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp1, b: lhs_mat_01_11_sp1));
5394
5395 __m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp1, b: lhs_mat_23_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp1, b: lhs_mat_23_11_sp1));
5396 __m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp1, b: lhs_mat_23_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp1, b: lhs_mat_23_11_sp1));
5397
5398 __m256i iacc_mat_00_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp1, b: lhs_mat_01_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp1, b: lhs_mat_01_21_sp1));
5399 __m256i iacc_mat_01_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp1, b: lhs_mat_01_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp1, b: lhs_mat_01_21_sp1));
5400
5401 __m256i iacc_mat_10_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp1, b: lhs_mat_23_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp1, b: lhs_mat_23_21_sp1));
5402 __m256i iacc_mat_11_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp1, b: lhs_mat_23_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp1, b: lhs_mat_23_21_sp1));
5403
5404 __m256i iacc_mat_00_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp1, b: lhs_mat_01_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp1, b: lhs_mat_01_31_sp1));
5405 __m256i iacc_mat_01_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp1, b: lhs_mat_01_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp1, b: lhs_mat_01_31_sp1));
5406
5407 __m256i iacc_mat_10_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp1, b: lhs_mat_23_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp1, b: lhs_mat_23_31_sp1));
5408 __m256i iacc_mat_11_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp1, b: lhs_mat_23_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp1, b: lhs_mat_23_31_sp1));
5409
5410 __m256i iacc_mat_00_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp1, b: lhs_mat_01_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp1, b: lhs_mat_01_41_sp1));
5411 __m256i iacc_mat_01_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp1, b: lhs_mat_01_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp1, b: lhs_mat_01_41_sp1));
5412
5413 __m256i iacc_mat_10_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp1, b: lhs_mat_23_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp1, b: lhs_mat_23_41_sp1));
5414 __m256i iacc_mat_11_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp1, b: lhs_mat_23_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp1, b: lhs_mat_23_41_sp1));
5415
5416 __m256i iacc_mat_00_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp1, b: lhs_mat_01_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp1, b: lhs_mat_01_51_sp1));
5417 __m256i iacc_mat_01_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp1, b: lhs_mat_01_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp1, b: lhs_mat_01_51_sp1));
5418
5419 __m256i iacc_mat_10_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp1, b: lhs_mat_23_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp1, b: lhs_mat_23_51_sp1));
5420 __m256i iacc_mat_11_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp1, b: lhs_mat_23_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp1, b: lhs_mat_23_51_sp1));
5421
5422 __m256i iacc_mat_00_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp1, b: lhs_mat_01_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp1, b: lhs_mat_01_61_sp1));
5423 __m256i iacc_mat_01_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp1, b: lhs_mat_01_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp1, b: lhs_mat_01_61_sp1));
5424
5425 __m256i iacc_mat_10_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp1, b: lhs_mat_23_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp1, b: lhs_mat_23_61_sp1));
5426 __m256i iacc_mat_11_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp1, b: lhs_mat_23_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp1, b: lhs_mat_23_61_sp1));
5427
5428 __m256i iacc_mat_00_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp1, b: lhs_mat_01_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp1, b: lhs_mat_01_71_sp1));
5429 __m256i iacc_mat_01_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp1, b: lhs_mat_01_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp1, b: lhs_mat_01_71_sp1));
5430
5431 __m256i iacc_mat_10_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp1, b: lhs_mat_23_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp1, b: lhs_mat_23_71_sp1));
5432 __m256i iacc_mat_11_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp1, b: lhs_mat_23_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp1, b: lhs_mat_23_71_sp1));
5433
5434
5435 __m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp2, b: lhs_mat_01_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp2, b: lhs_mat_01_01_sp2));
5436 __m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp2, b: lhs_mat_01_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp2, b: lhs_mat_01_01_sp2));
5437
5438 __m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp2, b: lhs_mat_23_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp2, b: lhs_mat_23_01_sp2));
5439 __m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp2, b: lhs_mat_23_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp2, b: lhs_mat_23_01_sp2));
5440
5441 __m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp2, b: lhs_mat_01_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp2, b: lhs_mat_01_11_sp2));
5442 __m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp2, b: lhs_mat_01_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp2, b: lhs_mat_01_11_sp2));
5443
5444 __m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp2, b: lhs_mat_23_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp2, b: lhs_mat_23_11_sp2));
5445 __m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp2, b: lhs_mat_23_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp2, b: lhs_mat_23_11_sp2));
5446
5447 __m256i iacc_mat_00_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp2, b: lhs_mat_01_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp2, b: lhs_mat_01_21_sp2));
5448 __m256i iacc_mat_01_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp2, b: lhs_mat_01_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp2, b: lhs_mat_01_21_sp2));
5449
5450 __m256i iacc_mat_10_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp2, b: lhs_mat_23_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp2, b: lhs_mat_23_21_sp2));
5451 __m256i iacc_mat_11_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp2, b: lhs_mat_23_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp2, b: lhs_mat_23_21_sp2));
5452
5453 __m256i iacc_mat_00_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp2, b: lhs_mat_01_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp2, b: lhs_mat_01_31_sp2));
5454 __m256i iacc_mat_01_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp2, b: lhs_mat_01_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp2, b: lhs_mat_01_31_sp2));
5455
5456 __m256i iacc_mat_10_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp2, b: lhs_mat_23_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp2, b: lhs_mat_23_31_sp2));
5457 __m256i iacc_mat_11_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp2, b: lhs_mat_23_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp2, b: lhs_mat_23_31_sp2));
5458
5459 __m256i iacc_mat_00_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp2, b: lhs_mat_01_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp2, b: lhs_mat_01_41_sp2));
5460 __m256i iacc_mat_01_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp2, b: lhs_mat_01_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp2, b: lhs_mat_01_41_sp2));
5461
5462 __m256i iacc_mat_10_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp2, b: lhs_mat_23_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp2, b: lhs_mat_23_41_sp2));
5463 __m256i iacc_mat_11_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp2, b: lhs_mat_23_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp2, b: lhs_mat_23_41_sp2));
5464
5465 __m256i iacc_mat_00_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp2, b: lhs_mat_01_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp2, b: lhs_mat_01_51_sp2));
5466 __m256i iacc_mat_01_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp2, b: lhs_mat_01_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp2, b: lhs_mat_01_51_sp2));
5467
5468 __m256i iacc_mat_10_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp2, b: lhs_mat_23_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp2, b: lhs_mat_23_51_sp2));
5469 __m256i iacc_mat_11_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp2, b: lhs_mat_23_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp2, b: lhs_mat_23_51_sp2));
5470
5471 __m256i iacc_mat_00_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp2, b: lhs_mat_01_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp2, b: lhs_mat_01_61_sp2));
5472 __m256i iacc_mat_01_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp2, b: lhs_mat_01_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp2, b: lhs_mat_01_61_sp2));
5473
5474 __m256i iacc_mat_10_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp2, b: lhs_mat_23_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp2, b: lhs_mat_23_61_sp2));
5475 __m256i iacc_mat_11_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp2, b: lhs_mat_23_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp2, b: lhs_mat_23_61_sp2));
5476
5477 __m256i iacc_mat_00_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp2, b: lhs_mat_01_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp2, b: lhs_mat_01_71_sp2));
5478 __m256i iacc_mat_01_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp2, b: lhs_mat_01_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp2, b: lhs_mat_01_71_sp2));
5479
5480 __m256i iacc_mat_10_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp2, b: lhs_mat_23_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp2, b: lhs_mat_23_71_sp2));
5481 __m256i iacc_mat_11_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp2, b: lhs_mat_23_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp2, b: lhs_mat_23_71_sp2));
5482
5483 // Combine results from both shuffle patterns for each output block
5484 __m256i iacc_mat_00_0 = _mm256_add_epi16(a: iacc_mat_00_0_sp1, b: iacc_mat_00_0_sp2);
5485 __m256i iacc_mat_01_0 = _mm256_add_epi16(a: iacc_mat_01_0_sp1, b: iacc_mat_01_0_sp2);
5486 __m256i iacc_mat_10_0 = _mm256_add_epi16(a: iacc_mat_10_0_sp1, b: iacc_mat_10_0_sp2);
5487 __m256i iacc_mat_11_0 = _mm256_add_epi16(a: iacc_mat_11_0_sp1, b: iacc_mat_11_0_sp2);
5488
5489 __m256i iacc_mat_00_1 = _mm256_add_epi16(a: iacc_mat_00_1_sp1, b: iacc_mat_00_1_sp2);
5490 __m256i iacc_mat_01_1 = _mm256_add_epi16(a: iacc_mat_01_1_sp1, b: iacc_mat_01_1_sp2);
5491 __m256i iacc_mat_10_1 = _mm256_add_epi16(a: iacc_mat_10_1_sp1, b: iacc_mat_10_1_sp2);
5492 __m256i iacc_mat_11_1 = _mm256_add_epi16(a: iacc_mat_11_1_sp1, b: iacc_mat_11_1_sp2);
5493
5494 __m256i iacc_mat_00_2 = _mm256_add_epi16(a: iacc_mat_00_2_sp1, b: iacc_mat_00_2_sp2);
5495 __m256i iacc_mat_01_2 = _mm256_add_epi16(a: iacc_mat_01_2_sp1, b: iacc_mat_01_2_sp2);
5496 __m256i iacc_mat_10_2 = _mm256_add_epi16(a: iacc_mat_10_2_sp1, b: iacc_mat_10_2_sp2);
5497 __m256i iacc_mat_11_2 = _mm256_add_epi16(a: iacc_mat_11_2_sp1, b: iacc_mat_11_2_sp2);
5498
5499 __m256i iacc_mat_00_3 = _mm256_add_epi16(a: iacc_mat_00_3_sp1, b: iacc_mat_00_3_sp2);
5500 __m256i iacc_mat_01_3 = _mm256_add_epi16(a: iacc_mat_01_3_sp1, b: iacc_mat_01_3_sp2);
5501 __m256i iacc_mat_10_3 = _mm256_add_epi16(a: iacc_mat_10_3_sp1, b: iacc_mat_10_3_sp2);
5502 __m256i iacc_mat_11_3 = _mm256_add_epi16(a: iacc_mat_11_3_sp1, b: iacc_mat_11_3_sp2);
5503
5504 __m256i iacc_mat_00_4 = _mm256_add_epi16(a: iacc_mat_00_4_sp1, b: iacc_mat_00_4_sp2);
5505 __m256i iacc_mat_01_4 = _mm256_add_epi16(a: iacc_mat_01_4_sp1, b: iacc_mat_01_4_sp2);
5506 __m256i iacc_mat_10_4 = _mm256_add_epi16(a: iacc_mat_10_4_sp1, b: iacc_mat_10_4_sp2);
5507 __m256i iacc_mat_11_4 = _mm256_add_epi16(a: iacc_mat_11_4_sp1, b: iacc_mat_11_4_sp2);
5508
5509 __m256i iacc_mat_00_5 = _mm256_add_epi16(a: iacc_mat_00_5_sp1, b: iacc_mat_00_5_sp2);
5510 __m256i iacc_mat_01_5 = _mm256_add_epi16(a: iacc_mat_01_5_sp1, b: iacc_mat_01_5_sp2);
5511 __m256i iacc_mat_10_5 = _mm256_add_epi16(a: iacc_mat_10_5_sp1, b: iacc_mat_10_5_sp2);
5512 __m256i iacc_mat_11_5 = _mm256_add_epi16(a: iacc_mat_11_5_sp1, b: iacc_mat_11_5_sp2);
5513
5514 __m256i iacc_mat_00_6 = _mm256_add_epi16(a: iacc_mat_00_6_sp1, b: iacc_mat_00_6_sp2);
5515 __m256i iacc_mat_01_6 = _mm256_add_epi16(a: iacc_mat_01_6_sp1, b: iacc_mat_01_6_sp2);
5516 __m256i iacc_mat_10_6 = _mm256_add_epi16(a: iacc_mat_10_6_sp1, b: iacc_mat_10_6_sp2);
5517 __m256i iacc_mat_11_6 = _mm256_add_epi16(a: iacc_mat_11_6_sp1, b: iacc_mat_11_6_sp2);
5518
5519 __m256i iacc_mat_00_7 = _mm256_add_epi16(a: iacc_mat_00_7_sp1, b: iacc_mat_00_7_sp2);
5520 __m256i iacc_mat_01_7 = _mm256_add_epi16(a: iacc_mat_01_7_sp1, b: iacc_mat_01_7_sp2);
5521 __m256i iacc_mat_10_7 = _mm256_add_epi16(a: iacc_mat_10_7_sp1, b: iacc_mat_10_7_sp2);
5522 __m256i iacc_mat_11_7 = _mm256_add_epi16(a: iacc_mat_11_7_sp1, b: iacc_mat_11_7_sp2);
5523
5524                         // With both shuffle patterns combined, each 32-bit lane holds (as a pair of 16-bit sums) the dot product over a full 16-quant sub block; multiply with the sub-block scales, folding each pair into a 32-bit integer
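                        // Hedged sketch of what one widened lane holds after the madd below,
                        // with j the sub block index and row/col the output position:
                        //   scale[j][col] * sum_{k=0..15} lhs[row][16*j + k] * rhs[col][16*j + k]
                        // The float super-block factors (Q2_K d and the Q8_K row delta) are
                        // applied only later, after the eight sub-block results are summed.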
5525 iacc_mat_00_0 = _mm256_madd_epi16(a: iacc_mat_00_0, b: scale_0145_0);
5526 iacc_mat_01_0 = _mm256_madd_epi16(a: iacc_mat_01_0, b: scale_2367_0);
5527 iacc_mat_10_0 = _mm256_madd_epi16(a: iacc_mat_10_0, b: scale_0145_0);
5528 iacc_mat_11_0 = _mm256_madd_epi16(a: iacc_mat_11_0, b: scale_2367_0);
5529
5530 iacc_mat_00_1 = _mm256_madd_epi16(a: iacc_mat_00_1, b: scale_0145_1);
5531 iacc_mat_01_1 = _mm256_madd_epi16(a: iacc_mat_01_1, b: scale_2367_1);
5532 iacc_mat_10_1 = _mm256_madd_epi16(a: iacc_mat_10_1, b: scale_0145_1);
5533 iacc_mat_11_1 = _mm256_madd_epi16(a: iacc_mat_11_1, b: scale_2367_1);
5534
5535 iacc_mat_00_2 = _mm256_madd_epi16(a: iacc_mat_00_2, b: scale_0145_2);
5536 iacc_mat_01_2 = _mm256_madd_epi16(a: iacc_mat_01_2, b: scale_2367_2);
5537 iacc_mat_10_2 = _mm256_madd_epi16(a: iacc_mat_10_2, b: scale_0145_2);
5538 iacc_mat_11_2 = _mm256_madd_epi16(a: iacc_mat_11_2, b: scale_2367_2);
5539
5540 iacc_mat_00_3 = _mm256_madd_epi16(a: iacc_mat_00_3, b: scale_0145_3);
5541 iacc_mat_01_3 = _mm256_madd_epi16(a: iacc_mat_01_3, b: scale_2367_3);
5542 iacc_mat_10_3 = _mm256_madd_epi16(a: iacc_mat_10_3, b: scale_0145_3);
5543 iacc_mat_11_3 = _mm256_madd_epi16(a: iacc_mat_11_3, b: scale_2367_3);
5544
5545 iacc_mat_00_4 = _mm256_madd_epi16(a: iacc_mat_00_4, b: scale_0145_4);
5546 iacc_mat_01_4 = _mm256_madd_epi16(a: iacc_mat_01_4, b: scale_2367_4);
5547 iacc_mat_10_4 = _mm256_madd_epi16(a: iacc_mat_10_4, b: scale_0145_4);
5548 iacc_mat_11_4 = _mm256_madd_epi16(a: iacc_mat_11_4, b: scale_2367_4);
5549
5550 iacc_mat_00_5 = _mm256_madd_epi16(a: iacc_mat_00_5, b: scale_0145_5);
5551 iacc_mat_01_5 = _mm256_madd_epi16(a: iacc_mat_01_5, b: scale_2367_5);
5552 iacc_mat_10_5 = _mm256_madd_epi16(a: iacc_mat_10_5, b: scale_0145_5);
5553 iacc_mat_11_5 = _mm256_madd_epi16(a: iacc_mat_11_5, b: scale_2367_5);
5554
5555 iacc_mat_00_6 = _mm256_madd_epi16(a: iacc_mat_00_6, b: scale_0145_6);
5556 iacc_mat_01_6 = _mm256_madd_epi16(a: iacc_mat_01_6, b: scale_2367_6);
5557 iacc_mat_10_6 = _mm256_madd_epi16(a: iacc_mat_10_6, b: scale_0145_6);
5558 iacc_mat_11_6 = _mm256_madd_epi16(a: iacc_mat_11_6, b: scale_2367_6);
5559
5560 iacc_mat_00_7 = _mm256_madd_epi16(a: iacc_mat_00_7, b: scale_0145_7);
5561 iacc_mat_01_7 = _mm256_madd_epi16(a: iacc_mat_01_7, b: scale_2367_7);
5562 iacc_mat_10_7 = _mm256_madd_epi16(a: iacc_mat_10_7, b: scale_0145_7);
5563 iacc_mat_11_7 = _mm256_madd_epi16(a: iacc_mat_11_7, b: scale_2367_7);
5564
5565 __m256i iacc_mat_00 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_00_0, b: iacc_mat_00_1), b: _mm256_add_epi32(a: iacc_mat_00_2, b: iacc_mat_00_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_00_4, b: iacc_mat_00_5), b: _mm256_add_epi32(a: iacc_mat_00_6, b: iacc_mat_00_7)));
5566 __m256i iacc_mat_01 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_01_0, b: iacc_mat_01_1), b: _mm256_add_epi32(a: iacc_mat_01_2, b: iacc_mat_01_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_01_4, b: iacc_mat_01_5), b: _mm256_add_epi32(a: iacc_mat_01_6, b: iacc_mat_01_7)));
5567 __m256i iacc_mat_10 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_10_0, b: iacc_mat_10_1), b: _mm256_add_epi32(a: iacc_mat_10_2, b: iacc_mat_10_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_10_4, b: iacc_mat_10_5), b: _mm256_add_epi32(a: iacc_mat_10_6, b: iacc_mat_10_7)));
5568 __m256i iacc_mat_11 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_11_0, b: iacc_mat_11_1), b: _mm256_add_epi32(a: iacc_mat_11_2, b: iacc_mat_11_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_11_4, b: iacc_mat_11_5), b: _mm256_add_epi32(a: iacc_mat_11_6, b: iacc_mat_11_7)));
5569
5570 // Straighten out to make 4 row vectors
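                        // Reading of the immediates (a sketch, not authoritative):
                        // _mm256_shuffle_epi32(x, 78) with 78 = 0b01'00'11'10 swaps the two dword
                        // pairs of each 128-bit lane, and blend mask 204 = 0b11001100 then keeps
                        // dwords {0,1,4,5} from the first operand and takes {2,3,6,7} from the
                        // swapped second, so each iacc_row_* holds the eight column sums of one
                        // output row in order.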
5571 __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
5572 __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
5573 __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
5574 __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
5575
5576                         // Load the scale (d) values of all four interleaved Q8_K rows and repeat them across both 128-bit lanes
5577 const __m128 row_scale_f32_sse = _mm_load_ps(p: a_ptrs[rp][b].d);
5578 const __m256 row_scale_f32 = _mm256_set_m128(hi: row_scale_f32_sse, lo: row_scale_f32_sse);
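                        // a_ptrs[rp][b].d holds the four per-row deltas of the interleaved Q8_K
                        // rows; duplicating the 128-bit vector into both halves lets the
                        // _mm256_shuffle_ps calls below (immediates 0, 85, 170, 255) broadcast
                        // one row's delta across all eight float lanes.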
5579
5580                         // Multiply with the appropriate scales and accumulate (for both d and dmin) below
5581 acc_rows[rp * 4] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_0), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), C: acc_rows[rp * 4]);
5582 acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_1), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), C: acc_rows[rp * 4 + 1]);
5583 acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_2), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), C: acc_rows[rp * 4 + 2]);
5584 acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_3), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), C: acc_rows[rp * 4 + 3]);
5585
5586 __m256i lhs_bsums_01_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_0123), lhs_raw_bsums_01_0123, 1);
5587 __m256i lhs_bsums_23_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_0123), lhs_raw_bsums_23_0123, 1);
5588 __m256i lhs_bsums_01_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_4567), lhs_raw_bsums_01_4567, 1);
5589 __m256i lhs_bsums_23_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_4567), lhs_raw_bsums_23_4567, 1);
5590
5591                         // Take two sub-block bsums of a Q8_K row at a time and multiply them with the corresponding min values from each Q2_K
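                        // Why bsums x mins is needed (sketch, following the usual Q2_K
                        // dequantization x = d * scale_j * q - dmin * min_j):
                        //   sum_l q8[l] * x[l] = d    * sum_j scale_j * sum_{l in j} q8[l] * q[l]
                        //                      - dmin * sum_j min_j   * bsum_j
                        // acc_rows accumulates the first term and acc_min_rows the second (both
                        // additionally scaled by the Q8_K row delta); the two are subtracted
                        // only once, at the final store.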
5592 __m256i iacc_row_min_0_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 0), b: mins_01);
5593 __m256i iacc_row_min_1_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 170), b: mins_01);
5594 __m256i iacc_row_min_2_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 0), b: mins_01);
5595 __m256i iacc_row_min_3_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 170), b: mins_01);
5596
5597 __m256i iacc_row_min_0_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 85), b: mins_23);
5598 __m256i iacc_row_min_1_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 255), b: mins_23);
5599 __m256i iacc_row_min_2_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 85), b: mins_23);
5600 __m256i iacc_row_min_3_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 255), b: mins_23);
5601
5602 __m256i iacc_row_min_0_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 0), b: mins_45);
5603 __m256i iacc_row_min_1_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 170), b: mins_45);
5604 __m256i iacc_row_min_2_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 0), b: mins_45);
5605 __m256i iacc_row_min_3_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 170), b: mins_45);
5606
5607 __m256i iacc_row_min_0_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 85), b: mins_67);
5608 __m256i iacc_row_min_1_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 255), b: mins_67);
5609 __m256i iacc_row_min_2_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 85), b: mins_67);
5610 __m256i iacc_row_min_3_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 255), b: mins_67);
5611
5612 __m256i iacc_row_min_0 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_0_01, b: iacc_row_min_0_23), b: _mm256_add_epi32(a: iacc_row_min_0_45,b: iacc_row_min_0_67));
5613 __m256i iacc_row_min_1 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_1_01, b: iacc_row_min_1_23), b: _mm256_add_epi32(a: iacc_row_min_1_45,b: iacc_row_min_1_67));
5614 __m256i iacc_row_min_2 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_2_01, b: iacc_row_min_2_23), b: _mm256_add_epi32(a: iacc_row_min_2_45,b: iacc_row_min_2_67));
5615 __m256i iacc_row_min_3 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_3_01, b: iacc_row_min_3_23), b: _mm256_add_epi32(a: iacc_row_min_3_45,b: iacc_row_min_3_67));
5616
5617 acc_min_rows[rp * 4] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_0), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), C: acc_min_rows[rp * 4]);
5618 acc_min_rows[rp * 4 + 1] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_1), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), C: acc_min_rows[rp * 4 + 1]);
5619 acc_min_rows[rp * 4 + 2] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_2), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), C: acc_min_rows[rp * 4 + 2]);
5620 acc_min_rows[rp * 4 + 3] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_3), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), C: acc_min_rows[rp * 4 + 3]);
5621
5622 }
5623 }
5624 }
5625 // Store the accumulated values
5626 for (int i = 0; i < 16; i++) {
5627 _mm256_storeu_ps(p: (float * )(s + ((y * 4 + i) * bs + x * 8)), a: _mm256_sub_ps(a: acc_rows[i], b: acc_min_rows[i]));
5628
5629 }
5630 }
5631 }
5632
5633 for (; y < nr / 4; y ++) {
5634
5635 const block_q8_Kx4 * a_ptr = a_ptr_start + (y * nb);
5636
5637         // Take a group of eight interleaved Q2_K blocks (block_q2_Kx8) at each pass of the loop and perform the dot product operation
5638 for (int64_t x = xstart; x < nc / 8; x++) {
5639
5640 const block_q2_Kx8 * b_ptr = b_ptr_start + (x * b_nb);
5641
5642 // Master FP accumulators
5643 __m256 acc_rows[4];
5644 for (int i = 0; i < 4; i++) {
5645 acc_rows[i] = _mm256_setzero_ps();
5646 }
5647
5648 __m256 acc_min_rows[4];
5649 for (int i = 0; i < 4; i++) {
5650 acc_min_rows[i] = _mm256_setzero_ps();
5651 }
5652
5653 for (int64_t b = 0; b < nb; b++) {
5654                 // Delta values - Load the eight scale values of block_q2_Kx8
5655 const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
5656
5657                 // dmin values - Load the eight dmin values of block_q2_Kx8
5658 const __m256 col_dmin_f32 = GGML_F32Cx8_LOAD(b_ptr[b].dmin);
5659
5660 // Loop to iterate over the sixteen sub blocks of a super block - eight sub blocks are processed per iteration
5661 for (int sb = 0; sb < QK_K / 128; sb++) {
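                    // Iteration-count sketch (assuming QK_K == 256 as elsewhere in ggml): a Q2_K
                    // super block has 16 sub blocks of 16 quants, so QK_K / 128 == 2 passes are
                    // made here, each consuming eight sub blocks - 256 of the 512 interleaved
                    // rhs bytes (b_ptr[b].qs) and 512 of the 1024 interleaved lhs bytes
                    // (a_ptr[b].qs).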
5662
5663                     // Load the quantized values of the eight Q2_K blocks - eight sub blocks' worth, interleaved with each other in chunks of eight bytes - B0,B1 ... B6,B7
5664 const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + sb * 256));
5665 const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 32 + sb * 256));
5666 const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 64 + sb * 256));
5667 const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 96 + sb * 256));
5668 const __m256i rhs_raw_mat_0123_2 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 128 + sb * 256));
5669 const __m256i rhs_raw_mat_4567_2 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 160 + sb * 256));
5670 const __m256i rhs_raw_mat_0123_3 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 192 + sb * 256));
5671 const __m256i rhs_raw_mat_4567_3 = _mm256_loadu_si256(p: (const __m256i *)(b_ptr[b].qs + 224 + sb * 256));
5672
5673                     // Save the values in the following vectors in the formats B0B1B4B5 and B2B3B6B7 for further processing and storing of the values
5674                     // (in the B<j><c>(byte range) annotations below, j is the sub block within the super block and c is the interleaved column the bytes belong to)
5675 const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
5676 const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
5677
5678 const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
5679 const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
5680
5681 const __m256i rhs_raw_mat_0145_2 = _mm256_blend_epi32(rhs_raw_mat_0123_2, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_2, requiredOrder), 240);
5682 const __m256i rhs_raw_mat_2367_2 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_2, requiredOrder), rhs_raw_mat_4567_2, 240);
5683
5684 const __m256i rhs_raw_mat_0145_3 = _mm256_blend_epi32(rhs_raw_mat_0123_3, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_3, requiredOrder), 240);
5685 const __m256i rhs_raw_mat_2367_3 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_3, requiredOrder), rhs_raw_mat_4567_3, 240);
5686
5687 // 2-bit -> 8-bit
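                        // Each raw rhs byte packs four 2-bit quants that belong to four different
                        // sub blocks, so shifting right by 0/2/4/6 bits and masking with m3b
                        // (presumably a 0x03 byte mask defined earlier) recovers sub blocks
                        // 0-1, 2-3, 4-5 and 6-7 of this half of the super block from the same
                        // four raw loads.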
5688 // First sub block of the eight sub blocks processed in the iteration
5689 const __m256i rhs_mat_0145_00 = _mm256_and_si256(a: rhs_raw_mat_0145_0, b: m3b); //B00(0-7) B01(0-7) B04(0-7) B05(0-7)
5690 const __m256i rhs_mat_2367_00 = _mm256_and_si256(a: rhs_raw_mat_2367_0, b: m3b); //B02(0-7) B03(0-7) B06(0-7) B07(0-7)
5691
5692 const __m256i rhs_mat_0145_01 = _mm256_and_si256(a: rhs_raw_mat_0145_1, b: m3b); //B00(8-15) B01(8-15) B04(8-15) B05(8-15)
5693 const __m256i rhs_mat_2367_01 = _mm256_and_si256(a: rhs_raw_mat_2367_1, b: m3b); //B02(8-15) B03(8-15) B06(8-15) B07(8-15)
5694
5695 // Second sub block of the eight sub blocks processed in the iteration
5696 const __m256i rhs_mat_0145_10 = _mm256_and_si256(a: rhs_raw_mat_0145_2, b: m3b); //B10(0-7) B11(0-7) B14(0-7) B15(0-7)
5697 const __m256i rhs_mat_2367_10 = _mm256_and_si256(a: rhs_raw_mat_2367_2, b: m3b); //B12(0-7) B13(0-7) B16(0-7) B17(0-7)
5698
5699 const __m256i rhs_mat_0145_11 = _mm256_and_si256(a: rhs_raw_mat_0145_3, b: m3b); //B10(8-15) B11(8-15) B14(8-15) B15(8-15)
5700 const __m256i rhs_mat_2367_11 = _mm256_and_si256(a: rhs_raw_mat_2367_3, b: m3b); //B12(8-15) B13(8-15) B16(8-15) B17(8-15)
5701
5702 // Third sub block of the eight sub blocks processed in the iteration
5703 const __m256i rhs_mat_0145_20 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_0, count: 2), b: m3b); //B20(0-7) B21(0-7) B24(0-7) B25(0-7)
5704 const __m256i rhs_mat_2367_20 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_0, count: 2), b: m3b); //B22(0-7) B23(0-7) B26(0-7) B27(0-7)
5705
5706 const __m256i rhs_mat_0145_21 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_1, count: 2), b: m3b); //B20(8-15) B21(8-15) B24(8-15) B25(8-15)
5707 const __m256i rhs_mat_2367_21 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_1, count: 2), b: m3b); //B22(8-15) B23(8-15) B26(8-15) B27(8-15)
5708
5709 // Fourth sub block of the eight sub blocks processed in the iteration
5710 const __m256i rhs_mat_0145_30 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_2, count: 2), b: m3b); //B30(0-7) B31(0-7) B34(0-7) B35(0-7)
5711 const __m256i rhs_mat_2367_30 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_2, count: 2), b: m3b); //B32(0-7) B33(0-7) B36(0-7) B37(0-7)
5712
5713 const __m256i rhs_mat_0145_31 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_3, count: 2), b: m3b); //B30(8-15) B31(8-15) B34(8-15) B35(8-15)
5714 const __m256i rhs_mat_2367_31 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_3, count: 2), b: m3b); //B32(8-15) B33(8-15) B36(8-15) B37(8-15)
5715
5716 // Fifth sub block of the eight sub blocks processed in the iteration
5717 const __m256i rhs_mat_0145_40 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_0, count: 4), b: m3b); //B40(0-7) B41(0-7) B44(0-7) B45(0-7)
5718 const __m256i rhs_mat_2367_40 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_0, count: 4), b: m3b); //B42(0-7) B43(0-7) B46(0-7) B47(0-7)
5719
5720 const __m256i rhs_mat_0145_41 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_1, count: 4), b: m3b); //B40(8-15) B41(8-15) B44(8-15) B45(8-15)
5721 const __m256i rhs_mat_2367_41 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_1, count: 4), b: m3b); //B42(8-15) B43(8-15) B46(8-15) B47(8-15)
5722
5723 // Sixth sub block of the eight sub blocks processed in the iteration
5724 const __m256i rhs_mat_0145_50 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_2, count: 4), b: m3b); //B50(0-7) B51(0-7) B54(0-7) B55(0-7)
5725 const __m256i rhs_mat_2367_50 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_2, count: 4), b: m3b); //B52(0-7) B53(0-7) B56(0-7) B57(0-7)
5726
5727 const __m256i rhs_mat_0145_51 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_3, count: 4), b: m3b); //B50(8-15) B51(8-15) B54(8-15) B55(8-15)
5728 const __m256i rhs_mat_2367_51 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_3, count: 4), b: m3b); //B52(8-15) B53(8-15) B56(8-15) B57(8-15)
5729
5730 // Seventh sub block of the eight sub blocks processed in the iteration
5731 const __m256i rhs_mat_0145_60 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_0, count: 6), b: m3b); //B60(0-7) B61(0-7) B64(0-7) B65(0-7)
5732 const __m256i rhs_mat_2367_60 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_0, count: 6), b: m3b); //B62(0-7) B63(0-7) B66(0-7) B67(0-7)
5733
5734 const __m256i rhs_mat_0145_61 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_1, count: 6), b: m3b); //B60(8-15) B61(8-15) B64(8-15) B65(8-15)
5735 const __m256i rhs_mat_2367_61 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_1, count: 6), b: m3b); //B62(8-15) B63(8-15) B66(8-15) B67(8-15)
5736
5737 // Eighth sub block of the eight sub blocks processed in the iteration
5738 const __m256i rhs_mat_0145_70 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_2, count: 6), b: m3b); //B70(0-7) B71(0-7) B74(0-7) B75(0-7)
5739 const __m256i rhs_mat_2367_70 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_2, count: 6), b: m3b); //B72(0-7) B73(0-7) B76(0-7) B77(0-7)
5740
5741 const __m256i rhs_mat_0145_71 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_0145_3, count: 6), b: m3b); //B70(8-15) B71(8-15) B74(8-15) B75(8-15)
5742 const __m256i rhs_mat_2367_71 = _mm256_and_si256(a: _mm256_srli_epi16(a: rhs_raw_mat_2367_3, count: 6), b: m3b); //B72(8-15) B73(8-15) B76(8-15) B77(8-15)
5743
5744 // Shuffle pattern one - right side input
5745 const __m256i rhs_mat_0145_00_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_00, 136); //B00(0-3) B01(0-3) B00(0-3) B01(0-3) B04(0-3) B05(0-3) B04(0-3) B05(0-3)
5746 const __m256i rhs_mat_2367_00_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_00, 136); //B02(0-3) B03(0-3) B02(0-3) B03(0-3) B06(0-3) B07(0-3) B06(0-3) B07(0-3)
5747
5748 const __m256i rhs_mat_0145_01_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_01, 136); //B00(8-11) B01(8-11) B00(8-11) B01(8-11) B04(8-11) B05(8-11) B04(8-11) B05(8-11)
5749 const __m256i rhs_mat_2367_01_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_01, 136); //B02(8-11) B03(8-11) B02(8-11) B03(8-11) B06(8-11) B07(8-11) B06(8-11) B07(8-11)
5750
5751 const __m256i rhs_mat_0145_10_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_10, 136); //B10(0-3) B11(0-3) B10(0-3) B11(0-3) B14(0-3) B15(0-3) B14(0-3) B15(0-3)
5752 const __m256i rhs_mat_2367_10_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_10, 136); //B12(0-3) B13(0-3) B12(0-3) B13(0-3) B16(0-3) B17(0-3) B16(0-3) B17(0-3)
5753
5754 const __m256i rhs_mat_0145_11_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_11, 136); //B10(8-11) B11(8-11) B10(8-11) B11(8-11) B14(8-11) B15(8-11) B14(8-11) B15(8-11)
5755 const __m256i rhs_mat_2367_11_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_11, 136); //B12(8-11) B13(8-11) B12(8-11) B13(8-11) B16(8-11) B17(8-11) B16(8-11) B17(8-11)
5756
5757 const __m256i rhs_mat_0145_20_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_20, 136); //B20(0-3) B21(0-3) B20(0-3) B21(0-3) B24(0-3) B25(0-3) B24(0-3) B25(0-3)
5758 const __m256i rhs_mat_2367_20_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_20, 136); //B22(0-3) B23(0-3) B22(0-3) B23(0-3) B26(0-3) B27(0-3) B26(0-3) B27(0-3)
5759
5760 const __m256i rhs_mat_0145_21_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_21, 136); //B20(8-11) B21(8-11) B20(8-11) B21(8-11) B24(8-11) B25(8-11) B24(8-11) B25(8-11)
5761 const __m256i rhs_mat_2367_21_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_21, 136); //B22(8-11) B23(8-11) B22(8-11) B23(8-11) B26(8-11) B27(8-11) B26(8-11) B27(8-11)
5762
5763 const __m256i rhs_mat_0145_30_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_30, 136); //B30(0-3) B31(0-3) B30(0-3) B31(0-3) B34(0-3) B35(0-3) B34(0-3) B35(0-3)
5764 const __m256i rhs_mat_2367_30_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_30, 136); //B32(0-3) B33(0-3) B32(0-3) B33(0-3) B36(0-3) B37(0-3) B36(0-3) B37(0-3)
5765
5766                         const __m256i rhs_mat_0145_31_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_31, 136); //B30(8-11) B31(8-11) B30(8-11) B31(8-11) B34(8-11) B35(8-11) B34(8-11) B35(8-11)
5767 const __m256i rhs_mat_2367_31_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_31, 136); //B32(8-11) B33(8-11) B32(8-11) B33(8-11) B36(8-11) B37(8-11) B36(8-11) B37(8-11)
5768
5769 const __m256i rhs_mat_0145_40_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_40, 136); //B40(0-3) B41(0-3) B40(0-3) B41(0-3) B44(0-3) B45(0-3) B44(0-3) B45(0-3)
5770 const __m256i rhs_mat_2367_40_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_40, 136); //B42(0-3) B43(0-3) B42(0-3) B43(0-3) B46(0-3) B47(0-3) B46(0-3) B47(0-3)
5771
5772 const __m256i rhs_mat_0145_41_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_41, 136); //B40(8-11) B41(8-11) B40(8-11) B41(8-11) B44(8-11) B45(8-11) B44(8-11) B45(8-11)
5773 const __m256i rhs_mat_2367_41_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_41, 136); //B42(8-11) B43(8-11) B42(8-11) B43(8-11) B46(8-11) B47(8-11) B46(8-11) B47(8-11)
5774
5775 const __m256i rhs_mat_0145_50_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_50, 136); //B50(0-3) B51(0-3) B50(0-3) B51(0-3) B54(0-3) B55(0-3) B54(0-3) B55(0-3)
5776 const __m256i rhs_mat_2367_50_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_50, 136); //B52(0-3) B53(0-3) B52(0-3) B53(0-3) B56(0-3) B57(0-3) B56(0-3) B57(0-3)
5777
5778 const __m256i rhs_mat_0145_51_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_51, 136); //B50(8-11) B51(8-11) B50(8-11) B51(8-11) B54(8-11) B55(8-11) B54(8-11) B55(8-11)
5779 const __m256i rhs_mat_2367_51_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_51, 136); //B52(8-11) B53(8-11) B52(8-11) B53(8-11) B56(8-11) B57(8-11) B56(8-11) B57(8-11)
5780
5781 const __m256i rhs_mat_0145_60_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_60, 136); //B60(0-3) B61(0-3) B60(0-3) B61(0-3) B64(0-3) B65(0-3) B64(0-3) B65(0-3)
5782 const __m256i rhs_mat_2367_60_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_60, 136); //B62(0-3) B63(0-3) B62(0-3) B63(0-3) B66(0-3) B67(0-3) B66(0-3) B67(0-3)
5783
5784 const __m256i rhs_mat_0145_61_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_61, 136); //B60(8-11) B61(8-11) B60(8-11) B61(8-11) B64(8-11) B65(8-11) B64(8-11) B65(8-11)
5785 const __m256i rhs_mat_2367_61_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_61, 136); //B62(8-11) B63(8-11) B62(8-11) B63(8-11) B66(8-11) B67(8-11) B66(8-11) B67(8-11)
5786
5787 const __m256i rhs_mat_0145_70_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_70, 136); //B70(0-3) B71(0-3) B70(0-3) B71(0-3) B74(0-3) B75(0-3) B74(0-3) B75(0-3)
5788 const __m256i rhs_mat_2367_70_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_70, 136); //B72(0-3) B73(0-3) B72(0-3) B73(0-3) B76(0-3) B77(0-3) B76(0-3) B77(0-3)
5789
5790 const __m256i rhs_mat_0145_71_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_71, 136); //B70(8-11) B71(8-11) B70(8-11) B71(8-11) B74(8-11) B75(8-11) B74(8-11) B75(8-11)
5791 const __m256i rhs_mat_2367_71_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_71, 136); //B72(8-11) B73(8-11) B72(8-11) B73(8-11) B76(8-11) B77(8-11) B76(8-11) B77(8-11)
5792
5793
5794 // Shuffle pattern two - right side input
5795 const __m256i rhs_mat_0145_00_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_00, 221); //B00(4-7) B01(4-7) B00(4-7) B01(4-7) B04(4-7) B05(4-7) B04(4-7) B05(4-7)
5796 const __m256i rhs_mat_2367_00_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_00, 221); //B02(4-7) B03(4-7) B02(4-7) B03(4-7) B06(4-7) B07(4-7) B06(4-7) B07(4-7)
5797
5798 const __m256i rhs_mat_0145_01_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_01, 221); //B00(12-15) B01(12-15) B00(12-15) B01(12-15) B04(12-15) B05(12-15) B04(12-15) B05(12-15)
5799 const __m256i rhs_mat_2367_01_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_01, 221); //B02(12-15) B03(12-15) B02(12-15) B03(12-15) B06(12-15) B07(12-15) B06(12-15) B07(12-15)
5800
5801 const __m256i rhs_mat_0145_10_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_10, 221); //B10(4-7) B11(4-7) B10(4-7) B11(4-7) B14(4-7) B15(4-7) B14(4-7) B15(4-7)
5802 const __m256i rhs_mat_2367_10_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_10, 221); //B12(4-7) B13(4-7) B12(4-7) B13(4-7) B16(4-7) B17(4-7) B16(4-7) B17(4-7)
5803
5804 const __m256i rhs_mat_0145_11_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_11, 221); //B10(12-15) B11(12-15) B10(12-15) B11(12-15) B14(12-15) B15(12-15) B14(12-15) B15(12-15)
5805 const __m256i rhs_mat_2367_11_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_11, 221); //B12(12-15) B13(12-15) B12(12-15) B13(12-15) B16(12-15) B17(12-15) B16(12-15) B17(12-15)
5806
5807 const __m256i rhs_mat_0145_20_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_20, 221); //B20(4-7) B21(4-7) B20(4-7) B21(4-7) B24(4-7) B25(4-7) B24(4-7) B25(4-7)
5808 const __m256i rhs_mat_2367_20_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_20, 221); //B22(4-7) B23(4-7) B22(4-7) B23(4-7) B26(4-7) B27(4-7) B26(4-7) B27(4-7)
5809
5810 const __m256i rhs_mat_0145_21_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_21, 221); //B20(12-15) B21(12-15) B20(12-15) B21(12-15) B24(12-15) B25(12-15) B24(12-15) B25(12-15)
5811 const __m256i rhs_mat_2367_21_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_21, 221); //B22(12-15) B23(12-15) B22(12-15) B23(12-15) B26(12-15) B27(12-15) B26(12-15) B27(12-15)
5812
5813 const __m256i rhs_mat_0145_30_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_30, 221); //B30(4-7) B31(4-7) B30(4-7) B31(4-7) B34(4-7) B35(4-7) B34(4-7) B35(4-7)
5814 const __m256i rhs_mat_2367_30_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_30, 221); //B32(4-7) B33(4-7) B32(4-7) B33(4-7) B36(4-7) B37(4-7) B36(4-7) B37(4-7)
5815
5816 const __m256i rhs_mat_0145_31_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_31, 221); //B30(12-15) B31(12-15) B30(12-15) B31(12-15) B34(12-15) B35(12-15) B34(12-15) B35(12-15)
5817 const __m256i rhs_mat_2367_31_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_31, 221); //B32(12-15) B33(12-15) B32(12-15) B33(12-15) B36(12-15) B37(12-15) B36(12-15) B37(12-15)
5818
5819 const __m256i rhs_mat_0145_40_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_40, 221); //B40(4-7) B41(4-7) B40(4-7) B41(4-7) B44(4-7) B45(4-7) B44(4-7) B45(4-7)
5820 const __m256i rhs_mat_2367_40_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_40, 221); //B42(4-7) B43(4-7) B42(4-7) B43(4-7) B46(4-7) B47(4-7) B46(4-7) B47(4-7)
5821
5822 const __m256i rhs_mat_0145_41_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_41, 221); //B40(12-15) B41(12-15) B40(12-15) B41(12-15) B44(12-15) B45(12-15) B44(12-15) B45(12-15)
5823 const __m256i rhs_mat_2367_41_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_41, 221); //B42(12-15) B43(12-15) B42(12-15) B43(12-15) B46(12-15) B47(12-15) B46(12-15) B47(12-15)
5824
5825 const __m256i rhs_mat_0145_50_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_50, 221); //B50(4-7) B51(4-7) B50(4-7) B51(4-7) B54(4-7) B55(4-7) B54(4-7) B55(4-7)
5826 const __m256i rhs_mat_2367_50_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_50, 221); //B52(4-7) B53(4-7) B52(4-7) B53(4-7) B56(4-7) B57(4-7) B56(4-7) B57(4-7)
5827
5828 const __m256i rhs_mat_0145_51_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_51, 221); //B50(12-15) B51(12-15) B50(12-15) B51(12-15) B54(12-15) B55(12-15) B54(12-15) B55(12-15)
5829 const __m256i rhs_mat_2367_51_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_51, 221); //B52(12-15) B53(12-15) B52(12-15) B53(12-15) B56(12-15) B57(12-15) B56(12-15) B57(12-15)
5830
5831 const __m256i rhs_mat_0145_60_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_60, 221); //B60(4-7) B61(4-7) B60(4-7) B61(4-7) B64(4-7) B65(4-7) B64(4-7) B65(4-7)
5832 const __m256i rhs_mat_2367_60_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_60, 221); //B62(4-7) B63(4-7) B62(4-7) B63(4-7) B66(4-7) B67(4-7) B66(4-7) B67(4-7)
5833
5834 const __m256i rhs_mat_0145_61_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_61, 221); //B60(12-15) B61(12-15) B60(12-15) B61(12-15) B64(12-15) B65(12-15) B64(12-15) B65(12-15)
5835 const __m256i rhs_mat_2367_61_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_61, 221); //B62(12-15) B63(12-15) B62(12-15) B63(12-15) B66(12-15) B67(12-15) B66(12-15) B67(12-15)
5836
5837 const __m256i rhs_mat_0145_70_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_70, 221); //B70(4-7) B71(4-7) B70(4-7) B71(4-7) B74(4-7) B75(4-7) B74(4-7) B75(4-7)
5838 const __m256i rhs_mat_2367_70_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_70, 221); //B72(4-7) B73(4-7) B72(4-7) B73(4-7) B76(4-7) B77(4-7) B76(4-7) B77(4-7)
5839
5840 const __m256i rhs_mat_0145_71_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_71, 221); //B70(12-15) B71(12-15) B70(12-15) B71(12-15) B74(12-15) B75(12-15) B74(12-15) B75(12-15)
5841 const __m256i rhs_mat_2367_71_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_71, 221); //B72(12-15) B73(12-15) B72(12-15) B73(12-15) B76(12-15) B77(12-15) B76(12-15) B77(12-15)
5842
5843
5844                    // Scales and mins of corresponding sub blocks from different Q2_K structures are stored together
5845 //s00 m00 s01 m01 s10 m10 s11 m11 s20 m20 s21 m21 s30 m30 s31 m31 s40 m40 s41 m41 s50 m50 s51 m51 s60 m60 s61 m61 s70 m70 s71 m71
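                    // Each of those bytes packs a 4-bit scale in its low nibble and a 4-bit min
                    // in its high nibble (the usual Q2_K packing); below, masking with m4b_sse
                    // (presumably a 0x0F byte mask) keeps the scales, while a 4-bit right shift
                    // plus the same mask extracts the mins, which are widened to 16 bits so they
                    // can be fed to _mm256_madd_epi16.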
5846
5847 // Combine mins and scales for sub-blocks: 0-1, 2-3, 4-5, 6-7 in the sb loop
5848 const __m128i mins_and_scales_01 = _mm_loadu_si128(p: (const __m128i *)(b_ptr[b].scales + sb * 64));
5849 const __m128i mins_and_scales_23 = _mm_loadu_si128(p: (const __m128i *)(b_ptr[b].scales + 16 + sb * 64));
5850 const __m128i mins_and_scales_45 = _mm_loadu_si128(p: (const __m128i *)(b_ptr[b].scales + 32 + sb * 64));
5851 const __m128i mins_and_scales_67 = _mm_loadu_si128(p: (const __m128i *)(b_ptr[b].scales + 48 + sb * 64));
5852
5853                    // Extract the scales, which occupy the lower nibble of each mins_and_scales byte
5854 const __m128i scales_01 = _mm_and_si128(a: mins_and_scales_01, b: m4b_sse);
5855 const __m128i scales_23 = _mm_and_si128(a: mins_and_scales_23, b: m4b_sse);
5856 const __m128i scales_45 = _mm_and_si128(a: mins_and_scales_45, b: m4b_sse);
5857 const __m128i scales_67 = _mm_and_si128(a: mins_and_scales_67, b: m4b_sse);
5858
5859                    // Extract the mins, which occupy the upper nibble of each mins_and_scales byte
5860 const __m256i mins_01 = _mm256_cvtepu8_epi16(V: _mm_and_si128(a: _mm_srli_epi16(a: mins_and_scales_01, count: 4), b: m4b_sse));
5861 const __m256i mins_23 = _mm256_cvtepu8_epi16(V: _mm_and_si128(a: _mm_srli_epi16(a: mins_and_scales_23, count: 4), b: m4b_sse));
5862 const __m256i mins_45 = _mm256_cvtepu8_epi16(V: _mm_and_si128(a: _mm_srli_epi16(a: mins_and_scales_45, count: 4), b: m4b_sse));
5863 const __m256i mins_67 = _mm256_cvtepu8_epi16(V: _mm_and_si128(a: _mm_srli_epi16(a: mins_and_scales_67, count: 4), b: m4b_sse));
5864
5865 const __m256i scales_0 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_01, b: scalesmask1_sse));
5866 const __m256i scales_1 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_01, b: scalesmask2_sse));
5867
5868 const __m256i scales_2 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_23, b: scalesmask1_sse));
5869 const __m256i scales_3 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_23, b: scalesmask2_sse));
5870
5871 const __m256i scales_4 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_45, b: scalesmask1_sse));
5872 const __m256i scales_5 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_45, b: scalesmask2_sse));
5873
5874 const __m256i scales_6 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_67, b: scalesmask1_sse));
5875 const __m256i scales_7 = _mm256_cvtepu8_epi16(V: _mm_shuffle_epi8(a: scales_67, b: scalesmask2_sse));
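                    // Each 16 byte mins_and_scales load covers one pair of sub-blocks for all eight
                    // interleaved Q2_K columns: one byte per (column, sub-block), with the 4 bit
                    // scale in the low nibble and the 4 bit min in the high nibble (see the layout
                    // sketch above), hence the 16 byte steps within an sb stride of 64. The
                    // scalesmask1_sse / scalesmask2_sse shuffles (set up earlier in this function)
                    // are assumed to separate the bytes of the two sub-blocks, so that each scales_N
                    // vector carries the scales of a single sub-block, zero extended to 16 bits.
                    // Per packed byte the unpacking is simply (scalar sketch, illustrative only):
                    //     scale =  packed       & 0x0F;   // low nibble
                    //     min   = (packed >> 4) & 0x0F;   // high nibble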
5876
5877 const __m256i scale_0145_0 = _mm256_shuffle_epi32(scales_0, 68);
5878 const __m256i scale_2367_0 = _mm256_shuffle_epi32(scales_0, 238);
5879
5880 const __m256i scale_0145_1 = _mm256_shuffle_epi32(scales_1, 68);
5881 const __m256i scale_2367_1 = _mm256_shuffle_epi32(scales_1, 238);
5882
5883 const __m256i scale_0145_2 = _mm256_shuffle_epi32(scales_2, 68);
5884 const __m256i scale_2367_2 = _mm256_shuffle_epi32(scales_2, 238);
5885
5886 const __m256i scale_0145_3 = _mm256_shuffle_epi32(scales_3, 68);
5887 const __m256i scale_2367_3 = _mm256_shuffle_epi32(scales_3, 238);
5888
5889 const __m256i scale_0145_4 = _mm256_shuffle_epi32(scales_4, 68);
5890 const __m256i scale_2367_4 = _mm256_shuffle_epi32(scales_4, 238);
5891
5892 const __m256i scale_0145_5 = _mm256_shuffle_epi32(scales_5, 68);
5893 const __m256i scale_2367_5 = _mm256_shuffle_epi32(scales_5, 238);
5894
5895 const __m256i scale_0145_6 = _mm256_shuffle_epi32(scales_6, 68);
5896 const __m256i scale_2367_6 = _mm256_shuffle_epi32(scales_6, 238);
5897
5898 const __m256i scale_0145_7 = _mm256_shuffle_epi32(scales_7, 68);
5899 const __m256i scale_2367_7 = _mm256_shuffle_epi32(scales_7, 238);
5900
5901                     // Load the four block_q8_K quantized rows, interleaved with each other in chunks of eight bytes - A0,A1,A2,A3
5902                     // Each half of a 256 bit load is a 128 bit row pair that is then repeated across the full 256 bit vector
5903 __m256i lhs_mat_0123_00 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 512 * sb)));
5904 __m256i lhs_mat_01_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 0);
5905 __m256i lhs_mat_23_00 = _mm256_permute2f128_si256(lhs_mat_0123_00, lhs_mat_0123_00, 17);
5906 __m256i lhs_mat_0123_01 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 32 + 512 * sb)));
5907 __m256i lhs_mat_01_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 0);
5908 __m256i lhs_mat_23_01 = _mm256_permute2f128_si256(lhs_mat_0123_01, lhs_mat_0123_01, 17);
5909 __m256i lhs_mat_0123_10 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 64 + 512 * sb)));
5910 __m256i lhs_mat_01_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 0);
5911 __m256i lhs_mat_23_10 = _mm256_permute2f128_si256(lhs_mat_0123_10, lhs_mat_0123_10, 17);
5912 __m256i lhs_mat_0123_11 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 96 + 512 * sb)));
5913 __m256i lhs_mat_01_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 0);
5914 __m256i lhs_mat_23_11 = _mm256_permute2f128_si256(lhs_mat_0123_11, lhs_mat_0123_11, 17);
5915 __m256i lhs_mat_0123_20 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 128 + 512 * sb)));
5916 __m256i lhs_mat_01_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 0);
5917 __m256i lhs_mat_23_20 = _mm256_permute2f128_si256(lhs_mat_0123_20, lhs_mat_0123_20, 17);
5918 __m256i lhs_mat_0123_21 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 160 + 512 * sb)));
5919 __m256i lhs_mat_01_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 0);
5920 __m256i lhs_mat_23_21 = _mm256_permute2f128_si256(lhs_mat_0123_21, lhs_mat_0123_21, 17);
5921 __m256i lhs_mat_0123_30 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 192 + 512 * sb)));
5922 __m256i lhs_mat_01_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 0);
5923 __m256i lhs_mat_23_30 = _mm256_permute2f128_si256(lhs_mat_0123_30, lhs_mat_0123_30, 17);
5924 __m256i lhs_mat_0123_31 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 224 + 512 * sb)));
5925 __m256i lhs_mat_01_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 0);
5926 __m256i lhs_mat_23_31 = _mm256_permute2f128_si256(lhs_mat_0123_31, lhs_mat_0123_31, 17);
5927
5928 __m256i lhs_mat_0123_40 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 256 + 512 * sb)));
5929 __m256i lhs_mat_01_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 0);
5930 __m256i lhs_mat_23_40 = _mm256_permute2f128_si256(lhs_mat_0123_40, lhs_mat_0123_40, 17);
5931 __m256i lhs_mat_0123_41 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 288 + 512 * sb)));
5932 __m256i lhs_mat_01_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 0);
5933 __m256i lhs_mat_23_41 = _mm256_permute2f128_si256(lhs_mat_0123_41, lhs_mat_0123_41, 17);
5934 __m256i lhs_mat_0123_50 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 320 + 512 * sb)));
5935 __m256i lhs_mat_01_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 0);
5936 __m256i lhs_mat_23_50 = _mm256_permute2f128_si256(lhs_mat_0123_50, lhs_mat_0123_50, 17);
5937 __m256i lhs_mat_0123_51 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 352 + 512 * sb)));
5938 __m256i lhs_mat_01_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 0);
5939 __m256i lhs_mat_23_51 = _mm256_permute2f128_si256(lhs_mat_0123_51, lhs_mat_0123_51, 17);
5940 __m256i lhs_mat_0123_60 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 384 + 512 * sb)));
5941 __m256i lhs_mat_01_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 0);
5942 __m256i lhs_mat_23_60 = _mm256_permute2f128_si256(lhs_mat_0123_60, lhs_mat_0123_60, 17);
5943 __m256i lhs_mat_0123_61 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 416 + 512 * sb)));
5944 __m256i lhs_mat_01_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 0);
5945 __m256i lhs_mat_23_61 = _mm256_permute2f128_si256(lhs_mat_0123_61, lhs_mat_0123_61, 17);
5946 __m256i lhs_mat_0123_70 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 448 + 512 * sb)));
5947 __m256i lhs_mat_01_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 0);
5948 __m256i lhs_mat_23_70 = _mm256_permute2f128_si256(lhs_mat_0123_70, lhs_mat_0123_70, 17);
5949 __m256i lhs_mat_0123_71 = _mm256_loadu_si256(p: (const __m256i * )((a_ptr[b].qs + 480 + 512 * sb)));
5950 __m256i lhs_mat_01_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 0);
5951 __m256i lhs_mat_23_71 = _mm256_permute2f128_si256(lhs_mat_0123_71, lhs_mat_0123_71, 17);
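                    // Layout note: the repacked Q8_K activations store the four rows interleaved in
                    // 8 byte chunks, so every 256 bit load carries rows 0-1 in its low 128 bits and
                    // rows 2-3 in its high 128 bits. The permute2f128 control values 0 and 17 then
                    // duplicate the low / high half across the full register so that each row pair
                    // can be matched against both RHS column groups. One sb iteration consumes
                    // 512 bytes here: 4 rows x 128 values, i.e. the eight 16 element sub-blocks
                    // handled below.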
5952
5953 // Bsums are loaded for the different Q8_K blocks
5954 __m128i lhs_raw_bsums_01_0123 = _mm_loadu_si128(p: (const __m128i *)((a_ptr[b].bsums + 32 * sb)));
5955 __m128i lhs_raw_bsums_23_0123 = _mm_loadu_si128(p: (const __m128i *)(a_ptr[b].bsums + 8 + 32 * sb));
5956 __m128i lhs_raw_bsums_01_4567 = _mm_loadu_si128(p: (const __m128i *)((a_ptr[b].bsums + 16 + 32 * sb)));
5957 __m128i lhs_raw_bsums_23_4567 = _mm_loadu_si128(p: (const __m128i *)(a_ptr[b].bsums + 24 + 32 * sb));
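                    // bsums holds, per Q8_K row, the sums of the quantized activations over 16
                    // element groups. They are needed because a Q2_K weight dequantizes as
                    //     w = d * scale * q  -  dmin * min
                    // so a sub-block dot product against Q8_K activations (a8 with scale d8) splits
                    // into
                    //     sum(w * a)  =  d * d8 * scale * sum(q * a8)  -  dmin * d8 * min * sum(a8)
                    // The first term is built from the maddubs/madd chain below and accumulated
                    // into acc_rows; the second reuses these precomputed sum(a8) values (the bsums)
                    // and is accumulated into acc_min_rows, to be subtracted at the end.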
5958
5959 // Shuffle pattern one - left side input
5960 const __m256i lhs_mat_01_00_sp1 = _mm256_shuffle_epi32(lhs_mat_01_00, 160); //A00(0-3) A00(0-3) A01(0-3) A01(0-3) A00(0-3) A00(0-3) A01(0-3) A01(0-3)
5961 const __m256i lhs_mat_23_00_sp1 = _mm256_shuffle_epi32(lhs_mat_23_00, 160); //A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3) A02(0-3) A03(0-3)
5962
5963 const __m256i lhs_mat_01_01_sp1 = _mm256_shuffle_epi32(lhs_mat_01_01, 160); //A00(8-11) A00(8-11) A01(8-11) A01(8-11) A00(8-11) A00(8-11) A01(8-11) A01(8-11)
5964 const __m256i lhs_mat_23_01_sp1 = _mm256_shuffle_epi32(lhs_mat_23_01, 160); //A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11) A02(8-11) A03(8-11)
5965
5966 const __m256i lhs_mat_01_10_sp1 = _mm256_shuffle_epi32(lhs_mat_01_10, 160); //A10(0-3) A10(0-3) A11(0-3) A11(0-3) A10(0-3) A10(0-3) A11(0-3) A11(0-3)
5967 const __m256i lhs_mat_23_10_sp1 = _mm256_shuffle_epi32(lhs_mat_23_10, 160); //A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3) A12(0-3) A13(0-3)
5968
5969 const __m256i lhs_mat_01_11_sp1 = _mm256_shuffle_epi32(lhs_mat_01_11, 160); //A10(8-11) A10(8-11) A11(8-11) A11(8-11) A10(8-11) A10(8-11) A11(8-11) A11(8-11)
5970 const __m256i lhs_mat_23_11_sp1 = _mm256_shuffle_epi32(lhs_mat_23_11, 160); //A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11) A12(8-11) A13(8-11)
5971
5972 const __m256i lhs_mat_01_20_sp1 = _mm256_shuffle_epi32(lhs_mat_01_20, 160); //A20(0-3) A20(0-3) A21(0-3) A21(0-3) A20(0-3) A20(0-3) A21(0-3) A21(0-3)
5973 const __m256i lhs_mat_23_20_sp1 = _mm256_shuffle_epi32(lhs_mat_23_20, 160); //A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3) A22(0-3) A23(0-3)
5974
5975 const __m256i lhs_mat_01_21_sp1 = _mm256_shuffle_epi32(lhs_mat_01_21, 160); //A20(8-11) A20(8-11) A21(8-11) A21(8-11) A20(8-11) A20(8-11) A21(8-11) A21(8-11)
5976 const __m256i lhs_mat_23_21_sp1 = _mm256_shuffle_epi32(lhs_mat_23_21, 160); //A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11) A22(8-11) A23(8-11)
5977
5978 const __m256i lhs_mat_01_30_sp1 = _mm256_shuffle_epi32(lhs_mat_01_30, 160); //A30(0-3) A30(0-3) A31(0-3) A31(0-3) A30(0-3) A30(0-3) A31(0-3) A31(0-3)
5979 const __m256i lhs_mat_23_30_sp1 = _mm256_shuffle_epi32(lhs_mat_23_30, 160); //A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3) A32(0-3) A33(0-3)
5980
5981 const __m256i lhs_mat_01_31_sp1 = _mm256_shuffle_epi32(lhs_mat_01_31, 160); //A30(8-11) A30(8-11) A31(8-11) A31(8-11) A30(8-11) A30(8-11) A31(8-11) A31(8-11)
5982 const __m256i lhs_mat_23_31_sp1 = _mm256_shuffle_epi32(lhs_mat_23_31, 160); //A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11) A32(8-11) A33(8-11)
5983
5984 const __m256i lhs_mat_01_40_sp1 = _mm256_shuffle_epi32(lhs_mat_01_40, 160); //A40(0-3) A40(0-3) A41(0-3) A41(0-3) A40(0-3) A40(0-3) A41(0-3) A41(0-3)
5985 const __m256i lhs_mat_23_40_sp1 = _mm256_shuffle_epi32(lhs_mat_23_40, 160); //A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3) A42(0-3) A43(0-3)
5986
5987 const __m256i lhs_mat_01_41_sp1 = _mm256_shuffle_epi32(lhs_mat_01_41, 160); //A40(8-11) A40(8-11) A41(8-11) A41(8-11) A40(8-11) A40(8-11) A41(8-11) A41(8-11)
5988 const __m256i lhs_mat_23_41_sp1 = _mm256_shuffle_epi32(lhs_mat_23_41, 160); //A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11) A42(8-11) A43(8-11)
5989
5990 const __m256i lhs_mat_01_50_sp1 = _mm256_shuffle_epi32(lhs_mat_01_50, 160); //A50(0-3) A50(0-3) A51(0-3) A51(0-3) A50(0-3) A50(0-3) A51(0-3) A51(0-3)
5991 const __m256i lhs_mat_23_50_sp1 = _mm256_shuffle_epi32(lhs_mat_23_50, 160); //A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3) A52(0-3) A53(0-3)
5992
5993 const __m256i lhs_mat_01_51_sp1 = _mm256_shuffle_epi32(lhs_mat_01_51, 160); //A50(8-11) A50(8-11) A51(8-11) A51(8-11) A50(8-11) A50(8-11) A51(8-11) A51(8-11)
5994 const __m256i lhs_mat_23_51_sp1 = _mm256_shuffle_epi32(lhs_mat_23_51, 160); //A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11) A52(8-11) A53(8-11)
5995
5996 const __m256i lhs_mat_01_60_sp1 = _mm256_shuffle_epi32(lhs_mat_01_60, 160); //A60(0-3) A60(0-3) A61(0-3) A61(0-3) A60(0-3) A60(0-3) A61(0-3) A61(0-3)
5997 const __m256i lhs_mat_23_60_sp1 = _mm256_shuffle_epi32(lhs_mat_23_60, 160); //A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3) A62(0-3) A63(0-3)
5998
5999 const __m256i lhs_mat_01_61_sp1 = _mm256_shuffle_epi32(lhs_mat_01_61, 160); //A60(8-11) A60(8-11) A61(8-11) A61(8-11) A60(8-11) A60(8-11) A61(8-11) A61(8-11)
6000 const __m256i lhs_mat_23_61_sp1 = _mm256_shuffle_epi32(lhs_mat_23_61, 160); //A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11) A62(8-11) A63(8-11)
6001
6002 const __m256i lhs_mat_01_70_sp1 = _mm256_shuffle_epi32(lhs_mat_01_70, 160); //A70(0-3) A70(0-3) A71(0-3) A71(0-3) A70(0-3) A70(0-3) A71(0-3) A71(0-3)
6003 const __m256i lhs_mat_23_70_sp1 = _mm256_shuffle_epi32(lhs_mat_23_70, 160); //A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3) A72(0-3) A73(0-3)
6004
6005 const __m256i lhs_mat_01_71_sp1 = _mm256_shuffle_epi32(lhs_mat_01_71, 160); //A70(8-11) A70(8-11) A71(8-11) A71(8-11) A70(8-11) A70(8-11) A71(8-11) A71(8-11)
6006 const __m256i lhs_mat_23_71_sp1 = _mm256_shuffle_epi32(lhs_mat_23_71, 160); //A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11) A72(8-11) A73(8-11)
6007
6008                     // Shuffle pattern two - left side input
6009 const __m256i lhs_mat_01_00_sp2 = _mm256_shuffle_epi32(lhs_mat_01_00, 245); //A00(4-7) A00(4-7) A01(4-7) A01(4-7) A00(4-7) A00(4-7) A01(4-7) A01(4-7)
6010 const __m256i lhs_mat_23_00_sp2 = _mm256_shuffle_epi32(lhs_mat_23_00, 245); //A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7) A02(4-7) A03(4-7)
6011
6012 const __m256i lhs_mat_01_01_sp2 = _mm256_shuffle_epi32(lhs_mat_01_01, 245); //A00(12-15) A00(12-15) A01(12-15) A01(12-15) A00(12-15) A00(12-15) A01(12-15) A01(12-15)
6013 const __m256i lhs_mat_23_01_sp2 = _mm256_shuffle_epi32(lhs_mat_23_01, 245); //A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15) A02(12-15) A03(12-15)
6014
6015 const __m256i lhs_mat_01_10_sp2 = _mm256_shuffle_epi32(lhs_mat_01_10, 245); //A10(4-7) A10(4-7) A11(4-7) A11(4-7) A10(4-7) A10(4-7) A11(4-7) A11(4-7)
6016 const __m256i lhs_mat_23_10_sp2 = _mm256_shuffle_epi32(lhs_mat_23_10, 245); //A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7) A12(4-7) A13(4-7)
6017
6018 const __m256i lhs_mat_01_11_sp2 = _mm256_shuffle_epi32(lhs_mat_01_11, 245); //A10(12-15) A10(12-15) A11(12-15) A11(12-15) A10(12-15) A10(12-15) A11(12-15) A11(12-15)
6019 const __m256i lhs_mat_23_11_sp2 = _mm256_shuffle_epi32(lhs_mat_23_11, 245); //A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15) A12(12-15) A13(12-15)
6020
6021 const __m256i lhs_mat_01_20_sp2 = _mm256_shuffle_epi32(lhs_mat_01_20, 245); //A20(4-7) A20(4-7) A21(4-7) A21(4-7) A20(4-7) A20(4-7) A21(4-7) A21(4-7)
6022 const __m256i lhs_mat_23_20_sp2 = _mm256_shuffle_epi32(lhs_mat_23_20, 245); //A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7) A22(4-7) A23(4-7)
6023
6024 const __m256i lhs_mat_01_21_sp2 = _mm256_shuffle_epi32(lhs_mat_01_21, 245); //A20(12-15) A20(12-15) A21(12-15) A21(12-15) A20(12-15) A20(12-15) A21(12-15) A21(12-15)
6025 const __m256i lhs_mat_23_21_sp2 = _mm256_shuffle_epi32(lhs_mat_23_21, 245); //A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15) A22(12-15) A23(12-15)
6026
6027 const __m256i lhs_mat_01_30_sp2 = _mm256_shuffle_epi32(lhs_mat_01_30, 245); //A30(4-7) A30(4-7) A31(4-7) A31(4-7) A30(4-7) A30(4-7) A31(4-7) A31(4-7)
6028 const __m256i lhs_mat_23_30_sp2 = _mm256_shuffle_epi32(lhs_mat_23_30, 245); //A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7) A32(4-7) A33(4-7)
6029
6030 const __m256i lhs_mat_01_31_sp2 = _mm256_shuffle_epi32(lhs_mat_01_31, 245); //A30(12-15) A30(12-15) A31(12-15) A31(12-15) A30(12-15) A30(12-15) A31(12-15) A31(12-15)
6031 const __m256i lhs_mat_23_31_sp2 = _mm256_shuffle_epi32(lhs_mat_23_31, 245); //A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15) A32(12-15) A33(12-15)
6032
6033 const __m256i lhs_mat_01_40_sp2 = _mm256_shuffle_epi32(lhs_mat_01_40, 245); //A40(4-7) A40(4-7) A41(4-7) A41(4-7) A40(4-7) A40(4-7) A41(4-7) A41(4-7)
6034 const __m256i lhs_mat_23_40_sp2 = _mm256_shuffle_epi32(lhs_mat_23_40, 245); //A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7) A42(4-7) A43(4-7)
6035
6036 const __m256i lhs_mat_01_41_sp2 = _mm256_shuffle_epi32(lhs_mat_01_41, 245); //A40(12-15) A40(12-15) A41(12-15) A41(12-15) A40(12-15) A40(12-15) A41(12-15) A41(12-15)
6037 const __m256i lhs_mat_23_41_sp2 = _mm256_shuffle_epi32(lhs_mat_23_41, 245); //A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15) A42(12-15) A43(12-15)
6038
6039 const __m256i lhs_mat_01_50_sp2 = _mm256_shuffle_epi32(lhs_mat_01_50, 245); //A50(4-7) A50(4-7) A51(4-7) A51(4-7) A50(4-7) A50(4-7) A51(4-7) A51(4-7)
6040 const __m256i lhs_mat_23_50_sp2 = _mm256_shuffle_epi32(lhs_mat_23_50, 245); //A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7) A52(4-7) A53(4-7)
6041
6042 const __m256i lhs_mat_01_51_sp2 = _mm256_shuffle_epi32(lhs_mat_01_51, 245); //A50(12-15) A50(12-15) A51(12-15) A51(12-15) A50(12-15) A50(12-15) A51(12-15) A51(12-15)
6043 const __m256i lhs_mat_23_51_sp2 = _mm256_shuffle_epi32(lhs_mat_23_51, 245); //A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15) A52(12-15) A53(12-15)
6044
6045 const __m256i lhs_mat_01_60_sp2 = _mm256_shuffle_epi32(lhs_mat_01_60, 245); //A60(4-7) A60(4-7) A61(4-7) A61(4-7) A60(4-7) A60(4-7) A61(4-7) A61(4-7)
6046 const __m256i lhs_mat_23_60_sp2 = _mm256_shuffle_epi32(lhs_mat_23_60, 245); //A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7) A62(4-7) A63(4-7)
6047
6048 const __m256i lhs_mat_01_61_sp2 = _mm256_shuffle_epi32(lhs_mat_01_61, 245); //A60(12-15) A60(12-15) A61(12-15) A61(12-15) A60(12-15) A60(12-15) A61(12-15) A61(12-15)
6049 const __m256i lhs_mat_23_61_sp2 = _mm256_shuffle_epi32(lhs_mat_23_61, 245); //A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15) A62(12-15) A63(12-15)
6050
6051 const __m256i lhs_mat_01_70_sp2 = _mm256_shuffle_epi32(lhs_mat_01_70, 245); //A70(4-7) A70(4-7) A71(4-7) A71(4-7) A70(4-7) A70(4-7) A71(4-7) A71(4-7)
6052 const __m256i lhs_mat_23_70_sp2 = _mm256_shuffle_epi32(lhs_mat_23_70, 245); //A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7) A72(4-7) A73(4-7)
6053
6054 const __m256i lhs_mat_01_71_sp2 = _mm256_shuffle_epi32(lhs_mat_01_71, 245); //A70(12-15) A70(12-15) A71(12-15) A71(12-15) A70(12-15) A70(12-15) A71(12-15) A71(12-15)
6055 const __m256i lhs_mat_23_71_sp2 = _mm256_shuffle_epi32(lhs_mat_23_71, 245); //A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15) A72(12-15) A73(12-15)
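                    // Two shuffle patterns are needed because _mm256_maddubs_epi16 only combines
                    // bytes sitting at the same position in both operands: pattern one lines up
                    // bytes 0-3 / 8-11 of every sub-block, pattern two lines up bytes 4-7 / 12-15.
                    // Adding the two partial results later (the *_sp1 + *_sp2 sums) covers all 16
                    // bytes of a sub-block while keeping every 32 bit lane dedicated to a single
                    // (row, column) pair.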
6056
6057                     // For each shuffle pattern, corresponding bytes of the RHS and LHS vectors are multiplied and the byte pairs summed into 16 bit integers within each 32 bit lane (the dot product building block)
6058 __m256i iacc_mat_00_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp1, b: lhs_mat_01_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp1, b: lhs_mat_01_01_sp1));
6059 __m256i iacc_mat_01_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp1, b: lhs_mat_01_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp1, b: lhs_mat_01_01_sp1));
6060
6061 __m256i iacc_mat_10_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp1, b: lhs_mat_23_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp1, b: lhs_mat_23_01_sp1));
6062 __m256i iacc_mat_11_0_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp1, b: lhs_mat_23_00_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp1, b: lhs_mat_23_01_sp1));
6063
6064 __m256i iacc_mat_00_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp1, b: lhs_mat_01_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp1, b: lhs_mat_01_11_sp1));
6065 __m256i iacc_mat_01_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp1, b: lhs_mat_01_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp1, b: lhs_mat_01_11_sp1));
6066
6067 __m256i iacc_mat_10_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp1, b: lhs_mat_23_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp1, b: lhs_mat_23_11_sp1));
6068 __m256i iacc_mat_11_1_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp1, b: lhs_mat_23_10_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp1, b: lhs_mat_23_11_sp1));
6069
6070 __m256i iacc_mat_00_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp1, b: lhs_mat_01_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp1, b: lhs_mat_01_21_sp1));
6071 __m256i iacc_mat_01_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp1, b: lhs_mat_01_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp1, b: lhs_mat_01_21_sp1));
6072
6073 __m256i iacc_mat_10_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp1, b: lhs_mat_23_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp1, b: lhs_mat_23_21_sp1));
6074 __m256i iacc_mat_11_2_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp1, b: lhs_mat_23_20_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp1, b: lhs_mat_23_21_sp1));
6075
6076 __m256i iacc_mat_00_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp1, b: lhs_mat_01_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp1, b: lhs_mat_01_31_sp1));
6077 __m256i iacc_mat_01_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp1, b: lhs_mat_01_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp1, b: lhs_mat_01_31_sp1));
6078
6079 __m256i iacc_mat_10_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp1, b: lhs_mat_23_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp1, b: lhs_mat_23_31_sp1));
6080 __m256i iacc_mat_11_3_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp1, b: lhs_mat_23_30_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp1, b: lhs_mat_23_31_sp1));
6081
6082 __m256i iacc_mat_00_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp1, b: lhs_mat_01_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp1, b: lhs_mat_01_41_sp1));
6083 __m256i iacc_mat_01_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp1, b: lhs_mat_01_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp1, b: lhs_mat_01_41_sp1));
6084
6085 __m256i iacc_mat_10_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp1, b: lhs_mat_23_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp1, b: lhs_mat_23_41_sp1));
6086 __m256i iacc_mat_11_4_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp1, b: lhs_mat_23_40_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp1, b: lhs_mat_23_41_sp1));
6087
6088 __m256i iacc_mat_00_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp1, b: lhs_mat_01_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp1, b: lhs_mat_01_51_sp1));
6089 __m256i iacc_mat_01_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp1, b: lhs_mat_01_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp1, b: lhs_mat_01_51_sp1));
6090
6091 __m256i iacc_mat_10_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp1, b: lhs_mat_23_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp1, b: lhs_mat_23_51_sp1));
6092 __m256i iacc_mat_11_5_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp1, b: lhs_mat_23_50_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp1, b: lhs_mat_23_51_sp1));
6093
6094 __m256i iacc_mat_00_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp1, b: lhs_mat_01_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp1, b: lhs_mat_01_61_sp1));
6095 __m256i iacc_mat_01_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp1, b: lhs_mat_01_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp1, b: lhs_mat_01_61_sp1));
6096
6097 __m256i iacc_mat_10_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp1, b: lhs_mat_23_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp1, b: lhs_mat_23_61_sp1));
6098 __m256i iacc_mat_11_6_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp1, b: lhs_mat_23_60_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp1, b: lhs_mat_23_61_sp1));
6099
6100 __m256i iacc_mat_00_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp1, b: lhs_mat_01_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp1, b: lhs_mat_01_71_sp1));
6101 __m256i iacc_mat_01_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp1, b: lhs_mat_01_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp1, b: lhs_mat_01_71_sp1));
6102
6103 __m256i iacc_mat_10_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp1, b: lhs_mat_23_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp1, b: lhs_mat_23_71_sp1));
6104 __m256i iacc_mat_11_7_sp1 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp1, b: lhs_mat_23_70_sp1),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp1, b: lhs_mat_23_71_sp1));
6105
6106
6107 __m256i iacc_mat_00_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp2, b: lhs_mat_01_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp2, b: lhs_mat_01_01_sp2));
6108 __m256i iacc_mat_01_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp2, b: lhs_mat_01_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp2, b: lhs_mat_01_01_sp2));
6109
6110 __m256i iacc_mat_10_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_00_sp2, b: lhs_mat_23_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_01_sp2, b: lhs_mat_23_01_sp2));
6111 __m256i iacc_mat_11_0_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_00_sp2, b: lhs_mat_23_00_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_01_sp2, b: lhs_mat_23_01_sp2));
6112
6113 __m256i iacc_mat_00_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp2, b: lhs_mat_01_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp2, b: lhs_mat_01_11_sp2));
6114 __m256i iacc_mat_01_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp2, b: lhs_mat_01_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp2, b: lhs_mat_01_11_sp2));
6115
6116 __m256i iacc_mat_10_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_10_sp2, b: lhs_mat_23_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_11_sp2, b: lhs_mat_23_11_sp2));
6117 __m256i iacc_mat_11_1_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_10_sp2, b: lhs_mat_23_10_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_11_sp2, b: lhs_mat_23_11_sp2));
6118
6119 __m256i iacc_mat_00_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp2, b: lhs_mat_01_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp2, b: lhs_mat_01_21_sp2));
6120 __m256i iacc_mat_01_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp2, b: lhs_mat_01_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp2, b: lhs_mat_01_21_sp2));
6121
6122 __m256i iacc_mat_10_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_20_sp2, b: lhs_mat_23_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_21_sp2, b: lhs_mat_23_21_sp2));
6123 __m256i iacc_mat_11_2_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_20_sp2, b: lhs_mat_23_20_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_21_sp2, b: lhs_mat_23_21_sp2));
6124
6125 __m256i iacc_mat_00_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp2, b: lhs_mat_01_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp2, b: lhs_mat_01_31_sp2));
6126 __m256i iacc_mat_01_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp2, b: lhs_mat_01_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp2, b: lhs_mat_01_31_sp2));
6127
6128 __m256i iacc_mat_10_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_30_sp2, b: lhs_mat_23_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_31_sp2, b: lhs_mat_23_31_sp2));
6129 __m256i iacc_mat_11_3_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_30_sp2, b: lhs_mat_23_30_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_31_sp2, b: lhs_mat_23_31_sp2));
6130
6131 __m256i iacc_mat_00_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp2, b: lhs_mat_01_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp2, b: lhs_mat_01_41_sp2));
6132 __m256i iacc_mat_01_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp2, b: lhs_mat_01_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp2, b: lhs_mat_01_41_sp2));
6133
6134 __m256i iacc_mat_10_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_40_sp2, b: lhs_mat_23_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_41_sp2, b: lhs_mat_23_41_sp2));
6135 __m256i iacc_mat_11_4_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_40_sp2, b: lhs_mat_23_40_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_41_sp2, b: lhs_mat_23_41_sp2));
6136
6137 __m256i iacc_mat_00_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp2, b: lhs_mat_01_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp2, b: lhs_mat_01_51_sp2));
6138 __m256i iacc_mat_01_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp2, b: lhs_mat_01_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp2, b: lhs_mat_01_51_sp2));
6139
6140 __m256i iacc_mat_10_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_50_sp2, b: lhs_mat_23_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_51_sp2, b: lhs_mat_23_51_sp2));
6141 __m256i iacc_mat_11_5_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_50_sp2, b: lhs_mat_23_50_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_51_sp2, b: lhs_mat_23_51_sp2));
6142
6143 __m256i iacc_mat_00_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp2, b: lhs_mat_01_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp2, b: lhs_mat_01_61_sp2));
6144 __m256i iacc_mat_01_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp2, b: lhs_mat_01_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp2, b: lhs_mat_01_61_sp2));
6145
6146 __m256i iacc_mat_10_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_60_sp2, b: lhs_mat_23_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_61_sp2, b: lhs_mat_23_61_sp2));
6147 __m256i iacc_mat_11_6_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_60_sp2, b: lhs_mat_23_60_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_61_sp2, b: lhs_mat_23_61_sp2));
6148
6149 __m256i iacc_mat_00_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp2, b: lhs_mat_01_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp2, b: lhs_mat_01_71_sp2));
6150 __m256i iacc_mat_01_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp2, b: lhs_mat_01_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp2, b: lhs_mat_01_71_sp2));
6151
6152 __m256i iacc_mat_10_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_0145_70_sp2, b: lhs_mat_23_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_0145_71_sp2, b: lhs_mat_23_71_sp2));
6153 __m256i iacc_mat_11_7_sp2 = _mm256_add_epi16(a: _mm256_maddubs_epi16(a: rhs_mat_2367_70_sp2, b: lhs_mat_23_70_sp2),b: _mm256_maddubs_epi16(a: rhs_mat_2367_71_sp2, b: lhs_mat_23_71_sp2));
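                    // _mm256_maddubs_epi16 multiplies unsigned RHS bytes with signed LHS bytes and
                    // sums adjacent byte pairs into 16 bit integers. Per 16 bit element, the two
                    // maddubs plus the add_epi16 above amount to (scalar sketch with illustrative
                    // names, not actual variables):
                    //     acc  = b_lo[2*i] * a_lo[2*i] + b_lo[2*i + 1] * a_lo[2*i + 1];
                    //     acc += b_hi[2*i] * a_hi[2*i] + b_hi[2*i + 1] * a_hi[2*i + 1];
                    // where b_lo/a_lo stand for the *_K0 registers and b_hi/a_hi for the *_K1
                    // registers. With 2 bit weights (values 0..3) a byte pair sums to at most
                    // 2 * 3 * 128 = 768, so the saturating add inside maddubs never clips.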
6154
6155 // Combine results from both shuffle patterns for each output block.
6156 __m256i iacc_mat_00_0 = _mm256_add_epi16(a: iacc_mat_00_0_sp1, b: iacc_mat_00_0_sp2);
6157 __m256i iacc_mat_01_0 = _mm256_add_epi16(a: iacc_mat_01_0_sp1, b: iacc_mat_01_0_sp2);
6158 __m256i iacc_mat_10_0 = _mm256_add_epi16(a: iacc_mat_10_0_sp1, b: iacc_mat_10_0_sp2);
6159 __m256i iacc_mat_11_0 = _mm256_add_epi16(a: iacc_mat_11_0_sp1, b: iacc_mat_11_0_sp2);
6160
6161 __m256i iacc_mat_00_1 = _mm256_add_epi16(a: iacc_mat_00_1_sp1, b: iacc_mat_00_1_sp2);
6162 __m256i iacc_mat_01_1 = _mm256_add_epi16(a: iacc_mat_01_1_sp1, b: iacc_mat_01_1_sp2);
6163 __m256i iacc_mat_10_1 = _mm256_add_epi16(a: iacc_mat_10_1_sp1, b: iacc_mat_10_1_sp2);
6164 __m256i iacc_mat_11_1 = _mm256_add_epi16(a: iacc_mat_11_1_sp1, b: iacc_mat_11_1_sp2);
6165
6166 __m256i iacc_mat_00_2 = _mm256_add_epi16(a: iacc_mat_00_2_sp1, b: iacc_mat_00_2_sp2);
6167 __m256i iacc_mat_01_2 = _mm256_add_epi16(a: iacc_mat_01_2_sp1, b: iacc_mat_01_2_sp2);
6168 __m256i iacc_mat_10_2 = _mm256_add_epi16(a: iacc_mat_10_2_sp1, b: iacc_mat_10_2_sp2);
6169 __m256i iacc_mat_11_2 = _mm256_add_epi16(a: iacc_mat_11_2_sp1, b: iacc_mat_11_2_sp2);
6170
6171 __m256i iacc_mat_00_3 = _mm256_add_epi16(a: iacc_mat_00_3_sp1, b: iacc_mat_00_3_sp2);
6172 __m256i iacc_mat_01_3 = _mm256_add_epi16(a: iacc_mat_01_3_sp1, b: iacc_mat_01_3_sp2);
6173 __m256i iacc_mat_10_3 = _mm256_add_epi16(a: iacc_mat_10_3_sp1, b: iacc_mat_10_3_sp2);
6174 __m256i iacc_mat_11_3 = _mm256_add_epi16(a: iacc_mat_11_3_sp1, b: iacc_mat_11_3_sp2);
6175
6176 __m256i iacc_mat_00_4 = _mm256_add_epi16(a: iacc_mat_00_4_sp1, b: iacc_mat_00_4_sp2);
6177 __m256i iacc_mat_01_4 = _mm256_add_epi16(a: iacc_mat_01_4_sp1, b: iacc_mat_01_4_sp2);
6178 __m256i iacc_mat_10_4 = _mm256_add_epi16(a: iacc_mat_10_4_sp1, b: iacc_mat_10_4_sp2);
6179 __m256i iacc_mat_11_4 = _mm256_add_epi16(a: iacc_mat_11_4_sp1, b: iacc_mat_11_4_sp2);
6180
6181 __m256i iacc_mat_00_5 = _mm256_add_epi16(a: iacc_mat_00_5_sp1, b: iacc_mat_00_5_sp2);
6182 __m256i iacc_mat_01_5 = _mm256_add_epi16(a: iacc_mat_01_5_sp1, b: iacc_mat_01_5_sp2);
6183 __m256i iacc_mat_10_5 = _mm256_add_epi16(a: iacc_mat_10_5_sp1, b: iacc_mat_10_5_sp2);
6184 __m256i iacc_mat_11_5 = _mm256_add_epi16(a: iacc_mat_11_5_sp1, b: iacc_mat_11_5_sp2);
6185
6186 __m256i iacc_mat_00_6 = _mm256_add_epi16(a: iacc_mat_00_6_sp1, b: iacc_mat_00_6_sp2);
6187 __m256i iacc_mat_01_6 = _mm256_add_epi16(a: iacc_mat_01_6_sp1, b: iacc_mat_01_6_sp2);
6188 __m256i iacc_mat_10_6 = _mm256_add_epi16(a: iacc_mat_10_6_sp1, b: iacc_mat_10_6_sp2);
6189 __m256i iacc_mat_11_6 = _mm256_add_epi16(a: iacc_mat_11_6_sp1, b: iacc_mat_11_6_sp2);
6190
6191 __m256i iacc_mat_00_7 = _mm256_add_epi16(a: iacc_mat_00_7_sp1, b: iacc_mat_00_7_sp2);
6192 __m256i iacc_mat_01_7 = _mm256_add_epi16(a: iacc_mat_01_7_sp1, b: iacc_mat_01_7_sp2);
6193 __m256i iacc_mat_10_7 = _mm256_add_epi16(a: iacc_mat_10_7_sp1, b: iacc_mat_10_7_sp2);
6194 __m256i iacc_mat_11_7 = _mm256_add_epi16(a: iacc_mat_11_7_sp1, b: iacc_mat_11_7_sp2);
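                    // At this point every 16 bit element holds the sum of 8 byte products; the worst
                    // case 8 * 3 * 128 = 3072 fits comfortably in int16, so the plain add_epi16
                    // above cannot overflow. Adjacent element pairs belong to the same
                    // (row, column, sub-block) dot product and are merged - and scaled - by the
                    // _mm256_madd_epi16 calls below.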
6195
6196                     // Multiply the summed 16 bit dot products with the corresponding sub-block scales and widen to 32 bit accumulators
6197 iacc_mat_00_0 = _mm256_madd_epi16(a: iacc_mat_00_0, b: scale_0145_0);
6198 iacc_mat_01_0 = _mm256_madd_epi16(a: iacc_mat_01_0, b: scale_2367_0);
6199 iacc_mat_10_0 = _mm256_madd_epi16(a: iacc_mat_10_0, b: scale_0145_0);
6200 iacc_mat_11_0 = _mm256_madd_epi16(a: iacc_mat_11_0, b: scale_2367_0);
6201
6202 iacc_mat_00_1 = _mm256_madd_epi16(a: iacc_mat_00_1, b: scale_0145_1);
6203 iacc_mat_01_1 = _mm256_madd_epi16(a: iacc_mat_01_1, b: scale_2367_1);
6204 iacc_mat_10_1 = _mm256_madd_epi16(a: iacc_mat_10_1, b: scale_0145_1);
6205 iacc_mat_11_1 = _mm256_madd_epi16(a: iacc_mat_11_1, b: scale_2367_1);
6206
6207 iacc_mat_00_2 = _mm256_madd_epi16(a: iacc_mat_00_2, b: scale_0145_2);
6208 iacc_mat_01_2 = _mm256_madd_epi16(a: iacc_mat_01_2, b: scale_2367_2);
6209 iacc_mat_10_2 = _mm256_madd_epi16(a: iacc_mat_10_2, b: scale_0145_2);
6210 iacc_mat_11_2 = _mm256_madd_epi16(a: iacc_mat_11_2, b: scale_2367_2);
6211
6212 iacc_mat_00_3 = _mm256_madd_epi16(a: iacc_mat_00_3, b: scale_0145_3);
6213 iacc_mat_01_3 = _mm256_madd_epi16(a: iacc_mat_01_3, b: scale_2367_3);
6214 iacc_mat_10_3 = _mm256_madd_epi16(a: iacc_mat_10_3, b: scale_0145_3);
6215 iacc_mat_11_3 = _mm256_madd_epi16(a: iacc_mat_11_3, b: scale_2367_3);
6216
6217 iacc_mat_00_4 = _mm256_madd_epi16(a: iacc_mat_00_4, b: scale_0145_4);
6218 iacc_mat_01_4 = _mm256_madd_epi16(a: iacc_mat_01_4, b: scale_2367_4);
6219 iacc_mat_10_4 = _mm256_madd_epi16(a: iacc_mat_10_4, b: scale_0145_4);
6220 iacc_mat_11_4 = _mm256_madd_epi16(a: iacc_mat_11_4, b: scale_2367_4);
6221
6222 iacc_mat_00_5 = _mm256_madd_epi16(a: iacc_mat_00_5, b: scale_0145_5);
6223 iacc_mat_01_5 = _mm256_madd_epi16(a: iacc_mat_01_5, b: scale_2367_5);
6224 iacc_mat_10_5 = _mm256_madd_epi16(a: iacc_mat_10_5, b: scale_0145_5);
6225 iacc_mat_11_5 = _mm256_madd_epi16(a: iacc_mat_11_5, b: scale_2367_5);
6226
6227 iacc_mat_00_6 = _mm256_madd_epi16(a: iacc_mat_00_6, b: scale_0145_6);
6228 iacc_mat_01_6 = _mm256_madd_epi16(a: iacc_mat_01_6, b: scale_2367_6);
6229 iacc_mat_10_6 = _mm256_madd_epi16(a: iacc_mat_10_6, b: scale_0145_6);
6230 iacc_mat_11_6 = _mm256_madd_epi16(a: iacc_mat_11_6, b: scale_2367_6);
6231
6232 iacc_mat_00_7 = _mm256_madd_epi16(a: iacc_mat_00_7, b: scale_0145_7);
6233 iacc_mat_01_7 = _mm256_madd_epi16(a: iacc_mat_01_7, b: scale_2367_7);
6234 iacc_mat_10_7 = _mm256_madd_epi16(a: iacc_mat_10_7, b: scale_0145_7);
6235 iacc_mat_11_7 = _mm256_madd_epi16(a: iacc_mat_11_7, b: scale_2367_7);
6236
6237 __m256i iacc_mat_00 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_00_0, b: iacc_mat_00_1), b: _mm256_add_epi32(a: iacc_mat_00_2, b: iacc_mat_00_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_00_4, b: iacc_mat_00_5), b: _mm256_add_epi32(a: iacc_mat_00_6, b: iacc_mat_00_7)));
6238 __m256i iacc_mat_01 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_01_0, b: iacc_mat_01_1), b: _mm256_add_epi32(a: iacc_mat_01_2, b: iacc_mat_01_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_01_4, b: iacc_mat_01_5), b: _mm256_add_epi32(a: iacc_mat_01_6, b: iacc_mat_01_7)));
6239 __m256i iacc_mat_10 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_10_0, b: iacc_mat_10_1), b: _mm256_add_epi32(a: iacc_mat_10_2, b: iacc_mat_10_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_10_4, b: iacc_mat_10_5), b: _mm256_add_epi32(a: iacc_mat_10_6, b: iacc_mat_10_7)));
6240 __m256i iacc_mat_11 = _mm256_add_epi32(a: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_11_0, b: iacc_mat_11_1), b: _mm256_add_epi32(a: iacc_mat_11_2, b: iacc_mat_11_3)), b: _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_mat_11_4, b: iacc_mat_11_5), b: _mm256_add_epi32(a: iacc_mat_11_6, b: iacc_mat_11_7)));
6241
6242 // Straighten out to make 4 row vectors
6243 __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
6244 __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
6245 __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
6246 __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
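                    // The shuffle immediate 78 = (2,3,0,1) swaps the dword pairs within each 128 bit
                    // lane and the blend mask 204 = 0b11001100 takes dwords 2-3 / 6-7 from the second
                    // operand, so each iacc_row_i ends up holding the eight column results of output
                    // row i in order, e.g.:
                    //     iacc_row_0 = [ r0c0 r0c1 r0c2 r0c3 | r0c4 r0c5 r0c6 r0c7 ]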
6247
6248                     // Load the scale values (d) of the four Q8_K blocks and repeat them across lanes
6249 const __m128 row_scale_f32_sse = _mm_load_ps(p: a_ptr[b].d);
6250 const __m256 row_scale_f32 = _mm256_set_m128(hi: row_scale_f32_sse, lo: row_scale_f32_sse);
6251
6252                     // Multiply with appropriate scales and accumulate (for both d and dmin) below
6253 acc_rows[0] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_0), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), C: acc_rows[0]);
6254 acc_rows[1] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_1), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), C: acc_rows[1]);
6255 acc_rows[2] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_2), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), C: acc_rows[2]);
6256 acc_rows[3] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_3), B: _mm256_mul_ps(a: col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), C: acc_rows[3]);
6257
6258 __m256i lhs_bsums_01_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_0123), lhs_raw_bsums_01_0123, 1);
6259 __m256i lhs_bsums_23_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_0123), lhs_raw_bsums_23_0123, 1);
6260 __m256i lhs_bsums_01_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_01_4567), lhs_raw_bsums_01_4567, 1);
6261 __m256i lhs_bsums_23_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(lhs_raw_bsums_23_4567), lhs_raw_bsums_23_4567, 1);
6262
6263                     // Take the bsums of one sub-block pair from each Q8_K row at a time and multiply them with the corresponding mins of the Q2_K columns
6264 __m256i iacc_row_min_0_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 0), b: mins_01);
6265 __m256i iacc_row_min_1_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 170), b: mins_01);
6266 __m256i iacc_row_min_2_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 0), b: mins_01);
6267 __m256i iacc_row_min_3_01 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 170), b: mins_01);
6268
6269 __m256i iacc_row_min_0_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 85), b: mins_23);
6270 __m256i iacc_row_min_1_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_0123, 255), b: mins_23);
6271 __m256i iacc_row_min_2_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 85), b: mins_23);
6272 __m256i iacc_row_min_3_23 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_0123, 255), b: mins_23);
6273
6274 __m256i iacc_row_min_0_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 0), b: mins_45);
6275 __m256i iacc_row_min_1_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 170), b: mins_45);
6276 __m256i iacc_row_min_2_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 0), b: mins_45);
6277 __m256i iacc_row_min_3_45 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 170), b: mins_45);
6278
6279 __m256i iacc_row_min_0_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 85), b: mins_67);
6280 __m256i iacc_row_min_1_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_01_4567, 255), b: mins_67);
6281 __m256i iacc_row_min_2_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 85), b: mins_67);
6282 __m256i iacc_row_min_3_67 = _mm256_madd_epi16(_mm256_shuffle_epi32(lhs_bsums_23_4567, 255), b: mins_67);
6283
6284 __m256i iacc_row_min_0 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_0_01, b: iacc_row_min_0_23), b: _mm256_add_epi32(a: iacc_row_min_0_45,b: iacc_row_min_0_67));
6285 __m256i iacc_row_min_1 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_1_01, b: iacc_row_min_1_23), b: _mm256_add_epi32(a: iacc_row_min_1_45,b: iacc_row_min_1_67));
6286 __m256i iacc_row_min_2 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_2_01, b: iacc_row_min_2_23), b: _mm256_add_epi32(a: iacc_row_min_2_45,b: iacc_row_min_2_67));
6287 __m256i iacc_row_min_3 = _mm256_add_epi32(a: _mm256_add_epi32(a: iacc_row_min_3_01, b: iacc_row_min_3_23), b: _mm256_add_epi32(a: iacc_row_min_3_45,b: iacc_row_min_3_67));
6288
6289 acc_min_rows[0] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_0), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), C: acc_min_rows[0]);
6290 acc_min_rows[1] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_1), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), C: acc_min_rows[1]);
6291 acc_min_rows[2] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_2), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), C: acc_min_rows[2]);
6292 acc_min_rows[3] = _mm256_fmadd_ps(A: _mm256_cvtepi32_ps(a: iacc_row_min_3), B: _mm256_mul_ps(a: col_dmin_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), C: acc_min_rows[3]);
6293 }
6294 }
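            // acc_rows accumulated the d * d8 * scale dot products and acc_min_rows the
            // dmin * d8 * min * bsum corrections (the _mm256_shuffle_ps immediates 0/85/170/255
            // broadcast the Q8_K scale d of rows 0..3 across all lanes), so the final value of each
            // element in this 4 row x 8 column tile is their difference, stored to s with row
            // stride bs at row y * 4 + i and column offset x * 8.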
6295 // Store the accumulated values
6296 for (int i = 0; i < 4; i++) {
6297 _mm256_storeu_ps(p: (float * )(s + ((y * 4 + i) * bs + x * 8)), a: _mm256_sub_ps(a: acc_rows[i], b: acc_min_rows[i]));
6298 }
6299 }
6300 }
6301#else
6302
6303 ggml_gemm_q2_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
6304
6305
6306#endif
6307}
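// Reference view of what the AVX2 kernel above computes for one output element,
// with the x/y/b/sb tiling flattened out (scalar sketch with illustrative names
// such as q2, q8, d8, bsum - not actual fields of the repacked structs):
//
//     float acc = 0.0f, acc_min = 0.0f;
//     for (int b = 0; b < nb; b++) {                       // super-blocks along K
//         for (int sub = 0; sub < QK_K / 16; sub++) {      // 16 element sub-blocks
//             int isum = 0;
//             for (int k = 0; k < 16; k++) {
//                 isum += q2[b][sub][k] * q8[b][sub][k];   // q2 in 0..3, q8 signed
//             }
//             acc     += d[b]    * d8[b] * scale[b][sub] * isum;
//             acc_min += dmin[b] * d8[b] * min[b][sub]   * bsum[b][sub];
//         }
//     }
//     out = acc - acc_min;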
6308