fsst_avx512.cpp source code [Velox/build/_deps/duckdb-src/third_party/fsst/fsst_avx512.cpp]

1	// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
2	//
3	// Copyright 2018-2020, CWI, TU Munich, FSU Jena
4	//
5	// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
6	// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
7	// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
8	// furnished to do so, subject to the following conditions:
9	//
10	// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11	//
12	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
13	// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
14	// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
15	// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16	//
17	// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
18	#include "libfsst.hpp"
19
20	#if DUCKDB_FSST_ENABLE_INTRINSINCS && (defined(__x86_64__) \|\| defined(_M_X64))
21	#include <immintrin.h>
22
23	#ifdef _WIN32
24	bool duckdb_fsst_hasAVX512() {
25	int info[`4`];
26	__cpuidex(info, `0x00000007`, `0`);
27	return (info[`1`]>>`16`)&`1`;
28	}
29	#else
30	#include <cpuid.h>
31	bool duckdb_fsst_hasAVX512() {
32	int info[`4`];
33	__cpuid_count(`0x00000007`, `0`, info[`0`], info[`1`], info[`2`], info[`3`]);
34	return (info[`1`]>>`16`)&`1`;
35	}
36	#endif
37	#else
38	bool duckdb_fsst_hasAVX512() { return false; }
39	#endif
40
41	// BULK COMPRESSION OF STRINGS
42	//
43	// In one call of this function, we can compress 512 strings, each of maximum length 511 bytes.
44	// strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up.
45	//
46	// In each iteration of the while loop, we find one code in each of the unroll8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4*
47	// unroll3 performs best on my hardware
48	//
49	// In the worst case, each final encoded string occupies 512KB bytes (5121024; with 1024=512xexception, exception = 2 bytes).*
50	// - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs).
51	//
52	// 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits)
53	// The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order.
54	//
55	// Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings.
56	// This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process.
57	// - so 'processed' is the amount of strings we started processing and it is between [480,512].
58	// Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished.
59	// - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method.
60	//
61	// Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings.
62	// In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings.
63	//
64	// For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to.
65	// For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch).
66	//
67	// THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS
68	// in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_ expressions however,*
69	// will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them.
70	// This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization.
71	// Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores.
72
73	size_t duckdb_fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob input, SIMDjob output, size_t n, size_t unroll) {
74	size_t processed = `0`;
75	// define some constants (all_x means that all 8 lanes contain 64-bits value X)
76	#if defined(__AVX512F__) and DUCKDB_FSST_ENABLE_INTRINSINCS
77	//__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c
78	__m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -`1`));
79	__m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME));
80	__m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE));
81	#define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE)
82	#define all_ONE _mm512_srli_epi64(all_MASK, 63)
83	#define all_M19 _mm512_srli_epi64(all_MASK, 45)
84	#define all_M18 _mm512_srli_epi64(all_MASK, 46)
85	#define all_M28 _mm512_srli_epi64(all_MASK, 36)
86	#define all_FFFFFF _mm512_srli_epi64(all_MASK, 40)
87	#define all_FFFF _mm512_srli_epi64(all_MASK, 48)
88	#define all_FF _mm512_srli_epi64(all_MASK, 56)
89
90	SIMDjob *inputEnd = input+n;
91	assert(n >= unroll`8` && n <= `512`); // should be close to 512*
92	__m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4
93	__mmask8 loadmask1 = `255`, loadmask2 = `255`(unroll>`1`), loadmask3 = `255`(unroll>`2`), loadmask4 = `255`(unroll>`3`); // 2b loaded new strings bitmask per unroll*
94	u32 delta1 = `8`, delta2 = `8`(unroll>`1`), delta3 = `8`(unroll>`2`), delta4 = `8`(unroll>`3`); // #new loads this SIMD iteration per unroll*
95
96	if (unroll >= `4`) {
97	while (input+delta1+delta2+delta3+delta4 < inputEnd) {
98	#include "fsst_avx512_unroll4.inc"
99	}
100	} else if (unroll == `3`) {
101	while (input+delta1+delta2+delta3 < inputEnd) {
102	#include "fsst_avx512_unroll3.inc"
103	}
104	} else if (unroll == `2`) {
105	while (input+delta1+delta2 < inputEnd) {
106	#include "fsst_avx512_unroll2.inc"
107	}
108	} else {
109	while (input+delta1 < inputEnd) {
110	#include "fsst_avx512_unroll1.inc"
111	}
112	}
113
114	// flush the job states of the unfinished strings at the end of output[]
115	processed = n - (inputEnd - input);
116	u32 unfinished = `0`;
117	if (unroll > `1`) {
118	if (unroll > `2`) {
119	if (unroll > `3`) {
120	_mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4);
121	unfinished += _mm_popcnt_u32((int) loadmask4);
122	}
123	_mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3);
124	unfinished += _mm_popcnt_u32((int) loadmask3);
125	}
126	_mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2);
127	unfinished += _mm_popcnt_u32((int) loadmask2);
128	}
129	_mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1);
130	#else
131	(void) symbolTable;
132	(void) codeBase;
133	(void) symbolBase;
134	(void) input;
135	(void) output;
136	(void) n;
137	(void) unroll;
138	#endif
139	return processed;
140	}
141

Browse the source code of Velox/build/_deps/duckdb-src/third_party/fsst/fsst_avx512.cpp