fsst.h source code [Velox/build/_deps/duckdb-src/third_party/fsst/fsst.h]

1	/*
2	* the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
3	*
4	* ===================================================================================================================================
5	* this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
6	*
7	* Copyright 2018-2020, CWI, TU Munich, FSU Jena
8	*
9	* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
10	* (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
11	* merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
12	* furnished to do so, subject to the following conditions:
13	*
14	* - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18	* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
19	* IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20	*
21	* You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
22	* ===================================================================================================================================
23	*
24	* FSST: Fast Static Symbol Table compression
25	* see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
26	*
27	* FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
28	* where dictionary compression will not work well). It allows random-access to compressed data: it is not block-based, so individual
29	* strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
30	* block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
31	*
32	* FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
33	* onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
34	* transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could
35	* be seen as strings again and fit in whatever your program is that manipulates strings.
36	*
37	* useful property: FSST ensures that strings that are equal, are also equal in their compressed form.
38	*
39	* In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
40	* unsigned char* pointers to their starts. A separate length array (of size_t) denotes how many bytes each string consists of.
41	*
42	* This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
43	* that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
44	*
45	* We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
46	* also zero-terminated strings. In zero-terminated mode, the zero-byte at the end is counted in the string byte-length.
47	*/
48	#ifndef FSST_INCLUDED_H
49	#define FSST_INCLUDED_H
50
#ifdef _MSC_VER
/* MSVC compatibility shims: the code below relies on GCC/Clang spellings that MSVC lacks. */
#define __restrict__
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ 2
#include <intrin.h>
/*
 * Replacement for the GCC/Clang builtin of the same name: returns the index of the
 * least-significant set bit of x (count of trailing zero bits).
 * The result is meaningless when x == 0, because the _BitScanForward* intrinsics
 * leave their output index unset when no bit is found.
 */
static inline int __builtin_ctzl(unsigned long long x) {
#  ifdef _WIN64
   unsigned long ret;
   _BitScanForward64(&ret, x);
   return (int)ret;
#  else
   /* 32-bit targets have no 64-bit scan: scan both halves and prefer the low one. */
   unsigned long low, high;
   int low_set = _BitScanForward(&low, (unsigned __int32)(x)) != 0; /* int, not bool: keeps the header valid C */
   _BitScanForward(&high, (unsigned __int32)(x >> 32));
   high += 32;
   return (int)(low_set ? low : high);
#  endif
}
#endif
70
#ifdef __cplusplus
#define FSST_FALLTHROUGH [[fallthrough]]
#include <cstring>
extern "C" {
#else
#define FSST_FALLTHROUGH
#include <string.h> /* memcpy(), used by duckdb_fsst_decompress(); C++ builds get it from <cstring> above */
#endif

#ifndef __has_cpp_attribute // For backwards compatibility
#define __has_cpp_attribute(x) 0
#endif
/* Only probe for C++ attribute spellings when compiling as C++; plain C gets an empty marker. */
#ifdef __cplusplus
#if __has_cpp_attribute(clang::fallthrough)
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[clang::fallthrough]]
#elif __has_cpp_attribute(gnu::fallthrough)
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[gnu::fallthrough]]
#else
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH
#endif
#else
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH
#endif

#include <stddef.h>

/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
#define FSST_ESC 255

/* Data structure needed for compressing strings - use duckdb_fsst_duplicate() to create thread-local copies. Use duckdb_fsst_destroy() to free. */
typedef void* duckdb_fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */

/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
typedef struct {
   unsigned long long version;     /* version id */
   unsigned char zeroTerminated;   /* terminator is a single-byte code that does not appear in longer symbols */
   unsigned char len[255];         /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8). */
   unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
} duckdb_fsst_decoder_t;

/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */
duckdb_fsst_encoder_t*
duckdb_fsst_create(
   size_t n,                /* IN: number of strings in batch to sample from. */
   size_t lenIn[],          /* IN: byte-lengths of the inputs */
   unsigned char *strIn[],  /* IN: string start pointers. */
   int zeroTerminated       /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
);

/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
duckdb_fsst_encoder_t*
duckdb_fsst_duplicate(
   duckdb_fsst_encoder_t *encoder  /* IN: the symbol table to duplicate. */
);

#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of deserialized fsst header, produced/consumed by duckdb_fsst_export() resp. duckdb_fsst_import() */

/* Space-efficient symbol table serialization (smaller than sizeof(duckdb_fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
unsigned int                /* OUT: number of bytes written in buf, at most sizeof(duckdb_fsst_decoder_t) */
duckdb_fsst_export(
   duckdb_fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
   unsigned char *buf       /* OUT: pointer to a byte-buffer where to serialize this symbol table. */
);

/* Deallocate encoder. */
void
duckdb_fsst_destroy(duckdb_fsst_encoder_t*);

/* Return a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
unsigned int                /* OUT: number of bytes consumed in buf (0 on failure). */
duckdb_fsst_import(
   duckdb_fsst_decoder_t *decoder, /* OUT: this symbol table will be overwritten. */
   unsigned char *buf       /* IN: pointer to a byte-buffer where duckdb_fsst_export() serialized this symbol table. */
);

/* Return a decoder structure from an encoder. */
duckdb_fsst_decoder_t
duckdb_fsst_decoder(
   duckdb_fsst_encoder_t *encoder
);

/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
size_t                      /* OUT: the number of compressed strings (<=n) that fit the output buffer. */
duckdb_fsst_compress(
   duckdb_fsst_encoder_t *encoder, /* IN: encoder obtained from duckdb_fsst_create(). */
   size_t nstrings,         /* IN: number of strings in batch to compress. */
   size_t lenIn[],          /* IN: byte-lengths of the inputs */
   unsigned char *strIn[],  /* IN: input string start pointers. */
   size_t outsize,          /* IN: byte-length of output buffer. */
   unsigned char *output,   /* OUT: memory buffer to put the compressed strings in (one after the other). */
   size_t lenOut[],         /* OUT: byte-lengths of the compressed strings. */
   unsigned char *strOut[]  /* OUT: output string start pointers. Will all point into [output,output+size). */
);

/*
 * Decompress a single string, inlined for speed.
 *
 * Every input byte below FSST_ESC is a code that expands to symbol[code] (len[code] bytes);
 * FSST_ESC is followed by one literal byte that is copied verbatim.
 *
 * Returns the full byte-length of the decompressed string. When that exceeds `size`, only the
 * first `size` bytes were written to `output`; if the decoder is zero-terminated, the last
 * written byte is then forced to 0. With size == 0 nothing is written at all.
 *
 * The input is not validated: an FSST_ESC as the very last input byte is not expected
 * ("last code cannot be an escape") and is not guarded against.
 */
#ifndef __cplusplus
static /* a plain C99 `inline` definition provides no external definition, so non-inlined calls would fail to link */
#endif
inline size_t               /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
duckdb_fsst_decompress(
   duckdb_fsst_decoder_t *decoder, /* IN: use this symbol table for decompression. */
   size_t lenIn,            /* IN: byte-length of compressed string. */
   unsigned char *strIn,    /* IN: compressed string. */
   size_t size,             /* IN: byte-length of output buffer. */
   unsigned char *output    /* OUT: memory buffer to put the decompressed string in. */
) {
   unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
   unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
   unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol;
   size_t code, posOut = 0, posIn = 0;
#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
/* Always stores a full 8 bytes; the caller advances posOut by len[code] only, so later stores overwrite the slack. */
#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
   /* Fast path: 4 input bytes per iteration; 32 bytes of output room cover four 8-byte stores. */
   while (posOut+32 <= size && posIn+4 <= lenIn) {
      unsigned int nextBlock, escapeMask;
      memcpy(&nextBlock, strIn+posIn, sizeof(unsigned int));
      /* bit 7 of each byte lane is set in escapeMask iff that input byte equals 0xFF (FSST_ESC) */
      escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u);
      if (escapeMask == 0) {
         /* no escape among the four bytes: expand four codes unconditionally */
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
      } else {
         /* byte index (0..3) of the first escape; expand the codes before it, then the escaped byte */
         unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
         switch(firstEscapePos) { /* Duff's device */
         case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
         case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
         case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
                 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
         case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
         }
      }
   }
   if (posOut+24 <= size) { // handle the possibly 3 last bytes without a loop
      if (posIn+2 <= lenIn) {
         strOut[posOut] = strIn[posIn+1]; /* speculatively write the byte as if an escape came first; overwritten otherwise */
         if (strIn[posIn] != FSST_ESC) {
            code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
            if (strIn[posIn] != FSST_ESC) {
               code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
            } else {
               posIn += 2; strOut[posOut++] = strIn[posIn-1];
            }
         } else {
            posIn += 2; posOut++; /* the escaped byte was already written above */
         }
      }
      if (posIn < lenIn) { // last code cannot be an escape
         code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
      }
   }
#else
   while (posOut+8 <= size && posIn < lenIn)
      if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
         FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */
         posOut += len[code];
      } else {
         strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
         posIn++; posOut++;
      }
#endif
#endif
   /* Careful byte-wise tail: never writes at or beyond output+size, but keeps counting the full length. */
   while (posIn < lenIn)
      if ((code = strIn[posIn++]) < FSST_ESC) {
         size_t posWrite = posOut, endWrite = posOut + len[code];
         /* biased so that symbolPointer[posWrite] walks the bytes of symbol[code] */
         unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite;
         if ((posOut = endWrite) > size) endWrite = size;
         for(; posWrite < endWrite; posWrite++)  /* only write if there is room */
            strOut[posWrite] = symbolPointer[posWrite];
      } else {
         if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */
         posIn++; posOut++;
      }
   /* size > 0 guard: with an empty output buffer, strOut[size-1] would land one byte before the buffer */
   if (size > 0 && posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
   return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */
}

#ifdef __cplusplus
}
#endif
246	#endif /* FSST_INCLUDED_H */
247

Browse the source code of Velox/build/_deps/duckdb-src/third_party/fsst/fsst.h