1/*
2 * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019
3 *
4 * ===================================================================================================================================
5 * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
6 *
7 * Copyright 2018-2020, CWI, TU Munich, FSU Jena
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
10 * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
11 * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
19 * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 *
21 * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
22 * ===================================================================================================================================
23 *
24 * FSST: Fast Static Symbol Table compression
25 * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf
26 *
27 * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e.
28 * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual
29 * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is
30 * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text.
31 *
32 * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences)
33 * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression
34 * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could
35 * be seen as strings again and fit in whatever your program is that manipulates strings.
36 *
37 * useful property: FSST ensures that strings that are equal, are also equal in their compressed form.
38 *
39 * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of
40 * unsigned char* pointers to their starts. A separate length array (of size_t) denotes how many bytes each string consists of.
41 *
42 * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program
43 * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings).
44 *
45 * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are
46 * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length.
47 */
48#ifndef FSST_INCLUDED_H
49#define FSST_INCLUDED_H
50
#ifdef _MSC_VER
/* Compatibility shims, only active when _MSC_VER is defined: __restrict__ expands to nothing and the byte-order
 * macros are set up so that __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ holds (this selects the little-endian
 * fast path in duckdb_fsst_decompress()). */
#define __restrict__
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ 2
#include <intrin.h>
/* Stand-in for the GCC/Clang builtin of the same name: returns the index of the least-significant set bit of x.
 * The result is meaningless for x == 0 (no bit is found by the scan). */
static inline int __builtin_ctzl(unsigned long long x) {
#  ifdef _WIN64
   unsigned long ret;
   _BitScanForward64(&ret, x);
   return (int)ret;
#  else
   /* 32-bit target: scan the low half first and only fall back to the high half when the low half is empty.
    * Plain integer test instead of `bool`, so the header also compiles as C without <stdbool.h>. */
   unsigned long idx;
   if (_BitScanForward(&idx, (unsigned __int32)(x)) != 0) {
      return (int)idx;
   }
   _BitScanForward(&idx, (unsigned __int32)(x >> 32));
   return (int)(idx + 32);
#  endif
}
#endif
70
71#ifdef __cplusplus
72#define FSST_FALLTHROUGH [[fallthrough]]
73#include <cstring>
74extern "C" {
75#else
76#define FSST_FALLTHROUGH
77#endif
78
79#ifndef __has_cpp_attribute // fallback for preprocessors without __has_cpp_attribute: every attribute query yields 0
80#define __has_cpp_attribute(x) 0
81#endif
82#if __has_cpp_attribute(clang::fallthrough) /* choose how intentional switch fall-through is marked; NOTE(review): the queries use C++ scoped attribute names - confirm behaviour when this header is compiled as C */
83#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[clang::fallthrough]]
84#elif __has_cpp_attribute(gnu::fallthrough)
85#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[gnu::fallthrough]]
86#else
87#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH /* neither attribute reported: the marker expands to nothing */
88#endif
89
90#include <stddef.h>
91
92/* A compressed string is simply a string of 1-byte codes; except for the escape code FSST_ESC (255), which is followed by one uncompressed byte that the decoder copies to the output as-is. */
93#define FSST_ESC 255
94
95/* Data structure needed for compressing strings - use duckdb_fsst_duplicate() to create thread-local copies. Use duckdb_fsst_destroy() to free. The API functions below pass it around as duckdb_fsst_encoder_t*. */
96typedef void* duckdb_fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
97
98/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
99typedef struct {
100 unsigned long long version; /* version id */
101 unsigned char zeroTerminated; /* terminator is a single-byte code that does not appear in longer symbols; duckdb_fsst_decompress() tests bit 0 of this field */
102 unsigned char len[255]; /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8, 0 <= x < 255). There is no entry for FSST_ESC (255). */
103 unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). There is no entry for FSST_ESC (255). */
104} duckdb_fsst_decoder_t;
105
106/* Calibrate an FSST symbol table from a batch of strings (it is best to provide at least 16KB of data). Free the returned encoder with duckdb_fsst_destroy(). */
107duckdb_fsst_encoder_t*
108duckdb_fsst_create(
109 size_t n, /* IN: number of strings in batch to sample from. */
110 size_t lenIn[], /* IN: byte-lengths of the inputs, one per string (in zero-terminated mode the terminating zero byte is counted). */
111 unsigned char *strIn[], /* IN: string start pointers, one per string. */
112 int zeroTerminated /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
113);
114
115/* Create another encoder instance using the same symbol table; needed for multi-threaded encoding, where each thread uses its own copy. Free each copy with duckdb_fsst_destroy(). */
116duckdb_fsst_encoder_t*
117duckdb_fsst_duplicate(
118 duckdb_fsst_encoder_t *encoder /* IN: the symbol table to duplicate. */
119);
120
121#define FSST_MAXHEADER (8+1+8+2048+1) /* maxlen of the serialized fsst header, produced/consumed by duckdb_fsst_export() resp. duckdb_fsst_import() */
122
123/* Space-efficient symbol table serialization (smaller than sizeof(duckdb_fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
124unsigned int /* OUT: number of bytes written in buf, at most sizeof(duckdb_fsst_decoder_t) */
125duckdb_fsst_export(
126 duckdb_fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
127 unsigned char *buf /* OUT: pointer to a byte-buffer where to serialize this symbol table (FSST_MAXHEADER is the documented maximum length of the serialized header). */
128);
129
130/* Deallocate an encoder that was returned by duckdb_fsst_create() or duckdb_fsst_duplicate(). */
131void
132duckdb_fsst_destroy(duckdb_fsst_encoder_t*);
133
134/* Fill a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
135unsigned int /* OUT: number of bytes consumed in buf (0 on failure). */
136duckdb_fsst_import(
137 duckdb_fsst_decoder_t *decoder, /* OUT: this symbol table will be overwritten. */
138 unsigned char *buf /* IN: pointer to a byte-buffer where duckdb_fsst_export() serialized this symbol table. */
139);
140
141/* Return (by value) a decoder structure derived from an encoder. */
142duckdb_fsst_decoder_t
143duckdb_fsst_decoder(
144 duckdb_fsst_encoder_t *encoder /* IN: the encoder to build the decoder from. */
145);
146
147/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
148/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
149size_t /* OUT: the number of compressed strings (<=nstrings) that fit the output buffer. */
150duckdb_fsst_compress(
151 duckdb_fsst_encoder_t *encoder, /* IN: encoder obtained from duckdb_fsst_create(). */
152 size_t nstrings, /* IN: number of strings in batch to compress. */
153 size_t lenIn[], /* IN: byte-lengths of the inputs */
154 unsigned char *strIn[], /* IN: input string start pointers. */
155 size_t outsize, /* IN: byte-length of output buffer. */
156 unsigned char *output, /* OUT: memory buffer to put the compressed strings in (one after the other). */
157 size_t lenOut[], /* OUT: byte-lengths of the compressed strings. */
158 unsigned char *strOut[] /* OUT: output string start pointers. Will all point into [output,output+outsize). */
159);
160
161/* Decompress a single string, inlined for speed. */
162inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */
163duckdb_fsst_decompress(
164 duckdb_fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */
165 size_t lenIn, /* IN: byte-length of compressed string. */
166 unsigned char *strIn, /* IN: compressed string. */
167 size_t size, /* IN: byte-length of output buffer. */
168 unsigned char *output /* OUT: memory buffer to put the decompressed string in. */
169) {
170 unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len;
171 unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output;
172 unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol;
173 size_t code, posOut = 0, posIn = 0;
174#ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */
175#define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long))
176#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
177 while (posOut+32 <= size && posIn+4 <= lenIn) {
178 unsigned int nextBlock, escapeMask;
179 memcpy(dest: &nextBlock, src: strIn+posIn, n: sizeof(unsigned int));
180 escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u);
181 if (escapeMask == 0) {
182 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
183 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
184 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
185 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
186 } else {
187 unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3;
188 switch(firstEscapePos) { /* Duff's device */
189 case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
190 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
191 case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
192 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
193 case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
194 DUCKDB_FSST_EXPLICIT_FALLTHROUGH;
195 case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */
196 }
197 }
198 }
199 if (posOut+24 <= size) { // handle the possibly 3 last bytes without a loop
200 if (posIn+2 <= lenIn) {
201 strOut[posOut] = strIn[posIn+1];
202 if (strIn[posIn] != FSST_ESC) {
203 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
204 if (strIn[posIn] != FSST_ESC) {
205 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
206 } else {
207 posIn += 2; strOut[posOut++] = strIn[posIn-1];
208 }
209 } else {
210 posIn += 2; posOut++;
211 }
212 }
213 if (posIn < lenIn) { // last code cannot be an escape
214 code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code];
215 }
216 }
217#else
218 while (posOut+8 <= size && posIn < lenIn)
219 if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */
220 FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */
221 posOut += len[code];
222 } else {
223 strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */
224 posIn++; posOut++;
225 }
226#endif
227#endif
228 while (posIn < lenIn)
229 if ((code = strIn[posIn++]) < FSST_ESC) {
230 size_t posWrite = posOut, endWrite = posOut + len[code];
231 unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite;
232 if ((posOut = endWrite) > size) endWrite = size;
233 for(; posWrite < endWrite; posWrite++) /* only write if there is room */
234 strOut[posWrite] = symbolPointer[posWrite];
235 } else {
236 if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */
237 posIn++; posOut++;
238 }
239 if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0;
240 return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */
241}
242
243#ifdef __cplusplus
244}
245#endif
246#endif /* FSST_INCLUDED_H */
247