1 | /* |
2 | * the API for FSST compression -- (c) Peter Boncz, Viktor Leis and Thomas Neumann (CWI, TU Munich), 2018-2019 |
3 | * |
4 | * =================================================================================================================================== |
5 | * this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): |
6 | * |
7 | * Copyright 2018-2020, CWI, TU Munich, FSU Jena |
8 | * |
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files |
10 | * (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, |
11 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is |
12 | * furnished to do so, subject to the following conditions: |
13 | * |
14 | * - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
15 | * |
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
17 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
18 | * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR |
19 | * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
20 | * |
21 | * You can contact the authors via the FSST source repository : https://github.com/cwida/fsst |
22 | * =================================================================================================================================== |
23 | * |
24 | * FSST: Fast Static Symbol Table compression |
25 | * see the paper https://github.com/cwida/fsst/raw/master/fsstcompression.pdf |
26 | * |
27 | * FSST is a compression scheme focused on string/text data: it can compress strings from distributions with many different values (i.e. |
28 | * where dictionary compression will not work well). It allows *random-access* to compressed data: it is not block-based, so individual |
29 | * strings can be decompressed without touching the surrounding data in a compressed block. When compared to e.g. lz4 (which is |
30 | * block-based), FSST achieves similar decompression speed, (2x) better compression speed and 30% better compression ratio on text. |
31 | * |
32 | * FSST encodes strings also using a symbol table -- but it works on pieces of the string, as it maps "symbols" (1-8 byte sequences) |
33 | * onto "codes" (single-bytes). FSST can also represent a byte as an exception (255 followed by the original byte). Hence, compression |
34 | * transforms a sequence of bytes into a (supposedly shorter) sequence of codes or escaped bytes. These shorter byte-sequences could |
35 | * be seen as strings again and fit in whatever your program is that manipulates strings. |
36 | * |
37 | * useful property: FSST ensures that strings that are equal, are also equal in their compressed form. |
38 | * |
39 | * In this API, strings are considered byte-arrays (byte = unsigned char) and a batch of strings is represented as an array of |
* unsigned char* pointers to their starts. A separate length array (of size_t) denotes how many bytes each string consists of.
41 | * |
42 | * This representation as unsigned char* pointers tries to assume as little as possible on the memory management of the program |
43 | * that calls this API, and is also intended to allow passing strings into this API without copying (even if you use C++ strings). |
44 | * |
45 | * We optionally support C-style zero-terminated strings (zero appearing only at the end). In this case, the compressed strings are |
46 | * also zero-terminated strings. In zero-terminated mode, the zero-byte at the end *is* counted in the string byte-length. |
47 | */ |
48 | #ifndef FSST_INCLUDED_H |
49 | #define FSST_INCLUDED_H |
50 | |
#ifdef _MSC_VER
/*
 * MSVC compatibility shims: the rest of this header uses GCC/Clang spellings
 * (__restrict__, __BYTE_ORDER__, __builtin_ctzl), so provide stand-ins here.
 */
#define __restrict__
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __ORDER_LITTLE_ENDIAN__ 2
#include <intrin.h>
/*
 * Index of the least-significant set bit of x, built on the MSVC bit-scan intrinsics.
 * The result is meaningless for x == 0 (the intrinsics leave their index output undefined then).
 * Written without `bool` so that the header also compiles as C, where `bool` needs <stdbool.h>.
 */
static inline int __builtin_ctzl(unsigned long long x) {
#  ifdef _WIN64
   unsigned long ret;
   _BitScanForward64(&ret, x);
   return (int)ret;
#  else
   unsigned long low, high;
   /* 32-bit targets: scan the low half first, fall back to the high half (offset by 32). */
   if (_BitScanForward(&low, (unsigned __int32)(x)) != 0)
      return (int)low;
   _BitScanForward(&high, (unsigned __int32)(x >> 32));
   return (int)(high + 32);
#  endif
}
#endif
70 | |
/*
 * Language setup. In C++ the declarations that follow get C linkage (the matching
 * closing brace is at the end of the header). Both branches pull in the header that
 * declares memcpy(), which the inline decompressor uses.
 * FSST_FALLTHROUGH is not referenced in the visible part of this header.
 */
#ifdef __cplusplus
#define FSST_FALLTHROUGH [[fallthrough]]
#include <cstring>
extern "C" {
#else
#define FSST_FALLTHROUGH
#include <string.h> /* memcpy() for the C build; the C++ build gets it from <cstring> */
#endif
78 | |
/*
 * DUCKDB_FSST_EXPLICIT_FALLTHROUGH marks an intentional switch fallthrough.
 * It expands to [[clang::fallthrough]] or [[gnu::fallthrough]] when the compiler
 * reports support for one of them, and to nothing otherwise.
 */
#ifndef __has_cpp_attribute // Compilers without this feature-test macro: report every attribute as unsupported
#define __has_cpp_attribute(x) 0
#endif
#if __has_cpp_attribute(clang::fallthrough)
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[clang::fallthrough]]
#elif __has_cpp_attribute(gnu::fallthrough)
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH [[gnu::fallthrough]]
#else
#define DUCKDB_FSST_EXPLICIT_FALLTHROUGH
#endif
89 | |
90 | #include <stddef.h> |
91 | |
/* A compressed string is simply a string of 1-byte codes; except for code 255, which is followed by an uncompressed byte. */
/* FSST_ESC is that escape code: the decompressor copies the byte following it to the output verbatim. */
#define FSST_ESC 255
94 | |
/* Data structure needed for compressing strings - use duckdb_fsst_duplicate() to create thread-local copies. Use duckdb_fsst_destroy() to free. */
/* Note: the API functions in this header pass encoders around as duckdb_fsst_encoder_t* (a pointer to this handle type). */
typedef void* duckdb_fsst_encoder_t; /* opaque type - it wraps around a rather large (~900KB) C++ object */
97 | |
/* Data structure needed for decompressing strings - read-only and thus can be shared between multiple decompressing threads. */
/* The arrays have 255 entries: codes 0..254 are symbols, code 255 is reserved as FSST_ESC. */
typedef struct {
   unsigned long long version;     /* version id */
   unsigned char zeroTerminated;   /* terminator is a single-byte code that does not appear in longer symbols; duckdb_fsst_decompress() tests bit 0 of this field */
   unsigned char len[255];         /* len[x] is the byte-length of the symbol x (1 <= len[x] <= 8). */
   unsigned long long symbol[255]; /* symbol[x] contains in LITTLE_ENDIAN the bytesequence that code x represents (0 <= x < 255). */
} duckdb_fsst_decoder_t;
105 | |
/* Calibrate a FSST symboltable from a batch of strings (it is best to provide at least 16KB of data). */
/* Returns an encoder handle; per the encoder type's documentation it is released with duckdb_fsst_destroy(). */
duckdb_fsst_encoder_t*
duckdb_fsst_create(
   size_t n,                /* IN: number of strings in batch to sample from. */
   size_t lenIn[],          /* IN: byte-lengths of the inputs (one entry per string) */
   unsigned char *strIn[],  /* IN: string start pointers (one entry per string). */
   int zeroTerminated       /* IN: whether input strings are zero-terminated. If so, encoded strings are as well (i.e. symbol[0]=""). */
);
114 | |
/* Create another encoder instance, necessary to do multi-threaded encoding using the same symbol table. */
/* Returns the new encoder instance (presumably to be released with duckdb_fsst_destroy() as well - confirm against the implementation). */
duckdb_fsst_encoder_t*
duckdb_fsst_duplicate(
   duckdb_fsst_encoder_t *encoder  /* IN: the symbol table to duplicate. */
);
120 | |
/* Maximum byte-length of a serialized FSST header, as produced by duckdb_fsst_export() and consumed by duckdb_fsst_import(). */
#define FSST_MAXHEADER (8+1+8+2048+1)
122 | |
/* Space-efficient symbol table serialization (smaller than sizeof(duckdb_fsst_decoder_t) - by saving on the unused bytes in symbols of len < 8). */
/* NOTE(review): the required capacity of buf is not stated here; presumably it must hold the documented maximum - confirm against the implementation. */
unsigned int                       /* OUT: number of bytes written in buf, at most sizeof(duckdb_fsst_decoder_t) */
duckdb_fsst_export(
   duckdb_fsst_encoder_t *encoder, /* IN: the symbol table to dump. */
   unsigned char *buf              /* OUT: pointer to a byte-buffer where to serialize this symbol table. */
);
129 | |
/* Deallocate encoder. */
void
duckdb_fsst_destroy(duckdb_fsst_encoder_t*); /* IN: the encoder to deallocate. */
133 | |
/* Fill a decoder structure from serialized format (typically used in a block-, file- or row-group header). */
unsigned int                       /* OUT: number of bytes consumed in buf (0 on failure). */
duckdb_fsst_import(
   duckdb_fsst_decoder_t *decoder, /* OUT: this symbol table will be overwritten. */
   unsigned char *buf              /* IN: pointer to a byte-buffer where duckdb_fsst_export() serialized this symbol table. */
);
140 | |
/* Return a decoder structure from an encoder. The decoder is returned by value. */
duckdb_fsst_decoder_t
duckdb_fsst_decoder(
   duckdb_fsst_encoder_t *encoder  /* IN: the encoder whose symbol table the decoder is built from. */
);
146 | |
/* Compress a batch of strings (on AVX512 machines best performance is obtained by compressing more than 32KB of string volume). */
/* The output buffer must be large; at least "conservative space" (7+2*inputlength) for the first string for something to happen. */
size_t                             /* OUT: the number of compressed strings (<=nstrings) that fit the output buffer. */
duckdb_fsst_compress(
   duckdb_fsst_encoder_t *encoder, /* IN: encoder obtained from duckdb_fsst_create(). */
   size_t nstrings,                /* IN: number of strings in batch to compress. */
   size_t lenIn[],                 /* IN: byte-lengths of the inputs */
   unsigned char *strIn[],         /* IN: input string start pointers. */
   size_t outsize,                 /* IN: byte-length of output buffer. */
   unsigned char *output,          /* OUT: memory buffer to put the compressed strings in (one after the other). */
   size_t lenOut[],                /* OUT: byte-lengths of the compressed strings. */
   unsigned char *strOut[]         /* OUT: output string start pointers. Will all point into [output,output+outsize). */
);
160 | |
161 | /* Decompress a single string, inlined for speed. */ |
162 | inline size_t /* OUT: bytesize of the decompressed string. If > size, the decoded output is truncated to size. */ |
163 | duckdb_fsst_decompress( |
164 | duckdb_fsst_decoder_t *decoder, /* IN: use this symbol table for compression. */ |
165 | size_t lenIn, /* IN: byte-length of compressed string. */ |
166 | unsigned char *strIn, /* IN: compressed string. */ |
167 | size_t size, /* IN: byte-length of output buffer. */ |
168 | unsigned char *output /* OUT: memory buffer to put the decompressed string in. */ |
169 | ) { |
170 | unsigned char*__restrict__ len = (unsigned char* __restrict__) decoder->len; |
171 | unsigned char*__restrict__ strOut = (unsigned char* __restrict__) output; |
172 | unsigned long long*__restrict__ symbol = (unsigned long long* __restrict__) decoder->symbol; |
173 | size_t code, posOut = 0, posIn = 0; |
174 | #ifndef FSST_MUST_ALIGN /* defining on platforms that require aligned memory access may help their performance */ |
175 | #define FSST_UNALIGNED_STORE(dst,src) memcpy((unsigned long long*) (dst), &(src), sizeof(unsigned long long)) |
176 | #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
177 | while (posOut+32 <= size && posIn+4 <= lenIn) { |
178 | unsigned int nextBlock, escapeMask; |
179 | memcpy(dest: &nextBlock, src: strIn+posIn, n: sizeof(unsigned int)); |
180 | escapeMask = (nextBlock&0x80808080u)&((((~nextBlock)&0x7F7F7F7Fu)+0x7F7F7F7Fu)^0x80808080u); |
181 | if (escapeMask == 0) { |
182 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
183 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
184 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
185 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
186 | } else { |
187 | unsigned long firstEscapePos=__builtin_ctzl((unsigned long long) escapeMask)>>3; |
188 | switch(firstEscapePos) { /* Duff's device */ |
189 | case 3: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
190 | DUCKDB_FSST_EXPLICIT_FALLTHROUGH; |
191 | case 2: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
192 | DUCKDB_FSST_EXPLICIT_FALLTHROUGH; |
193 | case 1: code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
194 | DUCKDB_FSST_EXPLICIT_FALLTHROUGH; |
195 | case 0: posIn+=2; strOut[posOut++] = strIn[posIn-1]; /* decompress an escaped byte */ |
196 | } |
197 | } |
198 | } |
199 | if (posOut+24 <= size) { // handle the possibly 3 last bytes without a loop |
200 | if (posIn+2 <= lenIn) { |
201 | strOut[posOut] = strIn[posIn+1]; |
202 | if (strIn[posIn] != FSST_ESC) { |
203 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
204 | if (strIn[posIn] != FSST_ESC) { |
205 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
206 | } else { |
207 | posIn += 2; strOut[posOut++] = strIn[posIn-1]; |
208 | } |
209 | } else { |
210 | posIn += 2; posOut++; |
211 | } |
212 | } |
213 | if (posIn < lenIn) { // last code cannot be an escape |
214 | code = strIn[posIn++]; FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); posOut += len[code]; |
215 | } |
216 | } |
217 | #else |
218 | while (posOut+8 <= size && posIn < lenIn) |
219 | if ((code = strIn[posIn++]) < FSST_ESC) { /* symbol compressed as code? */ |
220 | FSST_UNALIGNED_STORE(strOut+posOut, symbol[code]); /* unaligned memory write */ |
221 | posOut += len[code]; |
222 | } else { |
223 | strOut[posOut] = strIn[posIn]; /* decompress an escaped byte */ |
224 | posIn++; posOut++; |
225 | } |
226 | #endif |
227 | #endif |
228 | while (posIn < lenIn) |
229 | if ((code = strIn[posIn++]) < FSST_ESC) { |
230 | size_t posWrite = posOut, endWrite = posOut + len[code]; |
231 | unsigned char* __restrict__ symbolPointer = ((unsigned char* __restrict__) &symbol[code]) - posWrite; |
232 | if ((posOut = endWrite) > size) endWrite = size; |
233 | for(; posWrite < endWrite; posWrite++) /* only write if there is room */ |
234 | strOut[posWrite] = symbolPointer[posWrite]; |
235 | } else { |
236 | if (posOut < size) strOut[posOut] = strIn[posIn]; /* idem */ |
237 | posIn++; posOut++; |
238 | } |
239 | if (posOut >= size && (decoder->zeroTerminated&1)) strOut[size-1] = 0; |
240 | return posOut; /* full size of decompressed string (could be >size, then the actually decompressed part) */ |
241 | } |
242 | |
243 | #ifdef __cplusplus |
244 | } |
245 | #endif |
246 | #endif /* FSST_INCLUDED_H */ |
247 | |