// this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT):
//
// Copyright 2018-2020, CWI, TU Munich, FSU Jena
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files
// (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// You can contact the authors via the FSST source repository : https://github.com/cwida/fsst
#include "libfsst.hpp"

Symbol concat(Symbol a, Symbol b) {
   Symbol s;
   u32 length = a.length()+b.length();
   if (length > Symbol::maxLength) length = Symbol::maxLength;
   s.set_code_len(FSST_CODE_MASK, length);
   s.val.num = (b.val.num << (8*a.length())) | a.val.num;
   return s;
}
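// Illustrative note (not part of the code path): concatenating the 2-byte symbol "ab"
// with the 2-byte symbol "cd" yields a 4-byte symbol whose val.num holds 'a','b','c','d'
// starting at the lowest byte; if the combined length would exceed Symbol::maxLength (8),
// the length is capped and the excess tail of b falls off the 64-bit value.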

namespace std {
template <>
class hash<QSymbol> {
public:
   size_t operator()(const QSymbol& q) const {
      uint64_t k = q.symbol.val.num;
      const uint64_t m = 0xc6a4a7935bd1e995;
      const int r = 47;
      uint64_t h = 0x8445d61a4e774912 ^ (8*m);
      k *= m;
      k ^= k >> r;
      k *= m;
      h ^= k;
      h *= m;
      h ^= h >> r;
      h *= m;
      h ^= h >> r;
      return h;
   }
};
}
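// The mixing above follows the structure of 64-bit MurmurHash (MurmurHash64A: same
// multiplier and shift constants); it only has to spread the at most 8 symbol bytes well
// enough for the unordered_set of candidate symbols used during symbol table construction.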

bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; }

std::ostream& operator<<(std::ostream& out, const Symbol& s) {
   for (u32 i=0; i<s.length(); i++)
      out << s.val.str[i];
   return out;
}

SymbolTable *buildSymbolTable(Counters& counters, vector<u8*> line, size_t len[], bool zeroTerminated=false) {
   SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable();
   int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything becomes an exception)
   size_t sampleFrac = 128;

   // start by determining the terminator. We use the lowest among the least frequent bytes as terminator
   st->zeroTerminated = zeroTerminated;
   if (zeroTerminated) {
      st->terminator = 0; // in zeroTerminated mode, byte 0 is the terminator regardless of frequency
   } else {
      u16 byteHisto[256];
      memset(byteHisto, 0, sizeof(byteHisto));
      for(size_t i=0; i<line.size(); i++) {
         u8* cur = line[i];
         u8* end = cur + len[i];
         while(cur < end) byteHisto[*cur++]++;
      }
      u32 minSize = FSST_SAMPLEMAXSZ, i = st->terminator = 256;
      while(i-- > 0) {
         if (byteHisto[i] > minSize) continue;
         st->terminator = i;
         minSize = byteHisto[i];
      }
   }
   assert(st->terminator != 256);

   // a pseudo-random number in [1,128], derived deterministically from the line number and sampleFrac
   auto rnd128 = [&](size_t i) { return 1 + (FSST_HASH((i+1UL)*sampleFrac)&127); };
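   // For example (illustration only): with sampleFrac==32 a line is visited in a round
   // iff rnd128(i) <= 32, i.e. roughly a quarter of the sample lines are processed;
   // in the final round sampleFrac==128, so every line is processed.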

   // compress sample, and compute (pair-)frequencies
   auto compressCount = [&](SymbolTable *st, Counters &counters) { // returns gain
      int gain = 0;

      for(size_t i=0; i<line.size(); i++) {
         u8* cur = line[i];
         u8* end = cur + len[i];

         if (sampleFrac < 128) {
            // in earlier rounds (sampleFrac < 128) we skip data in the sample (reduces overall work ~2x)
            if (rnd128(i) > sampleFrac) continue;
         }
         if (cur < end) {
            u8* start = cur;
            u16 code2 = 255, code1 = st->findLongestSymbol(cur, end);
            cur += st->symbols[code1].length();
            gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1)));
            while (true) {
               // count single symbol (i.e. an option is not extending it)
               counters.count1Inc(code1);

               // as an alternative, consider just using the next byte..
               if (st->symbols[code1].length() != 1) // .. but do not count single byte symbols doubly
                  counters.count1Inc(*start);

               if (cur==end) {
                  break;
               }

               // now match a new symbol
               start = cur;
               if (cur<end-7) {
                  u64 word = fsst_unaligned_load(cur);
                  size_t code = word & 0xFFFFFF;
                  size_t idx = FSST_HASH(code)&(st->hashTabSize-1);
                  Symbol s = st->hashTab[idx];
                  code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
                  word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
                  if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
                     code2 = s.code();
                     cur += s.length();
                  } else if (code2 >= FSST_CODE_BASE) {
                     cur += 2;
                  } else {
                     code2 = st->byteCodes[word & 0xFF] & FSST_CODE_MASK;
                     cur += 1;
                  }
               } else {
                  code2 = st->findLongestSymbol(cur, end);
                  cur += st->symbols[code2].length();
               }

               // compute compressed output size
               gain += ((int) (cur-start))-(1+isEscapeCode(code2));

               // now count the subsequent two symbols we encode as an extension possibility
               if (sampleFrac < 128) { // no need to count pairs in final round
                  // consider the symbol that is the concatenation of the two last symbols
                  counters.count2Inc(code1, code2);

                  // as an alternative, consider just extending with the next byte..
                  if ((cur-start) > 1) // ..but do not count single byte extensions doubly
                     counters.count2Inc(code1, *start);
               }
               code1 = code2;
            }
         }
      }
      return gain;
   };
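   // Illustrative gain accounting (not executed code): a 4-byte match encoded by a real
   // code contributes 4-1 = +3 to gain, whereas a byte that has to be escaped contributes
   // 1-(1+1) = -1, since an escape occupies two output bytes (escape marker plus the byte).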

   auto makeTable = [&](SymbolTable *st, Counters &counters) {
      // hashmap of candidates (needed because we can generate duplicate candidates)
      unordered_set<QSymbol> cands;

      // artificially make the terminator the most frequent symbol so it gets included
      u16 terminator = st->nSymbols?FSST_CODE_BASE:st->terminator;
      counters.count1Set(terminator, 65535);

      auto addOrInc = [&](unordered_set<QSymbol> &cands, Symbol s, u64 count) {
         if (count < (5*sampleFrac)/128) return; // improves both compression speed (fewer candidates) and quality!!
         QSymbol q;
         q.symbol = s;
         q.gain = count * s.length();
         auto it = cands.find(q);
         if (it != cands.end()) {
            q.gain += (*it).gain;
            cands.erase(*it);
         }
         cands.insert(q);
      };

      // add candidate symbols based on counted frequency
      for (u32 pos1=0; pos1<FSST_CODE_BASE+(size_t) st->nSymbols; pos1++) {
         u32 cnt1 = counters.count1GetNext(pos1); // may advance pos1!!
         if (!cnt1) continue;

         // heuristic: promoting single-byte symbols (*8) helps reduce exception rates and increases [de]compression speed
         Symbol s1 = st->symbols[pos1];
         addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1);

         if (sampleFrac >= 128 || // last round we do not create new (combined) symbols
             s1.length() == Symbol::maxLength || // symbol cannot be extended
             s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte
            continue;
         }
         for (u32 pos2=0; pos2<FSST_CODE_BASE+(size_t)st->nSymbols; pos2++) {
            u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!!
            if (!cnt2) continue;

            // create a new symbol
            Symbol s2 = st->symbols[pos2];
            Symbol s3 = concat(s1, s2);
            if (s2.val.str[0] != st->terminator) // multi-byte symbols cannot contain the terminator byte
               addOrInc(cands, s3, cnt2);
         }
      }

      // insert candidates into priority queue (by gain)
      auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
      priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
      for (auto& q : cands)
         pq.push(q);

      // Create new symbol map using best candidates
      st->clear();
      while (st->nSymbols < 255 && !pq.empty()) {
         QSymbol q = pq.top();
         pq.pop();
         st->add(q.symbol);
      }
   };
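   // Candidate gain is estimated as count*length (the bytes saved if the candidate becomes
   // a single code); the priority queue then keeps at most the 255 highest-gain candidates,
   // since one output byte value must remain free to serve as the escape marker.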

   u8 bestCounters[512*sizeof(u16)];
#ifdef NONOPT_FSST
   for(size_t frac : {127, 127, 127, 127, 127, 127, 127, 127, 127, 128}) {
      sampleFrac = frac;
#else
   for(sampleFrac=8; true; sampleFrac += 30) {
#endif
      memset(&counters, 0, sizeof(Counters));
      long gain = compressCount(st, counters);
      if (gain >= bestGain) { // a new best solution!
         counters.backup1(bestCounters);
         *bestTable = *st; bestGain = gain;
      }
      if (sampleFrac >= 128) break; // we do 5 rounds (sampleFrac=8,38,68,98,128)
      makeTable(st, counters);
   }
   delete st;
   counters.restore1(bestCounters);
   makeTable(bestTable, counters);
   bestTable->finalize(zeroTerminated); // renumber codes for more efficient compression
   return bestTable;
}
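// Sketch of how the table evolves over the rounds above (illustration only): on a sample
// full of URLs, early rounds typically admit frequent bytes and byte pairs such as "ht"
// and "tp"; later rounds concatenate surviving symbols into longer ones such as "http"
// and "://"; the best-scoring table over all rounds is then rebuilt from its counters
// and finalized.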

static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size_t nlines, size_t len[], u8* line[], size_t size, u8* dst, size_t lenOut[], u8* strOut[], int unroll) {
   size_t curLine = 0, inOff = 0, outOff = 0, batchPos = 0, empty = 0, budget = size;
   u8 *lim = dst + size, *codeBase = symbolBase + (1<<18); // 512KB temp space for compressing 512 strings
   SIMDjob input[512]; // combined offsets of input strings (cur,end), and string #id (pos) and output (dst) pointer
   SIMDjob output[512]; // output are (pos:9,dst:19) end pointers (compute compressed length from this)
   size_t jobLine[512]; // for which line in the input sequence was this job (needed because we may split a line into multiple jobs)
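   // Buffer layout as implied by the offsets above (explanatory note): symbolBase receives
   // up to 256KB of raw input chunks (at most 511 bytes plus a terminator per job), while
   // codeBase = symbolBase + 256KB receives the compressed codes; outOff is capped at
   // 512KB (1<<19), so a batch is flushed before this temp area can overflow.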

   while (curLine < nlines && outOff <= (1<<19)) {
      size_t prevLine = curLine, chunk, curOff = 0;

      // bail out if the output buffer cannot fully hold the next compressed string
      if (((len[curLine]-curOff)*2 + 7) > budget) break; // see below for the +7
      else budget -= (len[curLine]-curOff)*2;

      strOut[curLine] = (u8*) 0;
      lenOut[curLine] = 0;

      do {
         do {
            chunk = len[curLine] - curOff;
            if (chunk > 511) {
               chunk = 511; // large strings need to be chopped up into segments of 511 bytes
            }
            // create a job in this batch
            SIMDjob job;
            job.cur = inOff;
            job.end = job.cur + chunk;
            job.pos = batchPos;
            job.out = outOff;

            // worst case estimate for compressed size (+7 is for the scatter that writes extra 7 zeros)
            outOff += 7 + 2*(size_t)(job.end - job.cur); // note, total size needed is 512*(511*2+7) bytes.
            if (outOff > (1<<19)) break; // simdbuf may get full, stop before this chunk

            // register job in this batch
            input[batchPos] = job;
            jobLine[batchPos] = curLine;

            if (chunk == 0) {
               empty++; // detect empty chunks -- SIMD code cannot handle empty strings, so they need to be filtered out
            } else {
               // copy string chunk into temp buffer
               memcpy(symbolBase + inOff, line[curLine] + curOff, chunk);
               inOff += chunk;
               curOff += chunk;
               symbolBase[inOff++] = (u8) symbolTable.terminator; // write an extra char at the end that will not be encoded
            }
            if (++batchPos == 512) break;
         } while(curOff < len[curLine]);

         if ((batchPos == 512) || (outOff > (1<<19)) || (++curLine >= nlines)) { // cannot accumulate more?
            if (batchPos-empty >= 32) { // if we have enough work, fire off fsst_compressAVX512 (32 is due to max 4x8 unrolling)
               // radix-sort jobs on length (longest string first)
               // -- this provides best load balancing and allows skipping empty jobs at the end
               u16 sortpos[513];
               memset(sortpos, 0, sizeof(sortpos));

               // calculate length histo
               for(size_t i=0; i<batchPos; i++) {
                  size_t len = input[i].end - input[i].cur;
                  sortpos[512UL - len]++;
               }
               // calculate running sum
               for(size_t i=1; i<=512; i++)
                  sortpos[i] += sortpos[i-1];

               // move jobs to their final destination
               SIMDjob inputOrdered[512];
               for(size_t i=0; i<batchPos; i++) {
                  size_t len = input[i].end - input[i].cur;
                  size_t pos = sortpos[511UL - len]++;
                  inputOrdered[pos] = input[i];
               }
               // finally.. SIMD compress max 256KB of simdbuf into (max) 512KB of simdbuf (but presumably much less..)
               for(size_t done = duckdb_fsst_compressAVX512(symbolTable, codeBase, symbolBase, inputOrdered, output, batchPos-empty, unroll);
                   done < batchPos; done++) output[done] = inputOrdered[done];
            } else {
               memcpy(output, input, batchPos*sizeof(SIMDjob));
            }

            // finish encoding (unfinished strings in process, plus the few last strings not yet processed)
            for(size_t i=0; i<batchPos; i++) {
               SIMDjob job = output[i];
               if (job.cur < job.end) { // finish encoding this string with scalar code
                  u8* cur = symbolBase + job.cur;
                  u8* end = symbolBase + job.end;
                  u8* out = codeBase + job.out;
                  while (cur < end) {
                     u64 word = fsst_unaligned_load(cur);
                     size_t code = symbolTable.shortCodes[word & 0xFFFF];
                     size_t pos = word & 0xFFFFFF;
                     size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
                     Symbol s = symbolTable.hashTab[idx];
                     out[1] = (u8) word; // speculatively write out escaped byte
                     word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
                     if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
                        *out++ = (u8) s.code(); cur += s.length();
                     } else {
                        // could be a 2-byte or 1-byte code, or miss
                        // handle everything with predication
                        *out = (u8) code;
                        out += 1+((code&FSST_CODE_BASE)>>8);
                        cur += (code>>FSST_LEN_BITS);
                     }
                  }
                  job.out = out - codeBase;
               }
               // postprocess job info
               job.cur = 0;
               job.end = job.out - input[job.pos].out; // misuse .end field as compressed size
               job.out = input[job.pos].out; // reset offset to start of encoded string
               input[job.pos] = job;
            }

            // copy out the result data
            for(size_t i=0; i<batchPos; i++) {
               size_t lineNr = jobLine[i]; // the sort must be order-preserving, as we concatenate result strings in order
               size_t sz = input[i].end; // had stored compressed lengths here
               if (!strOut[lineNr]) strOut[lineNr] = dst; // first segment will be the strOut pointer
               lenOut[lineNr] += sz; // add segment (lenOut starts at 0 for this reason)
               memcpy(dst, codeBase+input[i].out, sz);
               dst += sz;
            }

            // go for the next batch of 512 chunks
            inOff = outOff = batchPos = empty = 0;
            budget = (size_t) (lim - dst);
         }
      } while (curLine == prevLine && outOff <= (1<<19));
   }
   return curLine;
}


// optimized adaptive *scalar* compression method
static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) {
   u8 *cur = NULL, *end = NULL, *lim = out + size;
   size_t curLine, suffixLim = symbolTable.suffixLim;
   u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0];

   u8 buf[512+7] = {}; /* +7 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */

   // three variants are possible. dead code falls away since the bool arguments are constants
   auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) {
      while (cur < end) {
         u64 word = fsst_unaligned_load(cur);
         size_t code = symbolTable.shortCodes[word & 0xFFFF];
         if (noSuffixOpt && ((u8) code) < suffixLim) {
            // 2 byte code without having to worry about longer matches
            *out++ = (u8) code; cur += 2;
         } else {
            size_t pos = word & 0xFFFFFF;
            size_t idx = FSST_HASH(pos)&(symbolTable.hashTabSize-1);
            Symbol s = symbolTable.hashTab[idx];
            out[1] = (u8) word; // speculatively write out escaped byte
            word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
            if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
               *out++ = (u8) s.code(); cur += s.length();
            } else if (avoidBranch) {
               // could be a 2-byte or 1-byte code, or miss
               // handle everything with predication
               *out = (u8) code;
               out += 1+((code&FSST_CODE_BASE)>>8);
               cur += (code>>FSST_LEN_BITS);
            } else if ((u8) code < byteLim) {
               // 2 byte code after checking there is no longer pattern
               *out++ = (u8) code; cur += 2;
            } else {
               // 1 byte code or miss.
               *out = (u8) code;
               out += 1+((code&FSST_CODE_BASE)>>8); // predicated - tested with a branch, that was always worse
               cur++;
            }
         }
      }
   };
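   // Explanatory note on the shortCodes entries used above: the low 8 bits hold the code
   // byte to emit, the FSST_CODE_BASE bit signals that an escape is needed, so
   // 1+((code&FSST_CODE_BASE)>>8) advances out by 2 in the escape case (escape marker plus
   // the literal byte speculatively written to out[1]) and by 1 otherwise, while
   // code>>FSST_LEN_BITS gives the number of input bytes consumed by the match.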

   for(curLine=0; curLine<nlines; curLine++) {
      size_t chunk, curOff = 0;
      strOut[curLine] = out;
      do {
         cur = strIn[curLine] + curOff;
         chunk = lenIn[curLine] - curOff;
         if (chunk > 511) {
            chunk = 511; // we need to compress in chunks of 511 in order to be byte-compatible with simd-compressed FSST
         }
         if ((2*chunk+7) > (size_t) (lim-out)) {
            return curLine; // out of memory
         }
         // copy the string to the 511-byte buffer
         memcpy(buf, cur, chunk);
         buf[chunk] = (u8) symbolTable.terminator;
         cur = buf;
         end = cur + chunk;

         // based on symboltable stats, choose a variant that is nice to the branch predictor
         if (noSuffixOpt) {
            compressVariant(true,false);
         } else if (avoidBranch) {
            compressVariant(false,true);
         } else {
            compressVariant(false, false);
         }
      } while((curOff += chunk) < lenIn[curLine]);
      lenOut[curLine] = (size_t) (out - strOut[curLine]);
   }
   return curLine;
}

#define FSST_SAMPLELINE ((size_t) 512)

// quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes
vector<u8*> makeSample(u8* sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines,
                       unique_ptr<vector<size_t>>& sample_len_out) {
   size_t totSize = 0;
   vector<u8*> sample;

   for(size_t i=0; i<nlines; i++)
      totSize += lenIn[i];
   if (totSize < FSST_SAMPLETARGET) {
      for(size_t i=0; i<nlines; i++)
         sample.push_back(strIn[i]);
   } else {
      size_t sampleRnd = FSST_HASH(4637947);
      u8* sampleLim = sampleBuf + FSST_SAMPLETARGET;

      sample_len_out = unique_ptr<vector<size_t>>(new vector<size_t>());
      sample_len_out->reserve(nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE);

      // This fails if we have a lot of small strings and a few big ones?
      while(sampleBuf < sampleLim) {
         // choose a non-empty line
         sampleRnd = FSST_HASH(sampleRnd);
         size_t linenr = sampleRnd % nlines;
         while (lenIn[linenr] == 0)
            if (++linenr == nlines) linenr = 0;

         // choose a chunk
         size_t chunks = 1 + ((lenIn[linenr]-1) / FSST_SAMPLELINE);
         sampleRnd = FSST_HASH(sampleRnd);
         size_t chunk = FSST_SAMPLELINE*(sampleRnd % chunks);

         // add the chunk to the sample
         size_t len = min(lenIn[linenr]-chunk,FSST_SAMPLELINE);
         memcpy(sampleBuf, strIn[linenr]+chunk, len);
         sample.push_back(sampleBuf);

         sample_len_out->push_back(len);
         sampleBuf += len;
      }
   }
   return sample;
}
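// Example of the chunk arithmetic above (illustration only): a 1300-byte line has
// chunks = 1 + (1299/512) = 3 candidate chunks, starting at offsets 0, 512 and 1024;
// if the last one is picked it contributes min(1300-1024, 512) = 276 bytes to the sample.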

extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 *strIn[], int zeroTerminated) {
   u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ];
   unique_ptr<vector<size_t>> sample_sizes;
   vector<u8*> sample = makeSample(sampleBuf, strIn, lenIn, n?n:1, sample_sizes); // careful handling of input to get a right-sized and representative sample
   Encoder *encoder = new Encoder();
   size_t* sampleLen = sample_sizes ? sample_sizes->data() : &lenIn[0];
   encoder->symbolTable = shared_ptr<SymbolTable>(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated));
   delete[] sampleBuf;
   return (duckdb_fsst_encoder_t*) encoder;
}
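// Minimal usage sketch of the C API implemented in this file (illustration only; the
// buffer sizes chosen here are assumptions for the example, not requirements of the API):
//
//   size_t lenIn[1] = { 11 };
//   u8 *strIn[1]    = { (u8*) "hello world" };
//   duckdb_fsst_encoder_t *enc = duckdb_fsst_create(1, lenIn, strIn, /*zeroTerminated*/ 0);
//   u8 output[64];                        // large enough for the worst case 2*11+7 bytes
//   size_t lenOut[1]; u8 *strOut[1];
//   size_t done = duckdb_fsst_compress(enc, 1, lenIn, strIn, sizeof(output), output, lenOut, strOut);
//   duckdb_fsst_decoder_t dec = duckdb_fsst_decoder(enc); // decoder for the same symbol table
//   duckdb_fsst_destroy(enc);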

/* create another encoder instance, necessary to do multi-threaded encoding using the same symbol table */
extern "C" duckdb_fsst_encoder_t* duckdb_fsst_duplicate(duckdb_fsst_encoder_t *encoder) {
   Encoder *e = new Encoder();
   e->symbolTable = ((Encoder*)encoder)->symbolTable; // it is a shared_ptr
   return (duckdb_fsst_encoder_t*) e;
}

// export a symbol table in compact format.
extern "C" u32 duckdb_fsst_export(duckdb_fsst_encoder_t *encoder, u8 *buf) {
   Encoder *e = (Encoder*) encoder;
   // In ->version there is a version number, but we also hide suffixLim/terminator/nSymbols there.
   // This is sufficient in principle to *reconstruct* a duckdb_fsst_encoder_t from a duckdb_fsst_decoder_t
   // (such functionality could be useful to append compressed data to an existing block).
   //
   // However, the hash function in the encoder hash table is endian-sensitive, and given its
   // 'lossy perfect' hashing scheme is *unable* to contain other-endian-produced symbol tables.
   // Doing an endian-conversion during hashing would be slow and self-defeating.
   //
   // Overall, we could support reconstructing an encoder for incremental compression, but
   // should enforce equal-endianness. Bit of a bummer. Not going there now.
   //
   // The version field is now there just for future-proofness, but not used yet

   // version allows keeping track of fsst versions, track endianness, and encoder reconstruction
   u64 version = (FSST_VERSION << 32) | // version is 24 bits, most significant byte is 0
                 (((u64) e->symbolTable->suffixLim) << 24) |
                 (((u64) e->symbolTable->terminator) << 16) |
                 (((u64) e->symbolTable->nSymbols) << 8) |
                 FSST_ENDIAN_MARKER; // least significant byte is nonzero

   /* do not assume unaligned writes here */
   memcpy(buf, &version, 8);
   buf[8] = e->symbolTable->zeroTerminated;
   for(u32 i=0; i<8; i++)
      buf[9+i] = (u8) e->symbolTable->lenHisto[i];
   u32 pos = 17;

   // emit only the used bytes of the symbols
   for(u32 i = e->symbolTable->zeroTerminated; i < e->symbolTable->nSymbols; i++)
      for(u32 j = 0; j < e->symbolTable->symbols[i].length(); j++)
         buf[pos++] = e->symbolTable->symbols[i].val.str[j]; // serialize used symbol bytes

   return pos; // length of what was serialized
}
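// Serialized layout produced by duckdb_fsst_export, summarized from the code above:
//   bytes 0..7  : version word = FSST_VERSION<<32 | suffixLim<<24 | terminator<<16 | nSymbols<<8 | FSST_ENDIAN_MARKER
//   byte  8     : zeroTerminated flag
//   bytes 9..16 : lenHisto[0..7], the number of symbols of length 1..8
//   bytes 17..  : the used symbol bytes, grouped by symbol length (2..8, then 1) in the
//                 order that the import loop in duckdb_fsst_import below expects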

#define FSST_CORRUPT 32774747032022883 /* 7-byte number in little endian containing "corrupt" */

extern "C" u32 duckdb_fsst_import(duckdb_fsst_decoder_t *decoder, u8 *buf) {
   u64 version = 0;
   u32 code, pos = 17;
   u8 lenHisto[8];

   // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
   memcpy(&version, buf, 8);
   if ((version>>32) != FSST_VERSION) return 0;
   decoder->zeroTerminated = buf[8]&1;
   memcpy(lenHisto, buf+9, 8);

   // in case of zero-terminated, first symbol is "" (zero always, may be overwritten)
   decoder->len[0] = 1;
   decoder->symbol[0] = 0;

   // we use lenHisto[0] as 1-byte symbol run length (at the end)
   code = decoder->zeroTerminated;
   if (decoder->zeroTerminated) lenHisto[0]--; // if zeroTerminated, then symbol "" aka 1-byte code=0, is not stored at the end

   // now get all symbols from the buffer
   for(u32 l=1; l<=8; l++) { /* l = 1,2,3,4,5,6,7,8 */
      for(u32 i=0; i < lenHisto[(l&7) /* 1,2,3,4,5,6,7,0 */]; i++, code++) {
         decoder->len[code] = (l&7)+1; /* len = 2,3,4,5,6,7,8,1 */
         decoder->symbol[code] = 0;
         for(u32 j=0; j<decoder->len[code]; j++)
            ((u8*) &decoder->symbol[code])[j] = buf[pos++]; // note this enforces 'little endian' symbols
      }
   }
   if (decoder->zeroTerminated) lenHisto[0]++;

   // fill unused symbols with text "corrupt". Gives a chance to detect corrupted code sequences (if there are unused symbols).
   while(code<255) {
      decoder->symbol[code] = FSST_CORRUPT;
      decoder->len[code++] = 8;
   }
   return pos;
}
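// The loop above reads symbols of length 2,3,...,8 first and the 1-byte symbols last:
// for l=1..7, (l&7)+1 yields lengths 2..8, and l=8 wraps to index (8&7)=0, i.e. lenHisto[0]
// with length 1. This matches the order in which duckdb_fsst_export emits the symbol bytes.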

// runtime check for simd
inline size_t _compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
#ifndef NONOPT_FSST
   if (simd && duckdb_fsst_hasAVX512())
      return compressSIMD(*e->symbolTable, e->simdbuf, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
#endif
   (void) simd;
   return compressBulk(*e->symbolTable, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch);
}
size_t compressImpl(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd) {
   return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
}

// adaptive choice of scalar compression method based on the symbol length histogram
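// (lenHisto[k] counts the symbols of length k+1 -- cf. duckdb_fsst_import above -- and
// codes below suffixLim are 2-byte symbols that cannot be extended into a longer match,
// cf. the noSuffixOpt fast path in compressBulk)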
inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
   bool avoidBranch = false, noSuffixOpt = false;
   if (100*e->symbolTable->lenHisto[1] > 65*e->symbolTable->nSymbols && 100*e->symbolTable->suffixLim > 95*e->symbolTable->lenHisto[1]) {
      noSuffixOpt = true;
   } else if ((e->symbolTable->lenHisto[0] > 24 && e->symbolTable->lenHisto[0] < 92) &&
              (e->symbolTable->lenHisto[0] < 43 || e->symbolTable->lenHisto[6] + e->symbolTable->lenHisto[7] < 29) &&
              (e->symbolTable->lenHisto[0] < 72 || e->symbolTable->lenHisto[2] < 72)) {
      avoidBranch = true;
   }
   return _compressImpl(e, nlines, lenIn, strIn, size, output, lenOut, strOut, noSuffixOpt, avoidBranch, simd);
}
size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) {
   return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd);
}

// the main compression function (everything automatic)
extern "C" size_t duckdb_fsst_compress(duckdb_fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) {
   // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB)
   size_t totLen = accumulate(lenIn, lenIn+nlines, 0);
   int simd = totLen > nlines*12 && (nlines > 64 || totLen > (size_t) 1<<15);
   return _compressAuto((Encoder*) encoder, nlines, lenIn, strIn, size, output, lenOut, strOut, 3*simd);
}

/* deallocate encoder */
extern "C" void duckdb_fsst_destroy(duckdb_fsst_encoder_t* encoder) {
   Encoder *e = (Encoder*) encoder;
   delete e;
}

/* very lazy implementation relying on export and import */
extern "C" duckdb_fsst_decoder_t duckdb_fsst_decoder(duckdb_fsst_encoder_t *encoder) {
   u8 buf[sizeof(duckdb_fsst_decoder_t)];
   u32 cnt1 = duckdb_fsst_export(encoder, buf);
   duckdb_fsst_decoder_t decoder;
   u32 cnt2 = duckdb_fsst_import(&decoder, buf);
   assert(cnt1 == cnt2); (void) cnt1; (void) cnt2;
   return decoder;
}