| 1 | // this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): |
| 2 | // |
| 3 | // Copyright 2018-2020, CWI, TU Munich, FSU Jena |
| 4 | // |
| 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files |
| 6 | // (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, |
| 7 | // merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is |
| 8 | // furnished to do so, subject to the following conditions: |
| 9 | // |
| 10 | // - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 11 | // |
| 12 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 13 | // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
| 14 | // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR |
| 15 | // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 16 | // |
| 17 | // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst |
| 18 | #include <algorithm> |
| 19 | #include <cassert> |
| 20 | #include <cstring> |
| 21 | #include <fstream> |
| 22 | #include <iostream> |
| 23 | #include <numeric> |
| 24 | #include <memory> |
| 25 | #include <queue> |
| 26 | #include <string> |
| 27 | #include <unordered_set> |
| 28 | #include <vector> |
| 29 | #include <sys/types.h> |
| 30 | #include <sys/stat.h> |
| 31 | #include <fcntl.h> |
| 32 | #include <stddef.h> |
| 33 | |
| 34 | using namespace std; |
| 35 | |
| 36 | #include "fsst.h" // the official FSST API -- also usable by C mortals |
| 37 | |
| 38 | /* unsigned integers */ |
| 39 | typedef uint8_t u8; |
| 40 | typedef uint16_t u16; |
| 41 | typedef uint32_t u32; |
| 42 | typedef uint64_t u64; |
| 43 | |
| 44 | inline uint64_t fsst_unaligned_load(u8 const* V) { |
| 45 | uint64_t Ret; |
| 46 | memcpy(dest: &Ret, src: V, n: sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible) |
| 47 | return Ret; |
| 48 | } |
| 49 | |
| 50 | #define FSST_ENDIAN_MARKER ((u64) 1) |
| 51 | #define FSST_VERSION_20190218 20190218 |
| 52 | #define FSST_VERSION ((u64) FSST_VERSION_20190218) |
| 53 | |
| 54 | // "symbols" are character sequences (up to 8 bytes) |
| 55 | // A symbol is compressed into a "code" of, in principle, one byte. But, we added an exception mechanism: |
| 56 | // byte 255 followed by byte X represents the single-byte symbol X. Its code is 256+X. |
| 57 | |
| 58 | // we represent codes in u16 (not u8). 12 bits code (of which 10 are used), 4 bits length |
| 59 | #define FSST_LEN_BITS 12 |
| 60 | #define FSST_CODE_BITS 9 |
| 61 | #define FSST_CODE_BASE 256UL /* first 256 codes [0,255] are pseudo codes: escaped bytes */ |
| 62 | #define FSST_CODE_MAX (1UL<<FSST_CODE_BITS) /* all bits set: indicating a symbol that has not been assigned a code yet */ |
| 63 | #define FSST_CODE_MASK (FSST_CODE_MAX-1UL) /* all bits set: indicating a symbol that has not been assigned a code yet */ |
| 64 | |
| 65 | struct Symbol { |
| 66 | static const unsigned maxLength = 8; |
| 67 | |
| 68 | // the byte sequence that this symbol stands for |
| 69 | union { char str[maxLength]; u64 num; } val; // usually we process it as a num(ber), as this is fast |
| 70 | |
| 71 | // icl = u64 ignoredBits:16,code:12,length:4,unused:32 -- but we avoid exposing this bit-field notation |
| 72 | u64 icl; // use a single u64 to be sure "code" is accessed with one load and can be compared with one comparison |
| 73 | |
| 74 | Symbol() : icl(0) { val.num = 0; } |
| 75 | |
| 76 | explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { val.num = c; } // single-char symbol |
| 77 | explicit Symbol(const char* begin, const char* end) : Symbol(begin, (u32) (end-begin)) {} |
| 78 | explicit Symbol(u8* begin, u8* end) : Symbol((const char*)begin, (u32) (end-begin)) {} |
| 79 | explicit Symbol(const char* input, u32 len) { |
| 80 | val.num = 0; |
| 81 | if (len>=8) { |
| 82 | len = 8; |
| 83 | memcpy(dest: val.str, src: input, n: 8); |
| 84 | } else { |
| 85 | memcpy(dest: val.str, src: input, n: len); |
| 86 | } |
| 87 | set_code_len(FSST_CODE_MAX, len); |
| 88 | } |
| 89 | void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } |
| 90 | |
| 91 | u32 length() const { return (u32) (icl >> 28); } |
| 92 | u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } |
| 93 | u32 ignoredBits() const { return (u32) icl; } |
| 94 | |
| 95 | u8 first() const { assert( length() >= 1); return 0xFF & val.num; } |
| 96 | u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; } |
| 97 | |
| 98 | #define FSST_HASH_LOG2SIZE 10 |
| 99 | #define FSST_HASH_PRIME 2971215073LL |
| 100 | #define FSST_SHIFT 15 |
| 101 | #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) |
| 102 | size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes |
| 103 | }; |
| 104 | |
| 105 | // Symbol that can be put in a queue, ordered on gain |
| 106 | struct QSymbol{ |
| 107 | Symbol symbol; |
| 108 | mutable u32 gain; // mutable because gain value should be ignored in find() on unordered_set of QSymbols |
| 109 | bool operator==(const QSymbol& other) const { return symbol.val.num == other.symbol.val.num && symbol.length() == other.symbol.length(); } |
| 110 | }; |
| 111 | |
| 112 | // we construct FSST symbol tables using a random sample of about 16KB (1<<14) |
| 113 | #define FSST_SAMPLETARGET (1<<14) |
| 114 | #define FSST_SAMPLEMAXSZ ((long) 2*FSST_SAMPLETARGET) |
| 115 | |
| 116 | // two phases of compression, before and after optimize(): |
| 117 | // |
| 118 | // (1) to encode values we probe (and maintain) three datastructures: |
| 119 | // - u16 byteCodes[65536] array at the position of the next byte (s.length==1) |
| 120 | // - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) |
| 121 | // - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), |
| 122 | // this search will yield a u16 code, it points into Symbol symbols[]. You always find a hit, because the first 256 codes are |
| 123 | // pseudo codes representing a single byte these will become escapes) |
| 124 | // |
| 125 | // (2) when we finished looking for the best symbol table we call optimize() to reshape it: |
| 126 | // - it renumbers the codes by length (first symbols of length 2,3,4,5,6,7,8; then 1 (starting from byteLim are symbols of length 1) |
| 127 | // length 2 codes for which no longer suffix symbol exists (< suffixLim) come first among the 2-byte codes |
| 128 | // (allows shortcut during compression) |
| 129 | // - for each two-byte combination, in all unused slots of shortCodes[], it enters the byteCode[] of the symbol corresponding |
| 130 | // to the first byte (if such a single-byte symbol exists). This allows us to just probe the next two bytes (if there is only one |
| 131 | // byte left in the string, there is still a terminator-byte added during compression) in shortCodes[]. That is, byteCodes[] |
| 132 | // and its codepath is no longer required. This makes compression faster. The reason we use byteCodes[] during symbolTable construction |
| 133 | // is that adding a new code/symbol is expensive (you have to touch shortCodes[] in 256 places). This optimization was |
| 134 | // hence added to make symbolTable construction faster. |
| 135 | // |
| 136 | // this final layout allows for the fastest compression code, only currently present in compressBulk |
| 137 | |
| 138 | // in the hash table, the icl field contains (low-to-high) ignoredBits:16,code:12,length:4 |
| 139 | #define FSST_ICL_FREE ((15<<28)|(((u32)FSST_CODE_MASK)<<16)) // high bits of icl (len=8,code=FSST_CODE_MASK) indicates free bucket |
| 140 | |
| 141 | // ignoredBits is (8-length)*8, which is the amount of high bits to zero in the input word before comparing with the hashtable key |
| 142 | // ..it could of course be computed from len during lookup, but storing it precomputed in some loose bits is faster |
| 143 | // |
| 144 | // the gain field is only used in the symbol queue that sorts symbols on gain |
| 145 | |
| 146 | struct SymbolTable { |
| 147 | static const u32 hashTabSize = 1<<FSST_HASH_LOG2SIZE; // smallest size that incurs no precision loss |
| 148 | |
| 149 | // lookup table using the next two bytes (65536 codes), or just the next single byte |
| 150 | u16 shortCodes[65536]; // contains code for 2-byte symbol, otherwise code for pseudo byte (escaped byte) |
| 151 | |
| 152 | // lookup table (only used during symbolTable construction, not during normal text compression) |
| 153 | u16 byteCodes[256]; // contains code for every 1-byte symbol, otherwise code for pseudo byte (escaped byte) |
| 154 | |
| 155 | // 'symbols' is the current symbol table symbol[code].symbol is the max 8-byte 'symbol' for single-byte 'code' |
| 156 | Symbol symbols[FSST_CODE_MAX]; // x in [0,255]: pseudo symbols representing escaped byte x; x in [FSST_CODE_BASE=256,256+nSymbols]: real symbols |
| 157 | |
| 158 | // replicate long symbols in hashTab (avoid indirection). |
| 159 | Symbol hashTab[hashTabSize]; // used for all symbols of 3 and more bytes |
| 160 | |
| 161 | u16 nSymbols; // amount of symbols in the map (max 255) |
| 162 | u16 suffixLim; // codes higher than this do not have a longer suffix |
| 163 | u16 terminator; // code of 1-byte symbol, that can be used as a terminator during compression |
| 164 | bool zeroTerminated; // whether we are expecting zero-terminated strings (we then also produce zero-terminated compressed strings) |
| 165 | u16 lenHisto[FSST_CODE_BITS]; // lenHisto[x] is the amount of symbols of byte-length (x+1) in this SymbolTable |
| 166 | |
| 167 | SymbolTable() : nSymbols(0), suffixLim(FSST_CODE_MAX), terminator(0), zeroTerminated(false) { |
| 168 | // stuff done once at startup |
| 169 | for (u32 i=0; i<256; i++) { |
| 170 | symbols[i] = Symbol(i,i|(1<<FSST_LEN_BITS)); // pseudo symbols |
| 171 | } |
| 172 | Symbol unused = Symbol((u8) 0,FSST_CODE_MASK); // single-char symbol, exception code |
| 173 | for (u32 i=256; i<FSST_CODE_MAX; i++) { |
| 174 | symbols[i] = unused; // we start with all symbols unused |
| 175 | } |
| 176 | // empty hash table |
| 177 | Symbol s; |
| 178 | s.val.num = 0; |
| 179 | s.icl = FSST_ICL_FREE; //marks empty in hashtab |
| 180 | for(u32 i=0; i<hashTabSize; i++) |
| 181 | hashTab[i] = s; |
| 182 | |
| 183 | // fill byteCodes[] with the pseudo code all bytes (escaped bytes) |
| 184 | for(u32 i=0; i<256; i++) |
| 185 | byteCodes[i] = (1<<FSST_LEN_BITS) | i; |
| 186 | |
| 187 | // fill shortCodes[] with the pseudo code for the first byte of each two-byte pattern |
| 188 | for(u32 i=0; i<65536; i++) |
| 189 | shortCodes[i] = (1<<FSST_LEN_BITS) | (i&255); |
| 190 | |
| 191 | memset(s: lenHisto, c: 0, n: sizeof(lenHisto)); // all unused |
| 192 | } |
| 193 | |
| 194 | void clear() { |
| 195 | // clear a symbolTable with minimal effort (only erase the used positions in it) |
| 196 | memset(s: lenHisto, c: 0, n: sizeof(lenHisto)); // all unused |
| 197 | for(u32 i=FSST_CODE_BASE; i<FSST_CODE_BASE+nSymbols; i++) { |
| 198 | if (symbols[i].length() == 1) { |
| 199 | u16 val = symbols[i].first(); |
| 200 | byteCodes[val] = (1<<FSST_LEN_BITS) | val; |
| 201 | } else if (symbols[i].length() == 2) { |
| 202 | u16 val = symbols[i].first2(); |
| 203 | shortCodes[val] = (1<<FSST_LEN_BITS) | (val&255); |
| 204 | } else { |
| 205 | u32 idx = symbols[i].hash() & (hashTabSize-1); |
| 206 | hashTab[idx].val.num = 0; |
| 207 | hashTab[idx].icl = FSST_ICL_FREE; //marks empty in hashtab |
| 208 | } |
| 209 | } |
| 210 | nSymbols = 0; // no need to clean symbols[] as no symbols are used |
| 211 | } |
| 212 | bool hashInsert(Symbol s) { |
| 213 | u32 idx = s.hash() & (hashTabSize-1); |
| 214 | bool taken = (hashTab[idx].icl < FSST_ICL_FREE); |
| 215 | if (taken) return false; // collision in hash table |
| 216 | hashTab[idx].icl = s.icl; |
| 217 | hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); |
| 218 | return true; |
| 219 | } |
| 220 | bool add(Symbol s) { |
| 221 | assert(FSST_CODE_BASE + nSymbols < FSST_CODE_MAX); |
| 222 | u32 len = s.length(); |
| 223 | s.set_code_len(FSST_CODE_BASE + nSymbols, len); |
| 224 | if (len == 1) { |
| 225 | byteCodes[s.first()] = FSST_CODE_BASE + nSymbols + (1<<FSST_LEN_BITS); // len=1 (<<FSST_LEN_BITS) |
| 226 | } else if (len == 2) { |
| 227 | shortCodes[s.first2()] = FSST_CODE_BASE + nSymbols + (2<<FSST_LEN_BITS); // len=2 (<<FSST_LEN_BITS) |
| 228 | } else if (!hashInsert(s)) { |
| 229 | return false; |
| 230 | } |
| 231 | symbols[FSST_CODE_BASE + nSymbols++] = s; |
| 232 | lenHisto[len-1]++; |
| 233 | return true; |
| 234 | } |
| 235 | /// Find longest expansion, return code (= position in symbol table) |
| 236 | u16 findLongestSymbol(Symbol s) const { |
| 237 | size_t idx = s.hash() & (hashTabSize-1); |
| 238 | if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { |
| 239 | return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol |
| 240 | } |
| 241 | if (s.length() >= 2) { |
| 242 | u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; |
| 243 | if (code >= FSST_CODE_BASE) return code; |
| 244 | } |
| 245 | return byteCodes[s.first()] & FSST_CODE_MASK; |
| 246 | } |
| 247 | u16 findLongestSymbol(u8* cur, u8* end) const { |
| 248 | return findLongestSymbol(s: Symbol(cur,end)); // represent the string as a temporary symbol |
| 249 | } |
| 250 | |
| 251 | // rationale for finalize: |
| 252 | // - during symbol table construction, we may create more than 256 codes, but bring it down to max 255 in the last makeTable() |
| 253 | // consequently we needed more than 8 bits during symbol table contruction, but can simplify the codes to single bytes in finalize() |
| 254 | // (this feature is in fact lo longer used, but could still be exploited: symbol construction creates no more than 255 symbols in each pass) |
| 255 | // - we not only reduce the amount of codes to <255, but also *reorder* the symbols and renumber their codes, for higher compression perf. |
| 256 | // we renumber codes so they are grouped by length, to allow optimized scalar string compression (byteLim and suffixLim optimizations). |
| 257 | // - we make the use of byteCode[] no longer necessary by inserting single-byte codes in the free spots of shortCodes[] |
| 258 | // Using shortCodes[] only makes compression faster. When creating the symbolTable, however, using shortCodes[] for the single-byte |
| 259 | // symbols is slow, as each insert touches 256 positions in it. This optimization was added when optimizing symbolTable construction time. |
| 260 | // |
| 261 | // In all, we change the layout and coding, as follows.. |
| 262 | // |
| 263 | // before finalize(): |
| 264 | // - The real symbols are symbols[256..256+nSymbols>. As we may have nSymbols > 255 |
| 265 | // - The first 256 codes are pseudo symbols (all escaped bytes) |
| 266 | // |
| 267 | // after finalize(): |
| 268 | // - table layout is symbols[0..nSymbols>, with nSymbols < 256. |
| 269 | // - Real codes are [0,nSymbols>. 8-th bit not set. |
| 270 | // - Escapes in shortCodes have the 8th bit set (value: 256+255=511). 255 because the code to be emitted is the escape byte 255 |
| 271 | // - symbols are grouped by length: 2,3,4,5,6,7,8, then 1 (single-byte codes last) |
| 272 | // the two-byte codes are split in two sections: |
| 273 | // - first section contains codes for symbols for which there is no longer symbol (no suffix). It allows an early-out during compression |
| 274 | // |
| 275 | // finally, shortCodes[] is modified to also encode all single-byte symbols (hence byteCodes[] is not required on a critical path anymore). |
| 276 | // |
| 277 | void finalize(u8 zeroTerminated) { |
| 278 | assert(nSymbols <= 255); |
| 279 | u8 newCode[256], rsum[8], byteLim = nSymbols - (lenHisto[0] - zeroTerminated); |
| 280 | |
| 281 | // compute running sum of code lengths (starting offsets for each length) |
| 282 | rsum[0] = byteLim; // 1-byte codes are highest |
| 283 | rsum[1] = zeroTerminated; |
| 284 | for(u32 i=1; i<7; i++) |
| 285 | rsum[i+1] = rsum[i] + lenHisto[i]; |
| 286 | |
| 287 | // determine the new code for each symbol, ordered by length (and splitting 2byte symbols into two classes around suffixLim) |
| 288 | suffixLim = rsum[1]; |
| 289 | symbols[newCode[0] = 0] = symbols[256]; // keep symbol 0 in place (for zeroTerminated cases only) |
| 290 | |
| 291 | for(u32 i=zeroTerminated, j=rsum[2]; i<nSymbols; i++) { |
| 292 | Symbol s1 = symbols[FSST_CODE_BASE+i]; |
| 293 | u32 len = s1.length(), opt = (len == 2)*nSymbols; |
| 294 | if (opt) { |
| 295 | u16 first2 = s1.first2(); |
| 296 | for(u32 k=0; k<opt; k++) { |
| 297 | Symbol s2 = symbols[FSST_CODE_BASE+k]; |
| 298 | if (k != i && s2.length() > 1 && first2 == s2.first2()) // test if symbol k is a suffix of s |
| 299 | opt = 0; |
| 300 | } |
| 301 | newCode[i] = opt?suffixLim++:--j; // symbols without a larger suffix have a code < suffixLim |
| 302 | } else |
| 303 | newCode[i] = rsum[len-1]++; |
| 304 | s1.set_code_len(code: newCode[i],len); |
| 305 | symbols[newCode[i]] = s1; |
| 306 | } |
| 307 | // renumber the codes in byteCodes[] |
| 308 | for(u32 i=0; i<256; i++) |
| 309 | if ((byteCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) |
| 310 | byteCodes[i] = newCode[(u8) byteCodes[i]] + (1 << FSST_LEN_BITS); |
| 311 | else |
| 312 | byteCodes[i] = 511 + (1 << FSST_LEN_BITS); |
| 313 | |
| 314 | // renumber the codes in shortCodes[] |
| 315 | for(u32 i=0; i<65536; i++) |
| 316 | if ((shortCodes[i] & FSST_CODE_MASK) >= FSST_CODE_BASE) |
| 317 | shortCodes[i] = newCode[(u8) shortCodes[i]] + (shortCodes[i] & (15 << FSST_LEN_BITS)); |
| 318 | else |
| 319 | shortCodes[i] = byteCodes[i&0xFF]; |
| 320 | |
| 321 | // replace the symbols in the hash table |
| 322 | for(u32 i=0; i<hashTabSize; i++) |
| 323 | if (hashTab[i].icl < FSST_ICL_FREE) |
| 324 | hashTab[i] = symbols[newCode[(u8) hashTab[i].code()]]; |
| 325 | } |
| 326 | }; |
| 327 | |
| 328 | #ifdef NONOPT_FSST |
| 329 | struct Counters { |
| 330 | u16 count1[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample |
| 331 | u16 count2[FSST_CODE_MAX][FSST_CODE_MAX]; // array to count subsequent combinations of two symbols in the sample |
| 332 | |
| 333 | void count1Set(u32 pos1, u16 val) { |
| 334 | count1[pos1] = val; |
| 335 | } |
| 336 | void count1Inc(u32 pos1) { |
| 337 | count1[pos1]++; |
| 338 | } |
| 339 | void count2Inc(u32 pos1, u32 pos2) { |
| 340 | count2[pos1][pos2]++; |
| 341 | } |
| 342 | u32 count1GetNext(u32 &pos1) { |
| 343 | return count1[pos1]; |
| 344 | } |
| 345 | u32 count2GetNext(u32 pos1, u32 &pos2) { |
| 346 | return count2[pos1][pos2]; |
| 347 | } |
| 348 | void backup1(u8 *buf) { |
| 349 | memcpy(buf, count1, FSST_CODE_MAX*sizeof(u16)); |
| 350 | } |
| 351 | void restore1(u8 *buf) { |
| 352 | memcpy(count1, buf, FSST_CODE_MAX*sizeof(u16)); |
| 353 | } |
| 354 | }; |
| 355 | #else |
| 356 | // we keep two counters count1[pos] and count2[pos1][pos2] of resp 16 and 12-bits. Both are split into two columns for performance reasons |
| 357 | // first reason is to make the column we update the most during symbolTable construction (the low bits) thinner, thus reducing CPU cache pressure. |
| 358 | // second reason is that when scanning the array, after seeing a 64-bits 0 in the high bits column, we can quickly skip over many codes (15 or 7) |
| 359 | struct Counters { |
| 360 | // high arrays come before low arrays, because our GetNext() methods may overrun their 64-bits reads a few bytes |
| 361 | u8 count1High[FSST_CODE_MAX]; // array to count frequency of symbols as they occur in the sample (16-bits) |
| 362 | u8 count1Low[FSST_CODE_MAX]; // it is split in a low and high byte: cnt = count1High*256 + count1Low |
| 363 | u8 count2High[FSST_CODE_MAX][FSST_CODE_MAX/2]; // array to count subsequent combinations of two symbols in the sample (12-bits: 8-bits low, 4-bits high) |
| 364 | u8 count2Low[FSST_CODE_MAX][FSST_CODE_MAX]; // its value is (count2High*256+count2Low) -- but high is 4-bits (we put two numbers in one, hence /2) |
| 365 | // 385KB -- but hot area likely just 10 + 30*4 = 130 cache lines (=8KB) |
| 366 | |
| 367 | void count1Set(u32 pos1, u16 val) { |
| 368 | count1Low[pos1] = val&255; |
| 369 | count1High[pos1] = val>>8; |
| 370 | } |
| 371 | void count1Inc(u32 pos1) { |
| 372 | if (!count1Low[pos1]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) |
| 373 | count1High[pos1]++; //(0,0)->(1,1)->..->(255,1)->(0,1)->(1,2)->(2,2)->(3,2)..(255,2)->(0,2)->(1,3)->(2,3)... |
| 374 | } |
| 375 | void count2Inc(u32 pos1, u32 pos2) { |
| 376 | if (!count2Low[pos1][pos2]++) // increment high early (when low==0, not when low==255). This means (high > 0) <=> (cnt > 0) |
| 377 | // inc 4-bits high counter with 1<<0 (1) or 1<<4 (16) -- depending on whether pos2 is even or odd, repectively |
| 378 | count2High[pos1][(pos2)>>1] += 1 << (((pos2)&1)<<2); // we take our chances with overflow.. (4K maxval, on a 8K sample) |
| 379 | } |
| 380 | u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range |
| 381 | // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros |
| 382 | u64 high = fsst_unaligned_load(V: &count1High[pos1]); |
| 383 | |
| 384 | u32 zero = high?(__builtin_ctzl(high)>>3):7UL; // number of zero bytes |
| 385 | high = (high >> (zero << 3)) & 255; // advance to nonzero counter |
| 386 | if (((pos1 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 |
| 387 | return 0; // all zero |
| 388 | |
| 389 | u32 low = count1Low[pos1]; |
| 390 | if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) |
| 391 | return (u32) ((high << 8) + low); |
| 392 | } |
| 393 | u32 count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range |
| 394 | // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros |
| 395 | u64 high = fsst_unaligned_load(V: &count2High[pos1][pos2>>1]); |
| 396 | high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters |
| 397 | |
| 398 | u32 zero = high?(__builtin_ctzl(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters |
| 399 | high = (high >> (zero << 2)) & 15; // advance to nonzero counter |
| 400 | if (((pos2 += zero) >= FSST_CODE_MAX) || !high) // SKIP! advance pos2 |
| 401 | return 0UL; // all zero |
| 402 | |
| 403 | u32 low = count2Low[pos1][pos2]; |
| 404 | if (low) high--; // high is incremented early and low late, so decrement high (unless low==0) |
| 405 | return (u32) ((high << 8) + low); |
| 406 | } |
| 407 | void backup1(u8 *buf) { |
| 408 | memcpy(dest: buf, src: count1High, FSST_CODE_MAX); |
| 409 | memcpy(dest: buf+FSST_CODE_MAX, src: count1Low, FSST_CODE_MAX); |
| 410 | } |
| 411 | void restore1(u8 *buf) { |
| 412 | memcpy(dest: count1High, src: buf, FSST_CODE_MAX); |
| 413 | memcpy(dest: count1Low, src: buf+FSST_CODE_MAX, FSST_CODE_MAX); |
| 414 | } |
| 415 | }; |
| 416 | #endif |
| 417 | |
| 418 | |
| 419 | #define FSST_BUFSZ (3<<19) // 768KB |
| 420 | |
| 421 | // an encoder is a symbolmap plus some bufferspace, needed during map construction as well as compression |
| 422 | struct Encoder { |
| 423 | shared_ptr<SymbolTable> symbolTable; // symbols, plus metadata and data structures for quick compression (shortCode,hashTab, etc) |
| 424 | union { |
| 425 | Counters counters; // for counting symbol occurences during map construction |
| 426 | u8 simdbuf[FSST_BUFSZ]; // for compression: SIMD string staging area 768KB = 256KB in + 512KB out (worst case for 256KB in) |
| 427 | }; |
| 428 | }; |
| 429 | |
| 430 | // job control integer representable in one 64bits SIMD lane: cur/end=input, out=output, pos=which string (2^9=512 per call) |
| 431 | struct SIMDjob { |
| 432 | u64 out:19,pos:9,end:18,cur:18; // cur/end is input offsets (2^18=256KB), out is output offset (2^19=512KB) |
| 433 | }; |
| 434 | |
| 435 | extern bool |
| 436 | duckdb_fsst_hasAVX512(); // runtime check for avx512 capability |
| 437 | |
| 438 | extern size_t |
| 439 | duckdb_fsst_compressAVX512( |
| 440 | SymbolTable &symbolTable, |
| 441 | u8* codeBase, // IN: base address for codes, i.e. compression output (points to simdbuf+256KB) |
| 442 | u8* symbolBase, // IN: base address for string bytes, i.e. compression input (points to simdbuf) |
| 443 | SIMDjob* input, // IN: input array (size n) with job information: what to encode, where to store it. |
| 444 | SIMDjob* output, // OUT: output array (size n) with job information: how much got encoded, end output pointer. |
| 445 | size_t n, // IN: size of arrays input and output (should be max 512) |
| 446 | size_t unroll); // IN: degree of SIMD unrolling |
| 447 | |
| 448 | // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) |
| 449 | size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); |
| 450 | size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); |
| 451 | |