| 1 | // this software is distributed under the MIT License (http://www.opensource.org/licenses/MIT): |
| 2 | // |
| 3 | // Copyright 2018-2020, CWI, TU Munich, FSU Jena |
| 4 | // |
| 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files |
| 6 | // (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, |
| 7 | // merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is |
| 8 | // furnished to do so, subject to the following conditions: |
| 9 | // |
| 10 | // - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. |
| 11 | // |
| 12 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
| 13 | // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
| 14 | // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR |
| 15 | // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| 16 | // |
| 17 | // You can contact the authors via the FSST source repository : https://github.com/cwida/fsst |
| 18 | #include "libfsst.hpp" |
| 19 | |
| 20 | #if DUCKDB_FSST_ENABLE_INTRINSINCS && (defined(__x86_64__) || defined(_M_X64)) |
| 21 | #include <immintrin.h> |
| 22 | |
| 23 | #ifdef _WIN32 |
| 24 | bool duckdb_fsst_hasAVX512() { |
| 25 | int info[4]; |
| 26 | __cpuidex(info, 0x00000007, 0); |
| 27 | return (info[1]>>16)&1; |
| 28 | } |
| 29 | #else |
| 30 | #include <cpuid.h> |
| 31 | bool duckdb_fsst_hasAVX512() { |
| 32 | int info[4]; |
| 33 | __cpuid_count(0x00000007, 0, info[0], info[1], info[2], info[3]); |
| 34 | return (info[1]>>16)&1; |
| 35 | } |
| 36 | #endif |
| 37 | #else |
| 38 | bool duckdb_fsst_hasAVX512() { return false; } |
| 39 | #endif |
| 40 | |
| 41 | // BULK COMPRESSION OF STRINGS |
| 42 | // |
| 43 | // In one call of this function, we can compress 512 strings, each of maximum length 511 bytes. |
| 44 | // strings can be shorter than 511 bytes, no problem, but if they are longer we need to cut them up. |
| 45 | // |
| 46 | // In each iteration of the while loop, we find one code in each of the unroll*8 strings, i.e. (8,16,24 or 32) for resp. unroll=1,2,3,4 |
| 47 | // unroll3 performs best on my hardware |
| 48 | // |
| 49 | // In the worst case, each final encoded string occupies 512KB bytes (512*1024; with 1024=512xexception, exception = 2 bytes). |
| 50 | // - hence codeBase is a buffer of 512KB (needs 19 bits jobs), symbolBase of 256KB (needs 18 bits jobs). |
| 51 | // |
| 52 | // 'jobX' controls the encoding of each string and is therefore a u64 with format [out:19][pos:9][end:18][cur:18] (low-to-high bits) |
| 53 | // The field 'pos' tells which string we are processing (0..511). We need this info as strings will complete compressing out-of-order. |
| 54 | // |
| 55 | // Strings will have different lengths, and when a string is finished, we reload from the buffer of 512 input strings. |
| 56 | // This continues until we have less than (8,16,24 or 32; depending on unroll) strings left to process. |
| 57 | // - so 'processed' is the amount of strings we started processing and it is between [480,512]. |
| 58 | // Note that when we quit, there will still be some (<32) strings that we started to process but which are unfinished. |
| 59 | // - so 'unfinished' is that amount. These unfinished strings will be encoded further using the scalar method. |
| 60 | // |
| 61 | // Apart from the coded strings, we return in a output[] array of size 'processed' the job values of the 'finished' strings. |
| 62 | // In the following 'unfinished' slots (processed=finished+unfinished) we output the 'job' values of the unfinished strings. |
| 63 | // |
| 64 | // For the finished strings, we need [out:19] to see the compressed size and [pos:9] to see which string we refer to. |
| 65 | // For the unfinished strings, we need all fields of 'job' to continue the compression with scalar code (see SIMD code in compressBatch). |
| 66 | // |
| 67 | // THIS IS A SEPARATE CODE FILE NOT BECAUSE OF MY LOVE FOR MODULARIZED CODE BUT BECAUSE IT ALLOWS TO COMPILE IT WITH DIFFERENT FLAGS |
| 68 | // in particular, unrolling is crucial for gather/scatter performance, but requires registers. the #define all_* expressions however, |
| 69 | // will be detected to be constants by g++ -O2 and will be precomputed and placed into AVX512 registers - spoiling 9 of them. |
| 70 | // This reduces the effectiveness of unrolling, hence -O2 makes the loop perform worse than -O1 which skips this optimization. |
| 71 | // Assembly inspection confirmed that 3-way unroll with -O1 avoids needless load/stores. |
| 72 | |
| 73 | size_t duckdb_fsst_compressAVX512(SymbolTable &symbolTable, u8* codeBase, u8* symbolBase, SIMDjob *input, SIMDjob *output, size_t n, size_t unroll) { |
| 74 | size_t processed = 0; |
| 75 | // define some constants (all_x means that all 8 lanes contain 64-bits value X) |
| 76 | #if defined(__AVX512F__) and DUCKDB_FSST_ENABLE_INTRINSINCS |
| 77 | //__m512i all_suffixLim= _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) symbolTable->suffixLim)); -- for variants b,c |
| 78 | __m512i all_MASK = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) -1)); |
| 79 | __m512i all_PRIME = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_HASH_PRIME)); |
| 80 | __m512i all_ICL_FREE = _mm512_broadcastq_epi64(_mm_set1_epi64((__m64) (u64) FSST_ICL_FREE)); |
| 81 | #define all_HASH _mm512_srli_epi64(all_MASK, 64-FSST_HASH_LOG2SIZE) |
| 82 | #define all_ONE _mm512_srli_epi64(all_MASK, 63) |
| 83 | #define all_M19 _mm512_srli_epi64(all_MASK, 45) |
| 84 | #define all_M18 _mm512_srli_epi64(all_MASK, 46) |
| 85 | #define all_M28 _mm512_srli_epi64(all_MASK, 36) |
| 86 | #define all_FFFFFF _mm512_srli_epi64(all_MASK, 40) |
| 87 | #define all_FFFF _mm512_srli_epi64(all_MASK, 48) |
| 88 | #define all_FF _mm512_srli_epi64(all_MASK, 56) |
| 89 | |
| 90 | SIMDjob *inputEnd = input+n; |
| 91 | assert(n >= unroll*8 && n <= 512); // should be close to 512 |
| 92 | __m512i job1, job2, job3, job4; // will contain current jobs, for each unroll 1,2,3,4 |
| 93 | __mmask8 loadmask1 = 255, loadmask2 = 255*(unroll>1), loadmask3 = 255*(unroll>2), loadmask4 = 255*(unroll>3); // 2b loaded new strings bitmask per unroll |
| 94 | u32 delta1 = 8, delta2 = 8*(unroll>1), delta3 = 8*(unroll>2), delta4 = 8*(unroll>3); // #new loads this SIMD iteration per unroll |
| 95 | |
| 96 | if (unroll >= 4) { |
| 97 | while (input+delta1+delta2+delta3+delta4 < inputEnd) { |
| 98 | #include "fsst_avx512_unroll4.inc" |
| 99 | } |
| 100 | } else if (unroll == 3) { |
| 101 | while (input+delta1+delta2+delta3 < inputEnd) { |
| 102 | #include "fsst_avx512_unroll3.inc" |
| 103 | } |
| 104 | } else if (unroll == 2) { |
| 105 | while (input+delta1+delta2 < inputEnd) { |
| 106 | #include "fsst_avx512_unroll2.inc" |
| 107 | } |
| 108 | } else { |
| 109 | while (input+delta1 < inputEnd) { |
| 110 | #include "fsst_avx512_unroll1.inc" |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | // flush the job states of the unfinished strings at the end of output[] |
| 115 | processed = n - (inputEnd - input); |
| 116 | u32 unfinished = 0; |
| 117 | if (unroll > 1) { |
| 118 | if (unroll > 2) { |
| 119 | if (unroll > 3) { |
| 120 | _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask4=~loadmask4, job4); |
| 121 | unfinished += _mm_popcnt_u32((int) loadmask4); |
| 122 | } |
| 123 | _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask3=~loadmask3, job3); |
| 124 | unfinished += _mm_popcnt_u32((int) loadmask3); |
| 125 | } |
| 126 | _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask2=~loadmask2, job2); |
| 127 | unfinished += _mm_popcnt_u32((int) loadmask2); |
| 128 | } |
| 129 | _mm512_mask_compressstoreu_epi64(output+unfinished, loadmask1=~loadmask1, job1); |
| 130 | #else |
| 131 | (void) symbolTable; |
| 132 | (void) codeBase; |
| 133 | (void) symbolBase; |
| 134 | (void) input; |
| 135 | (void) output; |
| 136 | (void) n; |
| 137 | (void) unroll; |
| 138 | #endif |
| 139 | return processed; |
| 140 | } |
| 141 | |