1#include <cstring>
2
3#include <Compression/CompressionCodecT64.h>
4#include <Compression/CompressionFactory.h>
5#include <common/unaligned.h>
6#include <Parsers/IAST.h>
7#include <Parsers/ASTLiteral.h>
8#include <IO/WriteHelpers.h>
9
10
11namespace DB
12{
13
14namespace ErrorCodes
15{
16extern const int CANNOT_COMPRESS;
17extern const int CANNOT_DECOMPRESS;
18extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
19extern const int ILLEGAL_CODEC_PARAMETER;
20extern const int LOGICAL_ERROR;
21}
22
23namespace
24{
25
26UInt8 codecId()
27{
28 return static_cast<UInt8>(CompressionMethodByte::T64);
29}
30
31TypeIndex baseType(TypeIndex type_idx)
32{
33 switch (type_idx)
34 {
35 case TypeIndex::Int8:
36 return TypeIndex::Int8;
37 case TypeIndex::Int16:
38 return TypeIndex::Int16;
39 case TypeIndex::Int32:
40 case TypeIndex::Decimal32:
41 return TypeIndex::Int32;
42 case TypeIndex::Int64:
43 case TypeIndex::Decimal64:
44 return TypeIndex::Int64;
45 case TypeIndex::UInt8:
46 case TypeIndex::Enum8:
47 return TypeIndex::UInt8;
48 case TypeIndex::UInt16:
49 case TypeIndex::Enum16:
50 case TypeIndex::Date:
51 return TypeIndex::UInt16;
52 case TypeIndex::UInt32:
53 case TypeIndex::DateTime:
54 return TypeIndex::UInt32;
55 case TypeIndex::UInt64:
56 return TypeIndex::UInt64;
57 default:
58 break;
59 }
60
61 return TypeIndex::Nothing;
62}
63
64TypeIndex typeIdx(const DataTypePtr & data_type)
65{
66 if (!data_type)
67 return TypeIndex::Nothing;
68
69 WhichDataType which(*data_type);
70 switch (which.idx)
71 {
72 case TypeIndex::Int8:
73 case TypeIndex::UInt8:
74 case TypeIndex::Enum8:
75 case TypeIndex::Int16:
76 case TypeIndex::UInt16:
77 case TypeIndex::Enum16:
78 case TypeIndex::Date:
79 case TypeIndex::Int32:
80 case TypeIndex::UInt32:
81 case TypeIndex::DateTime:
82 case TypeIndex::Decimal32:
83 case TypeIndex::Int64:
84 case TypeIndex::UInt64:
85 case TypeIndex::Decimal64:
86 return which.idx;
87 default:
88 break;
89 }
90
91 return TypeIndex::Nothing;
92}
93
94void transpose64x8(UInt64 * src_dst)
95{
96 auto * src8 = reinterpret_cast<const UInt8 *>(src_dst);
97 UInt64 dst[8] = {};
98
99 for (UInt32 i = 0; i < 64; ++i)
100 {
101 UInt64 value = src8[i];
102 dst[0] |= (value & 0x1) << i;
103 dst[1] |= ((value >> 1) & 0x1) << i;
104 dst[2] |= ((value >> 2) & 0x1) << i;
105 dst[3] |= ((value >> 3) & 0x1) << i;
106 dst[4] |= ((value >> 4) & 0x1) << i;
107 dst[5] |= ((value >> 5) & 0x1) << i;
108 dst[6] |= ((value >> 6) & 0x1) << i;
109 dst[7] |= ((value >> 7) & 0x1) << i;
110 }
111
112 memcpy(src_dst, dst, 8 * sizeof(UInt64));
113}
114
115void reverseTranspose64x8(UInt64 * src_dst)
116{
117 UInt8 dst8[64];
118
119 for (UInt32 i = 0; i < 64; ++i)
120 {
121 dst8[i] = ((src_dst[0] >> i) & 0x1)
122 | (((src_dst[1] >> i) & 0x1) << 1)
123 | (((src_dst[2] >> i) & 0x1) << 2)
124 | (((src_dst[3] >> i) & 0x1) << 3)
125 | (((src_dst[4] >> i) & 0x1) << 4)
126 | (((src_dst[5] >> i) & 0x1) << 5)
127 | (((src_dst[6] >> i) & 0x1) << 6)
128 | (((src_dst[7] >> i) & 0x1) << 7);
129 }
130
131 memcpy(src_dst, dst8, 8 * sizeof(UInt64));
132}
133
134template <typename T>
135void transposeBytes(T value, UInt64 * matrix, UInt32 col)
136{
137 UInt8 * matrix8 = reinterpret_cast<UInt8 *>(matrix);
138 const UInt8 * value8 = reinterpret_cast<const UInt8 *>(&value);
139
140 if constexpr (sizeof(T) > 4)
141 {
142 matrix8[64 * 7 + col] = value8[7];
143 matrix8[64 * 6 + col] = value8[6];
144 matrix8[64 * 5 + col] = value8[5];
145 matrix8[64 * 4 + col] = value8[4];
146 }
147
148 if constexpr (sizeof(T) > 2)
149 {
150 matrix8[64 * 3 + col] = value8[3];
151 matrix8[64 * 2 + col] = value8[2];
152 }
153
154 if constexpr (sizeof(T) > 1)
155 matrix8[64 * 1 + col] = value8[1];
156
157 matrix8[64 * 0 + col] = value8[0];
158}
159
160template <typename T>
161void reverseTransposeBytes(const UInt64 * matrix, UInt32 col, T & value)
162{
163 auto * matrix8 = reinterpret_cast<const UInt8 *>(matrix);
164
165 if constexpr (sizeof(T) > 4)
166 {
167 value |= UInt64(matrix8[64 * 7 + col]) << (8 * 7);
168 value |= UInt64(matrix8[64 * 6 + col]) << (8 * 6);
169 value |= UInt64(matrix8[64 * 5 + col]) << (8 * 5);
170 value |= UInt64(matrix8[64 * 4 + col]) << (8 * 4);
171 }
172
173 if constexpr (sizeof(T) > 2)
174 {
175 value |= UInt32(matrix8[64 * 3 + col]) << (8 * 3);
176 value |= UInt32(matrix8[64 * 2 + col]) << (8 * 2);
177 }
178
179 if constexpr (sizeof(T) > 1)
180 value |= UInt32(matrix8[64 * 1 + col]) << (8 * 1);
181
182 value |= UInt32(matrix8[col]);
183}
184
185
186template <typename T>
187void load(const char * src, T * buf, UInt32 tail = 64)
188{
189 memcpy(buf, src, tail * sizeof(T));
190}
191
192template <typename T>
193void store(const T * buf, char * dst, UInt32 tail = 64)
194{
195 memcpy(dst, buf, tail * sizeof(T));
196}
197
198template <typename T>
199void clear(T * buf)
200{
201 for (UInt32 i = 0; i < 64; ++i)
202 buf[i] = 0;
203}
204
205
206/// UIntX[64] -> UInt64[N] transposed matrix, N <= X
207template <typename T, bool full = false>
208void transpose(const T * src, char * dst, UInt32 num_bits, UInt32 tail = 64)
209{
210 UInt32 full_bytes = num_bits / 8;
211 UInt32 part_bits = num_bits % 8;
212
213 UInt64 matrix[64] = {};
214 for (UInt32 col = 0; col < tail; ++col)
215 transposeBytes(src[col], matrix, col);
216
217 if constexpr (full)
218 {
219 UInt64 * matrix_line = matrix;
220 for (UInt32 byte = 0; byte < full_bytes; ++byte, matrix_line += 8)
221 transpose64x8(matrix_line);
222 }
223
224 UInt32 full_size = sizeof(UInt64) * (num_bits - part_bits);
225 memcpy(dst, matrix, full_size);
226 dst += full_size;
227
228 /// transpose only partially filled last byte
229 if (part_bits)
230 {
231 UInt64 * matrix_line = &matrix[full_bytes * 8];
232 transpose64x8(matrix_line);
233 memcpy(dst, matrix_line, part_bits * sizeof(UInt64));
234 }
235}
236
237/// UInt64[N] transposed matrix -> UIntX[64]
238template <typename T, bool full = false>
239void reverseTranspose(const char * src, T * buf, UInt32 num_bits, UInt32 tail = 64)
240{
241 UInt64 matrix[64] = {};
242 memcpy(matrix, src, num_bits * sizeof(UInt64));
243
244 UInt32 full_bytes = num_bits / 8;
245 UInt32 part_bits = num_bits % 8;
246
247 if constexpr (full)
248 {
249 UInt64 * matrix_line = matrix;
250 for (UInt32 byte = 0; byte < full_bytes; ++byte, matrix_line += 8)
251 reverseTranspose64x8(matrix_line);
252 }
253
254 if (part_bits)
255 {
256 UInt64 * matrix_line = &matrix[full_bytes * 8];
257 reverseTranspose64x8(matrix_line);
258 }
259
260 clear(buf);
261 for (UInt32 col = 0; col < tail; ++col)
262 reverseTransposeBytes(matrix, col, buf[col]);
263}
264
265template <typename T, typename MinMaxT = std::conditional_t<is_signed_v<T>, Int64, UInt64>>
266void restoreUpperBits(T * buf, T upper_min, T upper_max [[maybe_unused]], T sign_bit [[maybe_unused]], UInt32 tail = 64)
267{
268 if constexpr (is_signed_v<T>)
269 {
270 /// Restore some data as negatives and others as positives
271 if (sign_bit)
272 {
273 for (UInt32 col = 0; col < tail; ++col)
274 {
275 T & value = buf[col];
276
277 if (value & sign_bit)
278 value |= upper_min;
279 else
280 value |= upper_max;
281 }
282
283 return;
284 }
285 }
286
287 for (UInt32 col = 0; col < tail; ++col)
288 buf[col] |= upper_min;
289}
290
291
292UInt32 getValuableBitsNumber(UInt64 min, UInt64 max)
293{
294 UInt64 diff_bits = min ^ max;
295 if (diff_bits)
296 return 64 - __builtin_clzll(diff_bits);
297 return 0;
298}
299
300UInt32 getValuableBitsNumber(Int64 min, Int64 max)
301{
302 if (min < 0 && max >= 0)
303 {
304 if (min + max >= 0)
305 return getValuableBitsNumber(0ull, UInt64(max)) + 1;
306 else
307 return getValuableBitsNumber(0ull, UInt64(~min)) + 1;
308 }
309 else
310 return getValuableBitsNumber(UInt64(min), UInt64(max));
311}
312
313
314template <typename T>
315void findMinMax(const char * src, UInt32 src_size, T & min, T & max)
316{
317 min = unalignedLoad<T>(src);
318 max = unalignedLoad<T>(src);
319
320 const char * end = src + src_size;
321 for (; src < end; src += sizeof(T))
322 {
323 auto current = unalignedLoad<T>(src);
324 if (current < min)
325 min = current;
326 if (current > max)
327 max = current;
328 }
329}
330
331
332using Variant = CompressionCodecT64::Variant;
333
334template <typename T, bool full>
335UInt32 compressData(const char * src, UInt32 bytes_size, char * dst)
336{
337 using MinMaxType = std::conditional_t<is_signed_v<T>, Int64, UInt64>;
338
339 static constexpr const UInt32 matrix_size = 64;
340 static constexpr const UInt32 header_size = 2 * sizeof(UInt64);
341
342 if (bytes_size % sizeof(T))
343 throw Exception("Cannot compress, data size " + toString(bytes_size) + " is not multiplier of " + toString(sizeof(T)),
344 ErrorCodes::CANNOT_COMPRESS);
345
346 UInt32 src_size = bytes_size / sizeof(T);
347 UInt32 num_full = src_size / matrix_size;
348 UInt32 tail = src_size % matrix_size;
349
350 T min, max;
351 findMinMax<T>(src, bytes_size, min, max);
352 MinMaxType min64 = min;
353 MinMaxType max64 = max;
354
355 /// Write header
356 {
357 memcpy(dst, &min64, sizeof(MinMaxType));
358 memcpy(dst + 8, &max64, sizeof(MinMaxType));
359 dst += header_size;
360 }
361
362 UInt32 num_bits = getValuableBitsNumber(min64, max64);
363 if (!num_bits)
364 return header_size;
365
366 T buf[matrix_size];
367 UInt32 src_shift = sizeof(T) * matrix_size;
368 UInt32 dst_shift = sizeof(UInt64) * num_bits;
369 for (UInt32 i = 0; i < num_full; ++i)
370 {
371 load<T>(src, buf, matrix_size);
372 transpose<T, full>(buf, dst, num_bits);
373 src += src_shift;
374 dst += dst_shift;
375 }
376
377 UInt32 dst_bytes = num_full * dst_shift;
378
379 if (tail)
380 {
381 load<T>(src, buf, tail);
382 transpose<T, full>(buf, dst, num_bits, tail);
383 dst_bytes += dst_shift;
384 }
385
386 return header_size + dst_bytes;
387}
388
389template <typename T, bool full>
390void decompressData(const char * src, UInt32 bytes_size, char * dst, UInt32 uncompressed_size)
391{
392 using MinMaxType = std::conditional_t<is_signed_v<T>, Int64, UInt64>;
393
394 static constexpr const UInt32 matrix_size = 64;
395 static constexpr const UInt32 header_size = 2 * sizeof(UInt64);
396
397 if (bytes_size < header_size)
398 throw Exception("Cannot decompress, data size " + toString(bytes_size) + " is less then T64 header",
399 ErrorCodes::CANNOT_DECOMPRESS);
400
401 if (uncompressed_size % sizeof(T))
402 throw Exception("Cannot decompress, unexpected uncompressed size " + toString(uncompressed_size),
403 ErrorCodes::CANNOT_DECOMPRESS);
404
405 UInt64 num_elements = uncompressed_size / sizeof(T);
406 MinMaxType min;
407 MinMaxType max;
408
409 /// Read header
410 {
411 memcpy(&min, src, sizeof(MinMaxType));
412 memcpy(&max, src + 8, sizeof(MinMaxType));
413 src += header_size;
414 bytes_size -= header_size;
415 }
416
417 UInt32 num_bits = getValuableBitsNumber(min, max);
418 if (!num_bits)
419 {
420 T min_value = min;
421 for (UInt32 i = 0; i < num_elements; ++i, dst += sizeof(T))
422 unalignedStore<T>(dst, min_value);
423 return;
424 }
425
426 UInt32 src_shift = sizeof(UInt64) * num_bits;
427 UInt32 dst_shift = sizeof(T) * matrix_size;
428
429 if (!bytes_size || bytes_size % src_shift)
430 throw Exception("Cannot decompress, data size " + toString(bytes_size) + " is not multiplier of " + toString(src_shift),
431 ErrorCodes::CANNOT_DECOMPRESS);
432
433 UInt32 num_full = bytes_size / src_shift;
434 UInt32 tail = num_elements % matrix_size;
435 if (tail)
436 --num_full;
437
438 T upper_min = 0;
439 T upper_max [[maybe_unused]] = 0;
440 T sign_bit [[maybe_unused]] = 0;
441 if (num_bits < 64)
442 upper_min = UInt64(min) >> num_bits << num_bits;
443
444 if constexpr (is_signed_v<T>)
445 {
446 if (min < 0 && max >= 0 && num_bits < 64)
447 {
448 sign_bit = 1ull << (num_bits - 1);
449 upper_max = UInt64(max) >> num_bits << num_bits;
450 }
451 }
452
453 T buf[matrix_size];
454 for (UInt32 i = 0; i < num_full; ++i)
455 {
456 reverseTranspose<T, full>(src, buf, num_bits);
457 restoreUpperBits(buf, upper_min, upper_max, sign_bit);
458 store<T>(buf, dst, matrix_size);
459 src += src_shift;
460 dst += dst_shift;
461 }
462
463 if (tail)
464 {
465 reverseTranspose<T, full>(src, buf, num_bits, tail);
466 restoreUpperBits(buf, upper_min, upper_max, sign_bit, tail);
467 store<T>(buf, dst, tail);
468 }
469}
470
471template <typename T>
472UInt32 compressData(const char * src, UInt32 src_size, char * dst, Variant variant)
473{
474 if (variant == Variant::Bit)
475 return compressData<T, true>(src, src_size, dst);
476 return compressData<T, false>(src, src_size, dst);
477}
478
479template <typename T>
480void decompressData(const char * src, UInt32 src_size, char * dst, UInt32 uncompressed_size, Variant variant)
481{
482 if (variant == Variant::Bit)
483 decompressData<T, true>(src, src_size, dst, uncompressed_size);
484 else
485 decompressData<T, false>(src, src_size, dst, uncompressed_size);
486}
487
488}
489
490
491UInt32 CompressionCodecT64::doCompressData(const char * src, UInt32 src_size, char * dst) const
492{
493 UInt8 cookie = static_cast<UInt8>(type_idx) | (static_cast<UInt8>(variant) << 7);
494 memcpy(dst, &cookie, 1);
495 dst += 1;
496
497 switch (baseType(type_idx))
498 {
499 case TypeIndex::Int8:
500 return 1 + compressData<Int8>(src, src_size, dst, variant);
501 case TypeIndex::Int16:
502 return 1 + compressData<Int16>(src, src_size, dst, variant);
503 case TypeIndex::Int32:
504 return 1 + compressData<Int32>(src, src_size, dst, variant);
505 case TypeIndex::Int64:
506 return 1 + compressData<Int64>(src, src_size, dst, variant);
507 case TypeIndex::UInt8:
508 return 1 + compressData<UInt8>(src, src_size, dst, variant);
509 case TypeIndex::UInt16:
510 return 1 + compressData<UInt16>(src, src_size, dst, variant);
511 case TypeIndex::UInt32:
512 return 1 + compressData<UInt32>(src, src_size, dst, variant);
513 case TypeIndex::UInt64:
514 return 1 + compressData<UInt64>(src, src_size, dst, variant);
515 default:
516 break;
517 }
518
519 throw Exception("Connot compress with T64", ErrorCodes::CANNOT_COMPRESS);
520}
521
522void CompressionCodecT64::doDecompressData(const char * src, UInt32 src_size, char * dst, UInt32 uncompressed_size) const
523{
524 if (!src_size)
525 throw Exception("Connot decompress with T64", ErrorCodes::CANNOT_DECOMPRESS);
526
527 UInt8 cookie = unalignedLoad<UInt8>(src);
528 src += 1;
529 src_size -= 1;
530
531 auto saved_variant = static_cast<Variant>(cookie >> 7);
532 auto saved_type_id = static_cast<TypeIndex>(cookie & 0x7F);
533
534 switch (baseType(saved_type_id))
535 {
536 case TypeIndex::Int8:
537 return decompressData<Int8>(src, src_size, dst, uncompressed_size, saved_variant);
538 case TypeIndex::Int16:
539 return decompressData<Int16>(src, src_size, dst, uncompressed_size, saved_variant);
540 case TypeIndex::Int32:
541 return decompressData<Int32>(src, src_size, dst, uncompressed_size, saved_variant);
542 case TypeIndex::Int64:
543 return decompressData<Int64>(src, src_size, dst, uncompressed_size, saved_variant);
544 case TypeIndex::UInt8:
545 return decompressData<UInt8>(src, src_size, dst, uncompressed_size, saved_variant);
546 case TypeIndex::UInt16:
547 return decompressData<UInt16>(src, src_size, dst, uncompressed_size, saved_variant);
548 case TypeIndex::UInt32:
549 return decompressData<UInt32>(src, src_size, dst, uncompressed_size, saved_variant);
550 case TypeIndex::UInt64:
551 return decompressData<UInt64>(src, src_size, dst, uncompressed_size, saved_variant);
552 default:
553 break;
554 }
555
556 throw Exception("Connot decompress with T64", ErrorCodes::CANNOT_DECOMPRESS);
557}
558
559void CompressionCodecT64::useInfoAboutType(DataTypePtr data_type)
560{
561 if (data_type)
562 {
563 type_idx = typeIdx(data_type);
564 if (type_idx == TypeIndex::Nothing)
565 throw Exception("T64 codec is not supported for specified type", ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
566 }
567}
568
569UInt8 CompressionCodecT64::getMethodByte() const
570{
571 return codecId();
572}
573
574void registerCodecT64(CompressionCodecFactory & factory)
575{
576 auto reg_func = [&](const ASTPtr & arguments, DataTypePtr type) -> CompressionCodecPtr
577 {
578 Variant variant = Variant::Byte;
579
580 if (arguments && !arguments->children.empty())
581 {
582 if (arguments->children.size() > 1)
583 throw Exception("T64 support zero or one parameter, given " + std::to_string(arguments->children.size()),
584 ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
585
586 const auto children = arguments->children;
587 const auto * literal = children[0]->as<ASTLiteral>();
588 if (!literal)
589 throw Exception("Wrong modification for T64. Expected: 'bit', 'byte')",
590 ErrorCodes::ILLEGAL_CODEC_PARAMETER);
591 String name = literal->value.safeGet<String>();
592
593 if (name == "byte")
594 variant = Variant::Byte;
595 else if (name == "bit")
596 variant = Variant::Bit;
597 else
598 throw Exception("Wrong modification for T64: " + name, ErrorCodes::ILLEGAL_CODEC_PARAMETER);
599 }
600
601 auto type_idx = typeIdx(type);
602 if (type && type_idx == TypeIndex::Nothing)
603 throw Exception("T64 codec is not supported for specified type", ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
604 return std::make_shared<CompressionCodecT64>(type_idx, variant);
605 };
606
607 factory.registerCompressionCodecWithType("T64", codecId(), reg_func);
608}
609}
610