CompressionCodecT64.cpp source code [ClickHouse/dbms/src/Compression/CompressionCodecT64.cpp]

1	#include <cstring>
2
3	#include <Compression/CompressionCodecT64.h>
4	#include <Compression/CompressionFactory.h>
5	#include <common/unaligned.h>
6	#include <Parsers/IAST.h>
7	#include <Parsers/ASTLiteral.h>
8	#include <IO/WriteHelpers.h>
9
10
11	namespace DB
12	{
13
14	namespace ErrorCodes
15	{
16	extern const int CANNOT_COMPRESS;
17	extern const int CANNOT_DECOMPRESS;
18	extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
19	extern const int ILLEGAL_CODEC_PARAMETER;
20	extern const int LOGICAL_ERROR;
21	}
22
23	namespace
24	{
25
26	UInt8 codecId()
27	{
28	return static_cast<UInt8>(CompressionMethodByte::T64);
29	}
30
31	TypeIndex baseType(TypeIndex type_idx)
32	{
33	switch (type_idx)
34	{
35	case TypeIndex::Int8:
36	return TypeIndex::Int8;
37	case TypeIndex::Int16:
38	return TypeIndex::Int16;
39	case TypeIndex::Int32:
40	case TypeIndex::Decimal32:
41	return TypeIndex::Int32;
42	case TypeIndex::Int64:
43	case TypeIndex::Decimal64:
44	return TypeIndex::Int64;
45	case TypeIndex::UInt8:
46	case TypeIndex::Enum8:
47	return TypeIndex::UInt8;
48	case TypeIndex::UInt16:
49	case TypeIndex::Enum16:
50	case TypeIndex::Date:
51	return TypeIndex::UInt16;
52	case TypeIndex::UInt32:
53	case TypeIndex::DateTime:
54	return TypeIndex::UInt32;
55	case TypeIndex::UInt64:
56	return TypeIndex::UInt64;
57	default:
58	break;
59	}
60
61	return TypeIndex::Nothing;
62	}
63
64	TypeIndex typeIdx(const DataTypePtr & data_type)
65	{
66	if (!data_type)
67	return TypeIndex::Nothing;
68
69	WhichDataType which(*data_type);
70	switch (which.idx)
71	{
72	case TypeIndex::Int8:
73	case TypeIndex::UInt8:
74	case TypeIndex::Enum8:
75	case TypeIndex::Int16:
76	case TypeIndex::UInt16:
77	case TypeIndex::Enum16:
78	case TypeIndex::Date:
79	case TypeIndex::Int32:
80	case TypeIndex::UInt32:
81	case TypeIndex::DateTime:
82	case TypeIndex::Decimal32:
83	case TypeIndex::Int64:
84	case TypeIndex::UInt64:
85	case TypeIndex::Decimal64:
86	return which.idx;
87	default:
88	break;
89	}
90
91	return TypeIndex::Nothing;
92	}
93
94	void transpose64x8(UInt64 * src_dst)
95	{
96	auto * src8 = reinterpret_cast<const UInt8 *>(src_dst);
97	UInt64 dst[`8`] = {};
98
99	for (UInt32 i = `0`; i < `64`; ++i)
100	{
101	UInt64 value = src8[i];
102	dst[`0`] \|= (value & `0x1`) << i;
103	dst[`1`] \|= ((value >> `1`) & `0x1`) << i;
104	dst[`2`] \|= ((value >> `2`) & `0x1`) << i;
105	dst[`3`] \|= ((value >> `3`) & `0x1`) << i;
106	dst[`4`] \|= ((value >> `4`) & `0x1`) << i;
107	dst[`5`] \|= ((value >> `5`) & `0x1`) << i;
108	dst[`6`] \|= ((value >> `6`) & `0x1`) << i;
109	dst[`7`] \|= ((value >> `7`) & `0x1`) << i;
110	}
111
112	memcpy(src_dst, dst, `8` * sizeof(UInt64));
113	}
114
115	void reverseTranspose64x8(UInt64 * src_dst)
116	{
117	UInt8 dst8[`64`];
118
119	for (UInt32 i = `0`; i < `64`; ++i)
120	{
121	dst8[i] = ((src_dst[`0`] >> i) & `0x1`)
122	\| (((src_dst[`1`] >> i) & `0x1`) << `1`)
123	\| (((src_dst[`2`] >> i) & `0x1`) << `2`)
124	\| (((src_dst[`3`] >> i) & `0x1`) << `3`)
125	\| (((src_dst[`4`] >> i) & `0x1`) << `4`)
126	\| (((src_dst[`5`] >> i) & `0x1`) << `5`)
127	\| (((src_dst[`6`] >> i) & `0x1`) << `6`)
128	\| (((src_dst[`7`] >> i) & `0x1`) << `7`);
129	}
130
131	memcpy(src_dst, dst8, `8` * sizeof(UInt64));
132	}
133
134	template <typename T>
135	void transposeBytes(T value, UInt64 * matrix, UInt32 col)
136	{
137	UInt8 * matrix8 = reinterpret_cast<UInt8 *>(matrix);
138	const UInt8 * value8 = reinterpret_cast<const UInt8 *>(&value);
139
140	if constexpr (sizeof(T) > `4`)
141	{
142	matrix8[`64` * `7` + col] = value8[`7`];
143	matrix8[`64` * `6` + col] = value8[`6`];
144	matrix8[`64` * `5` + col] = value8[`5`];
145	matrix8[`64` * `4` + col] = value8[`4`];
146	}
147
148	if constexpr (sizeof(T) > `2`)
149	{
150	matrix8[`64` * `3` + col] = value8[`3`];
151	matrix8[`64` * `2` + col] = value8[`2`];
152	}
153
154	if constexpr (sizeof(T) > `1`)
155	matrix8[`64` * `1` + col] = value8[`1`];
156
157	matrix8[`64` * `0` + col] = value8[`0`];
158	}
159
160	template <typename T>
161	void reverseTransposeBytes(const UInt64 * matrix, UInt32 col, T & value)
162	{
163	auto * matrix8 = reinterpret_cast<const UInt8 *>(matrix);
164
165	if constexpr (sizeof(T) > `4`)
166	{
167	value \|= UInt64(matrix8[`64` * `7` + col]) << (`8` * `7`);
168	value \|= UInt64(matrix8[`64` * `6` + col]) << (`8` * `6`);
169	value \|= UInt64(matrix8[`64` * `5` + col]) << (`8` * `5`);
170	value \|= UInt64(matrix8[`64` * `4` + col]) << (`8` * `4`);
171	}
172
173	if constexpr (sizeof(T) > `2`)
174	{
175	value \|= UInt32(matrix8[`64` * `3` + col]) << (`8` * `3`);
176	value \|= UInt32(matrix8[`64` * `2` + col]) << (`8` * `2`);
177	}
178
179	if constexpr (sizeof(T) > `1`)
180	value \|= UInt32(matrix8[`64` * `1` + col]) << (`8` * `1`);
181
182	value \|= UInt32(matrix8[col]);
183	}
184
185
186	template <typename T>
187	void load(const char * src, T * buf, UInt32 tail = `64`)
188	{
189	memcpy(buf, src, tail * sizeof(T));
190	}
191
192	template <typename T>
193	void store(const T * buf, char * dst, UInt32 tail = `64`)
194	{
195	memcpy(dst, buf, tail * sizeof(T));
196	}
197
198	template <typename T>
199	void clear(T * buf)
200	{
201	for (UInt32 i = `0`; i < `64`; ++i)
202	buf[i] = `0`;
203	}
204
205
206	/// UIntX[64] -> UInt64[N] transposed matrix, N <= X
207	template <typename T, bool full = false>
208	void transpose(const T * src, char * dst, UInt32 num_bits, UInt32 tail = `64`)
209	{
210	UInt32 full_bytes = num_bits / `8`;
211	UInt32 part_bits = num_bits % `8`;
212
213	UInt64 matrix[`64`] = {};
214	for (UInt32 col = `0`; col < tail; ++col)
215	transposeBytes(src[col], matrix, col);
216
217	if constexpr (full)
218	{
219	UInt64 * matrix_line = matrix;
220	for (UInt32 byte = `0`; byte < full_bytes; ++byte, matrix_line += `8`)
221	transpose64x8(matrix_line);
222	}
223
224	UInt32 full_size = sizeof(UInt64) * (num_bits - part_bits);
225	memcpy(dst, matrix, full_size);
226	dst += full_size;
227
228	/// transpose only partially filled last byte
229	if (part_bits)
230	{
231	UInt64 * matrix_line = &matrix[full_bytes * `8`];
232	transpose64x8(matrix_line);
233	memcpy(dst, matrix_line, part_bits * sizeof(UInt64));
234	}
235	}
236
237	/// UInt64[N] transposed matrix -> UIntX[64]
238	template <typename T, bool full = false>
239	void reverseTranspose(const char * src, T * buf, UInt32 num_bits, UInt32 tail = `64`)
240	{
241	UInt64 matrix[`64`] = {};
242	memcpy(matrix, src, num_bits * sizeof(UInt64));
243
244	UInt32 full_bytes = num_bits / `8`;
245	UInt32 part_bits = num_bits % `8`;
246
247	if constexpr (full)
248	{
249	UInt64 * matrix_line = matrix;
250	for (UInt32 byte = `0`; byte < full_bytes; ++byte, matrix_line += `8`)
251	reverseTranspose64x8(matrix_line);
252	}
253
254	if (part_bits)
255	{
256	UInt64 * matrix_line = &matrix[full_bytes * `8`];
257	reverseTranspose64x8(matrix_line);
258	}
259
260	clear(buf);
261	for (UInt32 col = `0`; col < tail; ++col)
262	reverseTransposeBytes(matrix, col, buf[col]);
263	}
264
265	template <typename T, typename MinMaxT = std::conditional_t<is_signed_v<T>, Int64, UInt64>>
266	void restoreUpperBits(T * buf, T upper_min, T upper_max [[maybe_unused]], T sign_bit [[maybe_unused]], UInt32 tail = `64`)
267	{
268	if constexpr (is_signed_v<T>)
269	{
270	/// Restore some data as negatives and others as positives
271	if (sign_bit)
272	{
273	for (UInt32 col = `0`; col < tail; ++col)
274	{
275	T & value = buf[col];
276
277	if (value & sign_bit)
278	value \|= upper_min;
279	else
280	value \|= upper_max;
281	}
282
283	return;
284	}
285	}
286
287	for (UInt32 col = `0`; col < tail; ++col)
288	buf[col] \|= upper_min;
289	}
290
291
292	UInt32 getValuableBitsNumber(UInt64 min, UInt64 max)
293	{
294	UInt64 diff_bits = min ^ max;
295	if (diff_bits)
296	return `64` - __builtin_clzll(diff_bits);
297	return `0`;
298	}
299
300	UInt32 getValuableBitsNumber(Int64 min, Int64 max)
301	{
302	if (min < `0` && max >= `0`)
303	{
304	if (min + max >= `0`)
305	return getValuableBitsNumber(`0ull`, UInt64(max)) + `1`;
306	else
307	return getValuableBitsNumber(`0ull`, UInt64(~min)) + `1`;
308	}
309	else
310	return getValuableBitsNumber(UInt64(min), UInt64(max));
311	}
312
313
314	template <typename T>
315	void findMinMax(const char * src, UInt32 src_size, T & min, T & max)
316	{
317	min = unalignedLoad<T>(src);
318	max = unalignedLoad<T>(src);
319
320	const char * end = src + src_size;
321	for (; src < end; src += sizeof(T))
322	{
323	auto current = unalignedLoad<T>(src);
324	if (current < min)
325	min = current;
326	if (current > max)
327	max = current;
328	}
329	}
330
331
332	using Variant = CompressionCodecT64::Variant;
333
334	template <typename T, bool full>
335	UInt32 compressData(const char * src, UInt32 bytes_size, char * dst)
336	{
337	using MinMaxType = std::conditional_t<is_signed_v<T>, Int64, UInt64>;
338
339	static constexpr const UInt32 matrix_size = `64`;
340	static constexpr const UInt32 header_size = `2` * sizeof(UInt64);
341
342	if (bytes_size % sizeof(T))
343	throw Exception ("Cannot compress, data size " + toString(bytes_size) + " is not multiplier of " + toString(sizeof(T)),
344	ErrorCodes::CANNOT_COMPRESS);
345
346	UInt32 src_size = bytes_size / sizeof(T);
347	UInt32 num_full = src_size / matrix_size;
348	UInt32 tail = src_size % matrix_size;
349
350	T min, max;
351	findMinMax<T>(src, bytes_size, min, max);
352	MinMaxType min64 = min;
353	MinMaxType max64 = max;
354
355	/// Write header
356	{
357	memcpy(dst, &min64, sizeof(MinMaxType));
358	memcpy(dst + `8`, &max64, sizeof(MinMaxType));
359	dst += header_size;
360	}
361
362	UInt32 num_bits = getValuableBitsNumber(min64, max64);
363	if (!num_bits)
364	return header_size;
365
366	T buf[matrix_size];
367	UInt32 src_shift = sizeof(T) * matrix_size;
368	UInt32 dst_shift = sizeof(UInt64) * num_bits;
369	for (UInt32 i = `0`; i < num_full; ++i)
370	{
371	load<T>(src, buf, matrix_size);
372	transpose<T, full>(buf, dst, num_bits);
373	src += src_shift;
374	dst += dst_shift;
375	}
376
377	UInt32 dst_bytes = num_full * dst_shift;
378
379	if (tail)
380	{
381	load<T>(src, buf, tail);
382	transpose<T, full>(buf, dst, num_bits, tail);
383	dst_bytes += dst_shift;
384	}
385
386	return header_size + dst_bytes;
387	}
388
389	template <typename T, bool full>
390	void decompressData(const char * src, UInt32 bytes_size, char * dst, UInt32 uncompressed_size)
391	{
392	using MinMaxType = std::conditional_t<is_signed_v<T>, Int64, UInt64>;
393
394	static constexpr const UInt32 matrix_size = `64`;
395	static constexpr const UInt32 header_size = `2` * sizeof(UInt64);
396
397	if (bytes_size < header_size)
398	throw Exception ("Cannot decompress, data size " + toString(bytes_size) + " is less then T64 header",
399	ErrorCodes::CANNOT_DECOMPRESS);
400
401	if (uncompressed_size % sizeof(T))
402	throw Exception ("Cannot decompress, unexpected uncompressed size " + toString(uncompressed_size),
403	ErrorCodes::CANNOT_DECOMPRESS);
404
405	UInt64 num_elements = uncompressed_size / sizeof(T);
406	MinMaxType min;
407	MinMaxType max;
408
409	/// Read header
410	{
411	memcpy(&min, src, sizeof(MinMaxType));
412	memcpy(&max, src + `8`, sizeof(MinMaxType));
413	src += header_size;
414	bytes_size -= header_size;
415	}
416
417	UInt32 num_bits = getValuableBitsNumber(min, max);
418	if (!num_bits)
419	{
420	T min_value = min;
421	for (UInt32 i = `0`; i < num_elements; ++i, dst += sizeof(T))
422	unalignedStore<T>(dst, min_value);
423	return;
424	}
425
426	UInt32 src_shift = sizeof(UInt64) * num_bits;
427	UInt32 dst_shift = sizeof(T) * matrix_size;
428
429	if (!bytes_size \|\| bytes_size % src_shift)
430	throw Exception ("Cannot decompress, data size " + toString(bytes_size) + " is not multiplier of " + toString(src_shift),
431	ErrorCodes::CANNOT_DECOMPRESS);
432
433	UInt32 num_full = bytes_size / src_shift;
434	UInt32 tail = num_elements % matrix_size;
435	if (tail)
436	--num_full;
437
438	T upper_min = `0`;
439	T upper_max [[maybe_unused]] = `0`;
440	T sign_bit [[maybe_unused]] = `0`;
441	if (num_bits < `64`)
442	upper_min = UInt64(min) >> num_bits << num_bits;
443
444	if constexpr (is_signed_v<T>)
445	{
446	if (min < `0` && max >= `0` && num_bits < `64`)
447	{
448	sign_bit = `1ull` << (num_bits - `1`);
449	upper_max = UInt64(max) >> num_bits << num_bits;
450	}
451	}
452
453	T buf[matrix_size];
454	for (UInt32 i = `0`; i < num_full; ++i)
455	{
456	reverseTranspose<T, full>(src, buf, num_bits);
457	restoreUpperBits(buf, upper_min, upper_max, sign_bit);
458	store<T>(buf, dst, matrix_size);
459	src += src_shift;
460	dst += dst_shift;
461	}
462
463	if (tail)
464	{
465	reverseTranspose<T, full>(src, buf, num_bits, tail);
466	restoreUpperBits(buf, upper_min, upper_max, sign_bit, tail);
467	store<T>(buf, dst, tail);
468	}
469	}
470
471	template <typename T>
472	UInt32 compressData(const char * src, UInt32 src_size, char * dst, Variant variant)
473	{
474	if (variant == Variant::Bit)
475	return compressData<T, true>(src, src_size, dst);
476	return compressData<T, false>(src, src_size, dst);
477	}
478
479	template <typename T>
480	void decompressData(const char * src, UInt32 src_size, char * dst, UInt32 uncompressed_size, Variant variant)
481	{
482	if (variant == Variant::Bit)
483	decompressData<T, true>(src, src_size, dst, uncompressed_size);
484	else
485	decompressData<T, false>(src, src_size, dst, uncompressed_size);
486	}
487
488	}
489
490
491	UInt32 CompressionCodecT64::doCompressData(const char * src, UInt32 src_size, char * dst) const
492	{
493	UInt8 cookie = static_cast<UInt8>(type_idx) \| (static_cast<UInt8>(variant) << `7`);
494	memcpy(dst, &cookie, `1`);
495	dst += `1`;
496
497	switch (baseType(type_idx))
498	{
499	case TypeIndex::Int8:
500	return `1` + compressData<Int8>(src, src_size, dst, variant);
501	case TypeIndex::Int16:
502	return `1` + compressData<Int16>(src, src_size, dst, variant);
503	case TypeIndex::Int32:
504	return `1` + compressData<Int32>(src, src_size, dst, variant);
505	case TypeIndex::Int64:
506	return `1` + compressData<Int64>(src, src_size, dst, variant);
507	case TypeIndex::UInt8:
508	return `1` + compressData<UInt8>(src, src_size, dst, variant);
509	case TypeIndex::UInt16:
510	return `1` + compressData<UInt16>(src, src_size, dst, variant);
511	case TypeIndex::UInt32:
512	return `1` + compressData<UInt32>(src, src_size, dst, variant);
513	case TypeIndex::UInt64:
514	return `1` + compressData<UInt64>(src, src_size, dst, variant);
515	default:
516	break;
517	}
518
519	throw Exception ("Connot compress with T64", ErrorCodes::CANNOT_COMPRESS);
520	}
521
522	void CompressionCodecT64::doDecompressData(const char * src, UInt32 src_size, char * dst, UInt32 uncompressed_size) const
523	{
524	if (!src_size)
525	throw Exception ("Connot decompress with T64", ErrorCodes::CANNOT_DECOMPRESS);
526
527	UInt8 cookie = unalignedLoad<UInt8>(src);
528	src += `1`;
529	src_size -= `1`;
530
531	auto saved_variant = static_cast<Variant>(cookie >> `7`);
532	auto saved_type_id = static_cast<TypeIndex>(cookie & `0x7F`);
533
534	switch (baseType(saved_type_id))
535	{
536	case TypeIndex::Int8:
537	return decompressData<Int8>(src, src_size, dst, uncompressed_size, saved_variant);
538	case TypeIndex::Int16:
539	return decompressData<Int16>(src, src_size, dst, uncompressed_size, saved_variant);
540	case TypeIndex::Int32:
541	return decompressData<Int32>(src, src_size, dst, uncompressed_size, saved_variant);
542	case TypeIndex::Int64:
543	return decompressData<Int64>(src, src_size, dst, uncompressed_size, saved_variant);
544	case TypeIndex::UInt8:
545	return decompressData<UInt8>(src, src_size, dst, uncompressed_size, saved_variant);
546	case TypeIndex::UInt16:
547	return decompressData<UInt16>(src, src_size, dst, uncompressed_size, saved_variant);
548	case TypeIndex::UInt32:
549	return decompressData<UInt32>(src, src_size, dst, uncompressed_size, saved_variant);
550	case TypeIndex::UInt64:
551	return decompressData<UInt64>(src, src_size, dst, uncompressed_size, saved_variant);
552	default:
553	break;
554	}
555
556	throw Exception ("Connot decompress with T64", ErrorCodes::CANNOT_DECOMPRESS);
557	}
558
559	void CompressionCodecT64::useInfoAboutType(DataTypePtr data_type)
560	{
561	if (data_type)
562	{
563	type_idx = typeIdx(data_type);
564	if (type_idx == TypeIndex::Nothing)
565	throw Exception ("T64 codec is not supported for specified type", ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
566	}
567	}
568
569	UInt8 CompressionCodecT64::getMethodByte() const
570	{
571	return codecId();
572	}
573
574	void registerCodecT64(CompressionCodecFactory & factory)
575	{
576	auto reg_func = [&](const ASTPtr & arguments, DataTypePtr type) -> CompressionCodecPtr
577	{
578	Variant variant = Variant::Byte;
579
580	if (arguments && !arguments ->children.empty())
581	{
582	if (arguments ->children.size() > `1`)
583	throw Exception ("T64 support zero or one parameter, given " + std::to_string(arguments ->children.size()),
584	ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
585
586	const auto children = arguments ->children;
587	const auto * literal = children [`0`]->as<ASTLiteral>();
588	if (!literal)
589	throw Exception ("Wrong modification for T64. Expected: 'bit', 'byte')",
590	ErrorCodes::ILLEGAL_CODEC_PARAMETER);
591	String name = literal->value.safeGet<String>();
592
593	if (name == "byte")
594	variant = Variant::Byte;
595	else if (name == "bit")
596	variant = Variant::Bit;
597	else
598	throw Exception ("Wrong modification for T64: " + name, ErrorCodes::ILLEGAL_CODEC_PARAMETER);
599	}
600
601	auto type_idx = typeIdx(type);
602	if (type && type_idx == TypeIndex::Nothing)
603	throw Exception ("T64 codec is not supported for specified type", ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE);
604	return std::make_shared<CompressionCodecT64>(type_idx, variant);
605	};
606
607	factory.registerCompressionCodecWithType("T64", codecId(), reg_func);
608	}
609	}
610

Browse the source code of ClickHouse/dbms/src/Compression/CompressionCodecT64.cpp