encoding.h source code [ClickHouse/contrib/arrow/cpp/src/parquet/encoding.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#pragma once
19
20	#include <cstdint>
21	#include <cstring>
22	#include <memory>
23	#include <vector>
24
25	#include "parquet/exception.h"
26	#include "parquet/platform.h"
27	#include "parquet/types.h"
28
29	namespace arrow {
30
31	class Array;
32	class ArrayBuilder;
33	class BinaryArray;
34	class BinaryBuilder;
35	class BinaryDictionary32Builder;
36
37	namespace internal {
38
39	class ChunkedBinaryBuilder;
40
41	} // namespace internal
42	} // namespace arrow
43
44	namespace parquet {
45
46	class ColumnDescriptor;
47
48	// Untyped base for all encoders
49	class Encoder {
50	public:
51	virtual ~Encoder() = default;
52
53	virtual int64_t EstimatedDataEncodedSize() = `0`;
54	virtual std::shared_ptr<Buffer> FlushValues() = `0`;
55	virtual Encoding::type encoding() const = `0`;
56
57	virtual void Put(const ::arrow::Array& values) = `0`;
58
59	virtual MemoryPool* memory_pool() const = `0`;
60	};
61
62	// Base class for value encoders. Since encoders may or not have state (e.g.,
63	// dictionary encoding) we use a class instance to maintain any state.
64	//
65	// TODO(wesm): Encode interface API is temporary
66	template <typename DType>
67	class TypedEncoder : virtual public Encoder {
68	public:
69	typedef typename DType::c_type T;
70
71	using Encoder::Put;
72
73	virtual void Put(const T* src, int num_values) = `0`;
74
75	virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
76	int64_t valid_bits_offset) = `0`;
77	};
78
79	// Base class for dictionary encoders
80	template <typename DType>
81	class DictEncoder : virtual public TypedEncoder<DType> {
82	public:
83	/// Writes out any buffered indices to buffer preceded by the bit width of this data.
84	/// Returns the number of bytes written.
85	/// If the supplied buffer is not big enough, returns -1.
86	/// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
87	/// to size buffer.
88	virtual int WriteIndices(uint8_t* buffer, int buffer_len) = `0`;
89
90	virtual int dict_encoded_size() = `0`;
91	// virtual int dict_encoded_size() { return dict_encoded_size_; }
92
93	virtual int bit_width() const = `0`;
94
95	/// Writes out the encoded dictionary to buffer. buffer must be preallocated to
96	/// dict_encoded_size() bytes.
97	virtual void WriteDict(uint8_t* buffer) = `0`;
98
99	virtual int num_entries() const = `0`;
100
101	/// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
102	/// assumed (without any boundschecking) that the indices reference
103	/// pre-existing dictionary values
104	/// \param[in] indices the dictionary index values. Only Int32Array currently
105	/// supported
106	virtual void PutIndices(const ::arrow::Array& indices) = `0`;
107
108	/// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
109	/// separately. Currently throws exception if the current dictionary memo is
110	/// non-empty
111	/// \param[in] values the dictionary values. Only valid for certain
112	/// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
113	virtual void PutDictionary(const ::arrow::Array& values) = `0`;
114	};
115
116	// ----------------------------------------------------------------------
117	// Value decoding
118
119	class Decoder {
120	public:
121	virtual ~Decoder() = default;
122
123	// Sets the data for a new page. This will be called multiple times on the same
124	// decoder and should reset all internal state.
125	virtual void SetData(int num_values, const uint8_t* data, int len) = `0`;
126
127	// Returns the number of values left (for the last call to SetData()). This is
128	// the number of values left in this page.
129	virtual int values_left() const = `0`;
130	virtual Encoding::type encoding() const = `0`;
131	};
132
133	template <typename DType>
134	class TypedDecoder : virtual public Decoder {
135	public:
136	using T = typename DType::c_type;
137
138	// Subclasses should override the ones they support. In each of these functions,
139	// the decoder would decode put to 'max_values', storing the result in 'buffer'.
140	// The function returns the number of values decoded, which should be max_values
141	// except for end of the current data page.
142	virtual int Decode(T* buffer, int max_values) = `0`;
143
144	// Decode the values in this data page but leave spaces for null entries.
145	//
146	// num_values is the size of the def_levels and buffer arrays including the number of
147	// null values.
148	virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
149	const uint8_t* valid_bits, int64_t valid_bits_offset) {
150	int values_to_read = num_values - null_count;
151	int values_read = Decode(buffer, values_to_read);
152	if (values_read != values_to_read) {
153	throw ParquetException ("Number of values / definition_levels read did not match");
154	}
155
156	// Depending on the number of nulls, some of the value slots in buffer may
157	// be uninitialized, and this will cause valgrind warnings / potentially UB
158	memset(static_cast<void*>(buffer + values_read), `0`,
159	(num_values - values_read) * sizeof(T));
160
161	// Add spacing for null entries. As we have filled the buffer from the front,
162	// we need to add the spacing from the back.
163	int values_to_move = values_read;
164	for (int i = num_values - `1`; i >= `0`; i--) {
165	if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
166	buffer[i] = buffer[--values_to_move];
167	}
168	}
169	return num_values;
170	}
171	};
172
173	template <typename DType>
174	class DictDecoder : virtual public TypedDecoder<DType> {
175	public:
176	virtual void SetDict(TypedDecoder<DType>* dictionary) = `0`;
177
178	/// \brief Insert dictionary values into the Arrow dictionary builder's memo,
179	/// but do not append any indices
180	virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = `0`;
181
182	/// \brief Decode only dictionary indices and append to dictionary
183	/// builder. The builder must have had the dictionary from this decoder
184	/// inserted already.
185	///
186	/// Remember to reset the builder each time the dict decoder is initialized
187	/// with a new dictionary page
188	virtual int DecodeIndicesSpaced(int num_values, int null_count,
189	const uint8_t* valid_bits, int64_t valid_bits_offset,
190	::arrow::ArrayBuilder* builder) = `0`;
191
192	/// \brief Decode only dictionary indices (no nulls)
193	///
194	/// Remember to reset the builder each time the dict decoder is initialized
195	/// with a new dictionary page
196	virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = `0`;
197	};
198
199	// ----------------------------------------------------------------------
200	// TypedEncoder specializations, traits, and factory functions
201
202	class BooleanEncoder : virtual public TypedEncoder<BooleanType> {
203	public:
204	using TypedEncoder<BooleanType>::Put;
205	virtual void Put(const std::vector<bool>& src, int num_values) = `0`;
206	};
207
208	using Int32Encoder = TypedEncoder<Int32Type>;
209	using Int64Encoder = TypedEncoder<Int64Type>;
210	using Int96Encoder = TypedEncoder<Int96Type>;
211	using FloatEncoder = TypedEncoder<FloatType>;
212	using DoubleEncoder = TypedEncoder<DoubleType>;
213	using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
214	using FLBAEncoder = TypedEncoder<FLBAType>;
215
216	class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
217	public:
218	using TypedDecoder<BooleanType>::Decode;
219	virtual int Decode(uint8_t* buffer, int max_values) = `0`;
220	};
221
222	using Int32Decoder = TypedDecoder<Int32Type>;
223	using Int64Decoder = TypedDecoder<Int64Type>;
224	using Int96Decoder = TypedDecoder<Int96Type>;
225	using FloatDecoder = TypedDecoder<FloatType>;
226	using DoubleDecoder = TypedDecoder<DoubleType>;
227
228	/// \brief Internal helper class for decoding BYTE_ARRAY data where we can
229	/// overflow the capacity of a single arrow::BinaryArray
230	struct ArrowBinaryAccumulator {
231	std::unique_ptr<::arrow::BinaryBuilder> builder;
232	std::vector<std::shared_ptr<::arrow::Array>> chunks;
233	};
234
235	class ByteArrayDecoder : virtual public TypedDecoder<ByteArrayType> {
236	public:
237	using TypedDecoder<ByteArrayType>::DecodeSpaced;
238
239	/// \brief Returns number of encoded values decoded
240	virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
241	int64_t valid_bits_offset,
242	::arrow::BinaryDictionary32Builder* builder) = `0`;
243
244	virtual int DecodeArrowNonNull(int num_values,
245	::arrow::BinaryDictionary32Builder* builder) = `0`;
246
247	/// \brief Returns number of encoded values decoded
248	virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
249	int64_t valid_bits_offset, ArrowBinaryAccumulator* out) = `0`;
250
251	virtual int DecodeArrowNonNull(int num_values, ArrowBinaryAccumulator* out) = `0`;
252	};
253
254	class FLBADecoder : virtual public TypedDecoder<FLBAType> {
255	public:
256	using TypedDecoder<FLBAType>::DecodeSpaced;
257
258	// TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
259	// there is value in adding specialized read methods for
260	// FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
261	// then perhaps not
262	};
263
264	template <typename T>
265	struct EncodingTraits {};
266
267	template <>
268	struct EncodingTraits<BooleanType> {
269	using Encoder = BooleanEncoder;
270	using Decoder = BooleanDecoder;
271	};
272
273	template <>
274	struct EncodingTraits<Int32Type> {
275	using Encoder = Int32Encoder;
276	using Decoder = Int32Decoder;
277	};
278
279	template <>
280	struct EncodingTraits<Int64Type> {
281	using Encoder = Int64Encoder;
282	using Decoder = Int64Decoder;
283	};
284
285	template <>
286	struct EncodingTraits<Int96Type> {
287	using Encoder = Int96Encoder;
288	using Decoder = Int96Decoder;
289	};
290
291	template <>
292	struct EncodingTraits<FloatType> {
293	using Encoder = FloatEncoder;
294	using Decoder = FloatDecoder;
295	};
296
297	template <>
298	struct EncodingTraits<DoubleType> {
299	using Encoder = DoubleEncoder;
300	using Decoder = DoubleDecoder;
301	};
302
303	template <>
304	struct EncodingTraits<ByteArrayType> {
305	using Encoder = ByteArrayEncoder;
306	using Decoder = ByteArrayDecoder;
307	};
308
309	template <>
310	struct EncodingTraits<FLBAType> {
311	using Encoder = FLBAEncoder;
312	using Decoder = FLBADecoder;
313	};
314
315	PARQUET_EXPORT
316	std::unique_ptr<Encoder> MakeEncoder(
317	Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
318	const ColumnDescriptor* descr = NULLPTR,
319	::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
320
321	template <typename DType>
322	std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
323	Encoding::type encoding, bool use_dictionary = false,
324	const ColumnDescriptor* descr = NULLPTR,
325	::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
326	using OutType = typename EncodingTraits<DType>::Encoder;
327	std::unique_ptr<Encoder> base =
328	MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
329	return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
330	}
331
332	PARQUET_EXPORT
333	std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
334	const ColumnDescriptor* descr = NULLPTR);
335
336	namespace detail {
337
338	PARQUET_EXPORT
339	std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
340	const ColumnDescriptor* descr,
341	::arrow::MemoryPool* pool);
342
343	} // namespace detail
344
345	template <typename DType>
346	std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
347	const ColumnDescriptor* descr = NULLPTR,
348	::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
349	using OutType = DictDecoder<DType>;
350	auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
351	return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
352	}
353
354	template <typename DType>
355	std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
356	Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
357	using OutType = typename EncodingTraits<DType>::Decoder;
358	std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
359	return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
360	}
361
362	} // namespace parquet
363

Browse the source code of ClickHouse/contrib/arrow/cpp/src/parquet/encoding.h