1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <cstdint>
21#include <cstring>
22#include <memory>
23#include <vector>
24
25#include "parquet/exception.h"
26#include "parquet/platform.h"
27#include "parquet/types.h"
28
// Forward declarations of the Arrow types that the interfaces in this header
// name in their signatures and members.
namespace arrow {

namespace internal {

class ChunkedBinaryBuilder;

}  // namespace internal

class Array;
class ArrayBuilder;
class BinaryArray;
class BinaryBuilder;
class BinaryDictionary32Builder;

}  // namespace arrow
43
44namespace parquet {
45
// Forward declaration: this header only refers to ColumnDescriptor through
// pointers in the factory function signatures.
class ColumnDescriptor;
47
48// Untyped base for all encoders
49class Encoder {
50 public:
51 virtual ~Encoder() = default;
52
53 virtual int64_t EstimatedDataEncodedSize() = 0;
54 virtual std::shared_ptr<Buffer> FlushValues() = 0;
55 virtual Encoding::type encoding() const = 0;
56
57 virtual void Put(const ::arrow::Array& values) = 0;
58
59 virtual MemoryPool* memory_pool() const = 0;
60};
61
62// Base class for value encoders. Since encoders may or not have state (e.g.,
63// dictionary encoding) we use a class instance to maintain any state.
64//
65// TODO(wesm): Encode interface API is temporary
66template <typename DType>
67class TypedEncoder : virtual public Encoder {
68 public:
69 typedef typename DType::c_type T;
70
71 using Encoder::Put;
72
73 virtual void Put(const T* src, int num_values) = 0;
74
75 virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
76 int64_t valid_bits_offset) = 0;
77};
78
79// Base class for dictionary encoders
80template <typename DType>
81class DictEncoder : virtual public TypedEncoder<DType> {
82 public:
83 /// Writes out any buffered indices to buffer preceded by the bit width of this data.
84 /// Returns the number of bytes written.
85 /// If the supplied buffer is not big enough, returns -1.
86 /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
87 /// to size buffer.
88 virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
89
90 virtual int dict_encoded_size() = 0;
91 // virtual int dict_encoded_size() { return dict_encoded_size_; }
92
93 virtual int bit_width() const = 0;
94
95 /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
96 /// dict_encoded_size() bytes.
97 virtual void WriteDict(uint8_t* buffer) = 0;
98
99 virtual int num_entries() const = 0;
100
101 /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
102 /// assumed (without any boundschecking) that the indices reference
103 /// pre-existing dictionary values
104 /// \param[in] indices the dictionary index values. Only Int32Array currently
105 /// supported
106 virtual void PutIndices(const ::arrow::Array& indices) = 0;
107
108 /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
109 /// separately. Currently throws exception if the current dictionary memo is
110 /// non-empty
111 /// \param[in] values the dictionary values. Only valid for certain
112 /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
113 virtual void PutDictionary(const ::arrow::Array& values) = 0;
114};
115
116// ----------------------------------------------------------------------
117// Value decoding
118
119class Decoder {
120 public:
121 virtual ~Decoder() = default;
122
123 // Sets the data for a new page. This will be called multiple times on the same
124 // decoder and should reset all internal state.
125 virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
126
127 // Returns the number of values left (for the last call to SetData()). This is
128 // the number of values left in this page.
129 virtual int values_left() const = 0;
130 virtual Encoding::type encoding() const = 0;
131};
132
133template <typename DType>
134class TypedDecoder : virtual public Decoder {
135 public:
136 using T = typename DType::c_type;
137
138 // Subclasses should override the ones they support. In each of these functions,
139 // the decoder would decode put to 'max_values', storing the result in 'buffer'.
140 // The function returns the number of values decoded, which should be max_values
141 // except for end of the current data page.
142 virtual int Decode(T* buffer, int max_values) = 0;
143
144 // Decode the values in this data page but leave spaces for null entries.
145 //
146 // num_values is the size of the def_levels and buffer arrays including the number of
147 // null values.
148 virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
149 const uint8_t* valid_bits, int64_t valid_bits_offset) {
150 int values_to_read = num_values - null_count;
151 int values_read = Decode(buffer, values_to_read);
152 if (values_read != values_to_read) {
153 throw ParquetException("Number of values / definition_levels read did not match");
154 }
155
156 // Depending on the number of nulls, some of the value slots in buffer may
157 // be uninitialized, and this will cause valgrind warnings / potentially UB
158 memset(static_cast<void*>(buffer + values_read), 0,
159 (num_values - values_read) * sizeof(T));
160
161 // Add spacing for null entries. As we have filled the buffer from the front,
162 // we need to add the spacing from the back.
163 int values_to_move = values_read;
164 for (int i = num_values - 1; i >= 0; i--) {
165 if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
166 buffer[i] = buffer[--values_to_move];
167 }
168 }
169 return num_values;
170 }
171};
172
173template <typename DType>
174class DictDecoder : virtual public TypedDecoder<DType> {
175 public:
176 virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
177
178 /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
179 /// but do not append any indices
180 virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
181
182 /// \brief Decode only dictionary indices and append to dictionary
183 /// builder. The builder must have had the dictionary from this decoder
184 /// inserted already.
185 ///
186 /// Remember to reset the builder each time the dict decoder is initialized
187 /// with a new dictionary page
188 virtual int DecodeIndicesSpaced(int num_values, int null_count,
189 const uint8_t* valid_bits, int64_t valid_bits_offset,
190 ::arrow::ArrayBuilder* builder) = 0;
191
192 /// \brief Decode only dictionary indices (no nulls)
193 ///
194 /// Remember to reset the builder each time the dict decoder is initialized
195 /// with a new dictionary page
196 virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
197};
198
199// ----------------------------------------------------------------------
200// TypedEncoder specializations, traits, and factory functions
201
202class BooleanEncoder : virtual public TypedEncoder<BooleanType> {
203 public:
204 using TypedEncoder<BooleanType>::Put;
205 virtual void Put(const std::vector<bool>& src, int num_values) = 0;
206};
207
208using Int32Encoder = TypedEncoder<Int32Type>;
209using Int64Encoder = TypedEncoder<Int64Type>;
210using Int96Encoder = TypedEncoder<Int96Type>;
211using FloatEncoder = TypedEncoder<FloatType>;
212using DoubleEncoder = TypedEncoder<DoubleType>;
213using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
214using FLBAEncoder = TypedEncoder<FLBAType>;
215
216class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
217 public:
218 using TypedDecoder<BooleanType>::Decode;
219 virtual int Decode(uint8_t* buffer, int max_values) = 0;
220};
221
222using Int32Decoder = TypedDecoder<Int32Type>;
223using Int64Decoder = TypedDecoder<Int64Type>;
224using Int96Decoder = TypedDecoder<Int96Type>;
225using FloatDecoder = TypedDecoder<FloatType>;
226using DoubleDecoder = TypedDecoder<DoubleType>;
227
228/// \brief Internal helper class for decoding BYTE_ARRAY data where we can
229/// overflow the capacity of a single arrow::BinaryArray
230struct ArrowBinaryAccumulator {
231 std::unique_ptr<::arrow::BinaryBuilder> builder;
232 std::vector<std::shared_ptr<::arrow::Array>> chunks;
233};
234
235class ByteArrayDecoder : virtual public TypedDecoder<ByteArrayType> {
236 public:
237 using TypedDecoder<ByteArrayType>::DecodeSpaced;
238
239 /// \brief Returns number of encoded values decoded
240 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
241 int64_t valid_bits_offset,
242 ::arrow::BinaryDictionary32Builder* builder) = 0;
243
244 virtual int DecodeArrowNonNull(int num_values,
245 ::arrow::BinaryDictionary32Builder* builder) = 0;
246
247 /// \brief Returns number of encoded values decoded
248 virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
249 int64_t valid_bits_offset, ArrowBinaryAccumulator* out) = 0;
250
251 virtual int DecodeArrowNonNull(int num_values, ArrowBinaryAccumulator* out) = 0;
252};
253
254class FLBADecoder : virtual public TypedDecoder<FLBAType> {
255 public:
256 using TypedDecoder<FLBAType>::DecodeSpaced;
257
258 // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
259 // there is value in adding specialized read methods for
260 // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
261 // then perhaps not
262};
263
264template <typename T>
265struct EncodingTraits {};
266
267template <>
268struct EncodingTraits<BooleanType> {
269 using Encoder = BooleanEncoder;
270 using Decoder = BooleanDecoder;
271};
272
273template <>
274struct EncodingTraits<Int32Type> {
275 using Encoder = Int32Encoder;
276 using Decoder = Int32Decoder;
277};
278
279template <>
280struct EncodingTraits<Int64Type> {
281 using Encoder = Int64Encoder;
282 using Decoder = Int64Decoder;
283};
284
285template <>
286struct EncodingTraits<Int96Type> {
287 using Encoder = Int96Encoder;
288 using Decoder = Int96Decoder;
289};
290
291template <>
292struct EncodingTraits<FloatType> {
293 using Encoder = FloatEncoder;
294 using Decoder = FloatDecoder;
295};
296
297template <>
298struct EncodingTraits<DoubleType> {
299 using Encoder = DoubleEncoder;
300 using Decoder = DoubleDecoder;
301};
302
303template <>
304struct EncodingTraits<ByteArrayType> {
305 using Encoder = ByteArrayEncoder;
306 using Decoder = ByteArrayDecoder;
307};
308
309template <>
310struct EncodingTraits<FLBAType> {
311 using Encoder = FLBAEncoder;
312 using Decoder = FLBADecoder;
313};
314
315PARQUET_EXPORT
316std::unique_ptr<Encoder> MakeEncoder(
317 Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
318 const ColumnDescriptor* descr = NULLPTR,
319 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
320
321template <typename DType>
322std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
323 Encoding::type encoding, bool use_dictionary = false,
324 const ColumnDescriptor* descr = NULLPTR,
325 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
326 using OutType = typename EncodingTraits<DType>::Encoder;
327 std::unique_ptr<Encoder> base =
328 MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
329 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
330}
331
332PARQUET_EXPORT
333std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
334 const ColumnDescriptor* descr = NULLPTR);
335
336namespace detail {
337
338PARQUET_EXPORT
339std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
340 const ColumnDescriptor* descr,
341 ::arrow::MemoryPool* pool);
342
343} // namespace detail
344
345template <typename DType>
346std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
347 const ColumnDescriptor* descr = NULLPTR,
348 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
349 using OutType = DictDecoder<DType>;
350 auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
351 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
352}
353
354template <typename DType>
355std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
356 Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
357 using OutType = typename EncodingTraits<DType>::Decoder;
358 std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
359 return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
360}
361
362} // namespace parquet
363