1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | #include <memory> |
23 | |
24 | #include "parquet/exception.h" |
25 | #include "parquet/platform.h" |
26 | #include "parquet/types.h" |
27 | |
28 | namespace arrow { |
29 | |
30 | class Array; |
31 | |
32 | namespace BitUtil { |
33 | class BitWriter; |
34 | } // namespace BitUtil |
35 | |
36 | namespace util { |
37 | class RleEncoder; |
38 | } // namespace util |
39 | |
40 | } // namespace arrow |
41 | |
42 | namespace parquet { |
43 | |
44 | struct ArrowWriteContext; |
45 | class ColumnDescriptor; |
46 | class CompressedDataPage; |
47 | class DictionaryPage; |
48 | class ColumnChunkMetaDataBuilder; |
49 | class WriterProperties; |
50 | |
51 | class PARQUET_EXPORT LevelEncoder { |
52 | public: |
53 | LevelEncoder(); |
54 | ~LevelEncoder(); |
55 | |
56 | static int MaxBufferSize(Encoding::type encoding, int16_t max_level, |
57 | int num_buffered_values); |
58 | |
59 | // Initialize the LevelEncoder. |
60 | void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values, |
61 | uint8_t* data, int data_size); |
62 | |
63 | // Encodes a batch of levels from an array and returns the number of levels encoded |
64 | int Encode(int batch_size, const int16_t* levels); |
65 | |
66 | int32_t len() { |
67 | if (encoding_ != Encoding::RLE) { |
68 | throw ParquetException("Only implemented for RLE encoding" ); |
69 | } |
70 | return rle_length_; |
71 | } |
72 | |
73 | private: |
74 | int bit_width_; |
75 | int rle_length_; |
76 | Encoding::type encoding_; |
77 | std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_; |
78 | std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_; |
79 | }; |
80 | |
81 | class PARQUET_EXPORT PageWriter { |
82 | public: |
83 | virtual ~PageWriter() {} |
84 | |
85 | static std::unique_ptr<PageWriter> Open( |
86 | const std::shared_ptr<ArrowOutputStream>& sink, Compression::type codec, |
87 | int compression_level, ColumnChunkMetaDataBuilder* metadata, |
88 | ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), |
89 | bool buffered_row_group = false); |
90 | |
91 | // The Column Writer decides if dictionary encoding is used if set and |
92 | // if the dictionary encoding has fallen back to default encoding on reaching dictionary |
93 | // page limit |
94 | virtual void Close(bool has_dictionary, bool fallback) = 0; |
95 | |
96 | virtual int64_t WriteDataPage(const CompressedDataPage& page) = 0; |
97 | |
98 | virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0; |
99 | |
100 | virtual bool has_compressor() = 0; |
101 | |
102 | virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0; |
103 | }; |
104 | |
105 | static constexpr int WRITE_BATCH_SIZE = 1000; |
106 | class PARQUET_EXPORT ColumnWriter { |
107 | public: |
108 | virtual ~ColumnWriter() = default; |
109 | |
110 | static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*, |
111 | std::unique_ptr<PageWriter>, |
112 | const WriterProperties* properties); |
113 | |
114 | /// \brief Closes the ColumnWriter, commits any buffered values to pages. |
115 | /// \return Total size of the column in bytes |
116 | virtual int64_t Close() = 0; |
117 | |
118 | /// \brief The physical Parquet type of the column |
119 | virtual Type::type type() const = 0; |
120 | |
121 | /// \brief The schema for the column |
122 | virtual const ColumnDescriptor* descr() const = 0; |
123 | |
124 | /// \brief The number of rows written so far |
125 | virtual int64_t rows_written() const = 0; |
126 | |
127 | /// \brief The total size of the compressed pages + page headers. Some values |
128 | /// might be still buffered an not written to a page yet |
129 | virtual int64_t total_compressed_bytes() const = 0; |
130 | |
131 | /// \brief The total number of bytes written as serialized data and |
132 | /// dictionary pages to the ColumnChunk so far |
133 | virtual int64_t total_bytes_written() const = 0; |
134 | |
135 | /// \brief The file-level writer properties |
136 | virtual const WriterProperties* properties() = 0; |
137 | |
138 | /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns |
139 | /// error status if the array data type is not compatible with the concrete |
140 | /// writer type |
141 | virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, |
142 | int64_t num_levels, const ::arrow::Array& array, |
143 | ArrowWriteContext* ctx) = 0; |
144 | }; |
145 | |
146 | // API to write values to a single column. This is the main client facing API. |
147 | template <typename DType> |
148 | class TypedColumnWriter : public ColumnWriter { |
149 | public: |
150 | using T = typename DType::c_type; |
151 | |
152 | // Write a batch of repetition levels, definition levels, and values to the |
153 | // column. |
154 | virtual void WriteBatch(int64_t num_values, const int16_t* def_levels, |
155 | const int16_t* rep_levels, const T* values) = 0; |
156 | |
157 | /// Write a batch of repetition levels, definition levels, and values to the |
158 | /// column. |
159 | /// |
160 | /// In comparision to WriteBatch the length of repetition and definition levels |
161 | /// is the same as of the number of values read for max_definition_level == 1. |
162 | /// In the case of max_definition_level > 1, the repetition and definition |
163 | /// levels are larger than the values but the values include the null entries |
164 | /// with definition_level == (max_definition_level - 1). Thus we have to differentiate |
165 | /// in the parameters of this function if the input has the length of num_values or the |
166 | /// _number of rows in the lowest nesting level_. |
167 | /// |
168 | /// In the case that the most inner node in the Parquet is required, the _number of rows |
169 | /// in the lowest nesting level_ is equal to the number of non-null values. If the |
170 | /// inner-most schema node is optional, the _number of rows in the lowest nesting level_ |
171 | /// also includes all values with definition_level == (max_definition_level - 1). |
172 | /// |
173 | /// @param num_values number of levels to write. |
174 | /// @param def_levels The Parquet definiton levels, length is num_values |
175 | /// @param rep_levels The Parquet repetition levels, length is num_values |
176 | /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting |
177 | /// level. The length is number of rows in the lowest nesting level. |
178 | /// @param valid_bits_offset The offset in bits of the valid_bits where the |
179 | /// first relevant bit resides. |
180 | /// @param values The values in the lowest nested level including |
181 | /// spacing for nulls on the lowest levels; input has the length |
182 | /// of the number of rows on the lowest nesting level. |
183 | virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, |
184 | const int16_t* rep_levels, const uint8_t* valid_bits, |
185 | int64_t valid_bits_offset, const T* values) = 0; |
186 | |
187 | // Estimated size of the values that are not written to a page yet |
188 | virtual int64_t EstimatedBufferedValueBytes() const = 0; |
189 | }; |
190 | |
191 | using BoolWriter = TypedColumnWriter<BooleanType>; |
192 | using Int32Writer = TypedColumnWriter<Int32Type>; |
193 | using Int64Writer = TypedColumnWriter<Int64Type>; |
194 | using Int96Writer = TypedColumnWriter<Int96Type>; |
195 | using FloatWriter = TypedColumnWriter<FloatType>; |
196 | using DoubleWriter = TypedColumnWriter<DoubleType>; |
197 | using ByteArrayWriter = TypedColumnWriter<ByteArrayType>; |
198 | using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>; |
199 | |
200 | namespace internal { |
201 | |
202 | /** |
203 | * Timestamp conversion constants |
204 | */ |
205 | constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); |
206 | |
207 | template <int64_t UnitPerDay, int64_t NanosecondsPerUnit> |
208 | inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { |
209 | int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; |
210 | (*impala_timestamp).value[2] = (uint32_t)julian_days; |
211 | |
212 | int64_t last_day_units = time % UnitPerDay; |
213 | auto last_day_nanos = last_day_units * NanosecondsPerUnit; |
214 | // impala_timestamp will be unaligned every other entry so do memcpy instead |
215 | // of assign and reinterpret cast to avoid undefined behavior. |
216 | std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t)); |
217 | } |
218 | |
219 | constexpr int64_t kSecondsInNanos = INT64_C(1000000000); |
220 | |
221 | inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) { |
222 | ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds, |
223 | impala_timestamp); |
224 | } |
225 | |
226 | constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000); |
227 | |
228 | inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds, |
229 | Int96* impala_timestamp) { |
230 | ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>( |
231 | milliseconds, impala_timestamp); |
232 | } |
233 | |
234 | constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000); |
235 | |
236 | inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds, |
237 | Int96* impala_timestamp) { |
238 | ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>( |
239 | microseconds, impala_timestamp); |
240 | } |
241 | |
242 | constexpr int64_t kNanosecondsInNanos = INT64_C(1); |
243 | |
244 | inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds, |
245 | Int96* impala_timestamp) { |
246 | ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>( |
247 | nanoseconds, impala_timestamp); |
248 | } |
249 | |
250 | } // namespace internal |
251 | } // namespace parquet |
252 | |