| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #pragma once |
| 19 | |
| 20 | #include <cstdint> |
| 21 | #include <cstring> |
| 22 | #include <memory> |
| 23 | |
| 24 | #include "parquet/exception.h" |
| 25 | #include "parquet/platform.h" |
| 26 | #include "parquet/types.h" |
| 27 | |
| 28 | namespace arrow { |
| 29 | |
| 30 | class Array; |
| 31 | |
| 32 | namespace BitUtil { |
| 33 | class BitWriter; |
| 34 | } // namespace BitUtil |
| 35 | |
| 36 | namespace util { |
| 37 | class RleEncoder; |
| 38 | } // namespace util |
| 39 | |
| 40 | } // namespace arrow |
| 41 | |
| 42 | namespace parquet { |
| 43 | |
| 44 | struct ArrowWriteContext; |
| 45 | class ColumnDescriptor; |
| 46 | class CompressedDataPage; |
| 47 | class DictionaryPage; |
| 48 | class ColumnChunkMetaDataBuilder; |
| 49 | class WriterProperties; |
| 50 | |
| 51 | class PARQUET_EXPORT LevelEncoder { |
| 52 | public: |
| 53 | LevelEncoder(); |
| 54 | ~LevelEncoder(); |
| 55 | |
| 56 | static int MaxBufferSize(Encoding::type encoding, int16_t max_level, |
| 57 | int num_buffered_values); |
| 58 | |
| 59 | // Initialize the LevelEncoder. |
| 60 | void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values, |
| 61 | uint8_t* data, int data_size); |
| 62 | |
| 63 | // Encodes a batch of levels from an array and returns the number of levels encoded |
| 64 | int Encode(int batch_size, const int16_t* levels); |
| 65 | |
| 66 | int32_t len() { |
| 67 | if (encoding_ != Encoding::RLE) { |
| 68 | throw ParquetException("Only implemented for RLE encoding" ); |
| 69 | } |
| 70 | return rle_length_; |
| 71 | } |
| 72 | |
| 73 | private: |
| 74 | int bit_width_; |
| 75 | int rle_length_; |
| 76 | Encoding::type encoding_; |
| 77 | std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_; |
| 78 | std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_; |
| 79 | }; |
| 80 | |
/// \brief Abstract sink for serialized data pages and dictionary pages of a
/// single column chunk.
class PARQUET_EXPORT PageWriter {
 public:
  virtual ~PageWriter() {}

  /// \brief Create a PageWriter writing to the given output stream.
  ///
  /// \param sink the output stream to write pages to
  /// \param codec compression codec applied to page payloads
  /// \param compression_level codec-specific compression level
  /// \param metadata builder accumulating metadata for this column chunk
  /// \param pool memory pool for intermediate buffers
  /// \param buffered_row_group NOTE(review): presumably buffers pages in memory
  ///   until the row group is flushed instead of writing through — confirm in
  ///   the implementation.
  static std::unique_ptr<PageWriter> Open(
      const std::shared_ptr<ArrowOutputStream>& sink, Compression::type codec,
      int compression_level, ColumnChunkMetaDataBuilder* metadata,
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
      bool buffered_row_group = false);

  // The Column Writer decides if dictionary encoding is used if set and
  // if the dictionary encoding has fallen back to default encoding on reaching dictionary
  // page limit
  virtual void Close(bool has_dictionary, bool fallback) = 0;

  /// \brief Write a compressed data page; returns the number of bytes written.
  virtual int64_t WriteDataPage(const CompressedDataPage& page) = 0;

  /// \brief Write a dictionary page; returns the number of bytes written.
  virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;

  /// \brief Whether this writer has a compressor (used to decide whether
  /// Compress() may be called).
  virtual bool has_compressor() = 0;

  /// \brief Compress src_buffer into dest_buffer using this writer's codec.
  virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
};
| 104 | |
// Number of values handled per batch when writing; used as a chunking size by
// writer implementations.
static constexpr int WRITE_BATCH_SIZE = 1000;

/// \brief Abstract interface for writing values to a single column chunk.
class PARQUET_EXPORT ColumnWriter {
 public:
  virtual ~ColumnWriter() = default;

  /// \brief Create a ColumnWriter for the column described by the metadata
  /// builder, emitting pages through the given PageWriter.
  static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
                                            std::unique_ptr<PageWriter>,
                                            const WriterProperties* properties);

  /// \brief Closes the ColumnWriter, commits any buffered values to pages.
  /// \return Total size of the column in bytes
  virtual int64_t Close() = 0;

  /// \brief The physical Parquet type of the column
  virtual Type::type type() const = 0;

  /// \brief The schema for the column
  virtual const ColumnDescriptor* descr() const = 0;

  /// \brief The number of rows written so far
  virtual int64_t rows_written() const = 0;

  /// \brief The total size of the compressed pages + page headers. Some values
  /// might be still buffered and not written to a page yet
  virtual int64_t total_compressed_bytes() const = 0;

  /// \brief The total number of bytes written as serialized data and
  /// dictionary pages to the ColumnChunk so far
  virtual int64_t total_bytes_written() const = 0;

  /// \brief The file-level writer properties
  virtual const WriterProperties* properties() = 0;

  /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
  /// error status if the array data type is not compatible with the concrete
  /// writer type
  virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
                                     int64_t num_levels, const ::arrow::Array& array,
                                     ArrowWriteContext* ctx) = 0;
};
| 145 | |
| 146 | // API to write values to a single column. This is the main client facing API. |
// API to write values to a single column. This is the main client facing API.
template <typename DType>
class TypedColumnWriter : public ColumnWriter {
 public:
  // The C type matching the column's physical Parquet type (e.g. int32_t,
  // double, ByteArray).
  using T = typename DType::c_type;

  // Write a batch of repetition levels, definition levels, and values to the
  // column.
  virtual void WriteBatch(int64_t num_values, const int16_t* def_levels,
                          const int16_t* rep_levels, const T* values) = 0;

  /// Write a batch of repetition levels, definition levels, and values to the
  /// column.
  ///
  /// In comparison to WriteBatch the length of repetition and definition levels
  /// is the same as of the number of values read for max_definition_level == 1.
  /// In the case of max_definition_level > 1, the repetition and definition
  /// levels are larger than the values but the values include the null entries
  /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
  /// in the parameters of this function if the input has the length of num_values or the
  /// _number of rows in the lowest nesting level_.
  ///
  /// In the case that the most inner node in the Parquet is required, the _number of rows
  /// in the lowest nesting level_ is equal to the number of non-null values. If the
  /// inner-most schema node is optional, the _number of rows in the lowest nesting level_
  /// also includes all values with definition_level == (max_definition_level - 1).
  ///
  /// @param num_values number of levels to write.
  /// @param def_levels The Parquet definition levels, length is num_values
  /// @param rep_levels The Parquet repetition levels, length is num_values
  /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
  ///   level. The length is number of rows in the lowest nesting level.
  /// @param valid_bits_offset The offset in bits of the valid_bits where the
  ///   first relevant bit resides.
  /// @param values The values in the lowest nested level including
  ///   spacing for nulls on the lowest levels; input has the length
  ///   of the number of rows on the lowest nesting level.
  virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
                                const int16_t* rep_levels, const uint8_t* valid_bits,
                                int64_t valid_bits_offset, const T* values) = 0;

  // Estimated size of the values that are not written to a page yet
  virtual int64_t EstimatedBufferedValueBytes() const = 0;
};
| 190 | |
// Convenience aliases for the typed writers of each physical Parquet type.
using BoolWriter = TypedColumnWriter<BooleanType>;
using Int32Writer = TypedColumnWriter<Int32Type>;
using Int64Writer = TypedColumnWriter<Int64Type>;
using Int96Writer = TypedColumnWriter<Int96Type>;
using FloatWriter = TypedColumnWriter<FloatType>;
using DoubleWriter = TypedColumnWriter<DoubleType>;
using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
| 199 | |
| 200 | namespace internal { |
| 201 | |
| 202 | /** |
| 203 | * Timestamp conversion constants |
| 204 | */ |
| 205 | constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588); |
| 206 | |
| 207 | template <int64_t UnitPerDay, int64_t NanosecondsPerUnit> |
| 208 | inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) { |
| 209 | int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays; |
| 210 | (*impala_timestamp).value[2] = (uint32_t)julian_days; |
| 211 | |
| 212 | int64_t last_day_units = time % UnitPerDay; |
| 213 | auto last_day_nanos = last_day_units * NanosecondsPerUnit; |
| 214 | // impala_timestamp will be unaligned every other entry so do memcpy instead |
| 215 | // of assign and reinterpret cast to avoid undefined behavior. |
| 216 | std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t)); |
| 217 | } |
| 218 | |
// Nanoseconds per second.
constexpr int64_t kSecondsInNanos = INT64_C(1000000000);

/// \brief Convert a seconds-since-epoch timestamp to the Impala Int96 format.
inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
                                                                   impala_timestamp);
}
| 225 | |
// Nanoseconds per millisecond.
constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);

/// \brief Convert a milliseconds-since-epoch timestamp to the Impala Int96 format.
inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
                                          Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
      milliseconds, impala_timestamp);
}
| 233 | |
// Nanoseconds per microsecond.
constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);

/// \brief Convert a microseconds-since-epoch timestamp to the Impala Int96 format.
inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
                                          Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
      microseconds, impala_timestamp);
}
| 241 | |
// Nanoseconds per nanosecond (identity unit).
constexpr int64_t kNanosecondsInNanos = INT64_C(1);

/// \brief Convert a nanoseconds-since-epoch timestamp to the Impala Int96 format.
inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
                                         Int96* impala_timestamp) {
  ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
      nanoseconds, impala_timestamp);
}
| 249 | |
| 250 | } // namespace internal |
| 251 | } // namespace parquet |
| 252 | |