1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#pragma once
19
20#include <algorithm>
21#include <cstddef>
22#include <cstdint>
23#include <memory>
24#include <string>
25
26#include "parquet/platform.h"
27#include "parquet/types.h"
28
// Forward declarations only: the Arrow array classes are referred to by
// reference (and in documentation) in this header, so they are declared here
// rather than defined.
namespace arrow {

class Array;
class BinaryArray;

}  // namespace arrow
35
36namespace parquet {
37
38class ColumnDescriptor;
39
40// ----------------------------------------------------------------------
41// Value comparator interfaces
42
43/// \brief Base class for value comparators. Generally used with
44/// TypedComparator<T>
45class PARQUET_EXPORT Comparator {
46 public:
47 virtual ~Comparator() {}
48
49 /// \brief Create a comparator explicitly from physical type and
50 /// sort order
51 /// \param[in] physical_type the physical type for the typed
52 /// comparator
53 /// \param[in] sort_order either SortOrder::SIGNED or
54 /// SortOrder::UNSIGNED
55 /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
56 static std::shared_ptr<Comparator> Make(Type::type physical_type,
57 SortOrder::type sort_order,
58 int type_length = -1);
59
60 /// \brief Create typed comparator inferring default sort order from
61 /// ColumnDescriptor
62 /// \param[in] descr the Parquet column schema
63 static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
64};
65
/// \brief Interface for comparison of physical types according to the
/// semantics of a particular logical type.
///
/// \tparam DType data type tag; the compared value type T is DType::c_type
template <typename DType>
class TypedComparator : public Comparator {
 public:
  /// The C++ value type handled by this comparator
  using T = typename DType::c_type;

  /// \brief Scalar comparison of two elements
  /// \param[in] a the first element
  /// \param[in] b the second element
  /// \return true if a is strictly less than b
  virtual bool Compare(const T& a, const T& b) = 0;

  /// \brief Compute minimum and maximum elements in a batch of
  /// elements without any nulls
  ///
  /// \param[in] values the sequence of values
  /// \param[in] length the length of the sequence
  /// \param[out] out_min the returned minimum element
  /// \param[out] out_max the returned maximum element
  virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0;

  /// \brief Compute minimum and maximum elements from an Arrow array. Only
  /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
  /// / arrow::BinaryArray
  ///
  /// \param[in] values the Arrow array to scan
  /// \param[out] out_min the returned minimum element
  /// \param[out] out_max the returned maximum element
  virtual void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) = 0;

  /// \brief Compute minimum and maximum elements in a batch of
  /// elements with accompanying bitmap indicating which elements are
  /// included (bit set) and excluded (bit not set)
  ///
  /// \param[in] values the sequence of values
  /// \param[in] length the length of the sequence
  /// \param[in] valid_bits a bitmap indicating which elements are
  /// included (1) or excluded (0)
  /// \param[in] valid_bits_offset the bit offset into the bitmap of
  /// the first element in the sequence
  /// \param[out] out_min the returned minimum element
  /// \param[out] out_max the returned maximum element
  virtual void GetMinMaxSpaced(const T* values, int64_t length, const uint8_t* valid_bits,
                               int64_t valid_bits_offset, T* out_min, T* out_max) = 0;
};
101
102/// \brief Typed version of Comparator::Make
103template <typename DType>
104std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
105 SortOrder::type sort_order,
106 int type_length = -1) {
107 return std::static_pointer_cast<TypedComparator<DType>>(
108 Comparator::Make(physical_type, sort_order, type_length));
109}
110
111/// \brief Typed version of Comparator::Make
112template <typename DType>
113std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
114 return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
115}
116
117// ----------------------------------------------------------------------
118
/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata
///
/// Instances have plain value semantics: the encoded min/max buffers are
/// stored by value, so a copy never shares state with the object it was
/// copied from, and a moved-from object can still be queried or reassigned.
class PARQUET_EXPORT EncodedStatistics {
  // Stored by value (not behind shared pointers) so that copies are
  // independent and a moved-from object never holds a null buffer.
  std::string max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// \brief The encoded maximum value; empty until set_max() is called
  const std::string& max() const { return max_; }
  /// \brief The encoded minimum value; empty until set_min() is called
  const std::string& min() const { return min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  // Each flag records whether the corresponding statistic has been set.
  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  /// \brief Drop the min and/or max statistic if its encoded form is longer
  /// than the given limit
  ///
  /// From parquet-mr:
  /// Don't write stats larger than the max size rather than truncating. The
  /// rationale is that some engines may use the minimum value in the page as
  /// the true minimum for aggregations and there is no way to mark that a
  /// value has been truncated and is a lower bound and not in the page.
  ///
  /// Only the has_min / has_max flags are cleared; the stored strings are left
  /// untouched.
  ///
  /// \param[in] length the maximum allowed encoded size, in bytes
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      has_max = false;
    }
    if (min_.length() > length) {
      has_min = false;
    }
  }

  /// \brief Return true if at least one statistic has been set
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  /// \brief The value last passed to set_is_signed() (false by default)
  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// \brief Store the encoded maximum and mark it as present
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_max(const std::string& value) {
    max_ = value;
    has_max = true;
    return *this;
  }

  /// \brief Store the encoded minimum and mark it as present
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_min(const std::string& value) {
    min_ = value;
    has_min = true;
    return *this;
  }

  /// \brief Store the null count and mark it as present
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// \brief Store the distinct count and mark it as present
  /// \return *this, to allow chaining of setters
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
186
187/// \brief Base type for computing column statistics while writing a file
188class PARQUET_EXPORT Statistics {
189 public:
190 virtual ~Statistics() {}
191
192 /// \brief Create a new statistics instance given a column schema
193 /// definition
194 /// \param[in] descr the column schema
195 /// \param[in] pool a memory pool to use for any memory allocations, optional
196 static std::shared_ptr<Statistics> Make(
197 const ColumnDescriptor* descr,
198 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
199
200 /// \brief Create a new statistics instance given a column schema
201 /// definition and pre-existing state
202 /// \param[in] descr the column schema
203 /// \param[in] encoded_min the encoded minimum value
204 /// \param[in] encoded_max the encoded maximum value
205 /// \param[in] num_values total number of values
206 /// \param[in] null_count number of null values
207 /// \param[in] distinct_count number of distinct values
208 /// \param[in] has_min_max whether the min/max statistics are set
209 /// \param[in] pool a memory pool to use for any memory allocations, optional
210 static std::shared_ptr<Statistics> Make(
211 const ColumnDescriptor* descr, const std::string& encoded_min,
212 const std::string& encoded_max, int64_t num_values, int64_t null_count,
213 int64_t distinct_count, bool has_min_max,
214 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
215
216 /// \brief The number of null values, may not be set
217 virtual int64_t null_count() const = 0;
218
219 /// \brief The number of distinct values, may not be set
220 virtual int64_t distinct_count() const = 0;
221
222 /// \brief The total number of values in the column
223 virtual int64_t num_values() const = 0;
224
225 /// \brief Return true if the min and max statistics are set. Obtain
226 /// with TypedStatistics<T>::min and max
227 virtual bool HasMinMax() const = 0;
228
229 /// \brief Reset state of object to initial (no data observed) state
230 virtual void Reset() = 0;
231
232 /// \brief Plain-encoded minimum value
233 virtual std::string EncodeMin() = 0;
234
235 /// \brief Plain-encoded maximum value
236 virtual std::string EncodeMax() = 0;
237
238 /// \brief The finalized encoded form of the statistics for transport
239 virtual EncodedStatistics Encode() = 0;
240
241 /// \brief The physical type of the column schema
242 virtual Type::type physical_type() const = 0;
243
244 /// \brief The full type descriptor from the column schema
245 virtual const ColumnDescriptor* descr() const = 0;
246
247 protected:
248 static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
249 const void* max, int64_t num_values,
250 int64_t null_count, int64_t distinct_count);
251};
252
/// \brief A typed implementation of Statistics
///
/// \tparam DType data type tag; the value type T is DType::c_type
template <typename DType>
class TypedStatistics : public Statistics {
 public:
  /// The C++ value type the statistics are computed over
  using T = typename DType::c_type;

  /// \brief The current minimum value
  virtual const T& min() const = 0;

  /// \brief The current maximum value
  virtual const T& max() const = 0;

  /// \brief Update state with state of another Statistics object
  virtual void Merge(const TypedStatistics<DType>& other) = 0;

  /// \brief Batch statistics update
  // NOTE(review): num_not_null / num_null presumably count the non-null values
  // in `values` and the additional nulls observed -- confirm against the
  // implementation.
  virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;

  /// \brief Batch statistics update with supplied validity bitmap
  // NOTE(review): valid_bits / valid_bits_offset presumably follow the same
  // convention as TypedComparator::GetMinMaxSpaced (bit set = value included)
  // -- confirm against the implementation.
  virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
                            int64_t valid_bits_offset, int64_t num_not_null,
                            int64_t num_null) = 0;

  /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
  /// conversion to a primitive Parquet C type. Only implemented for certain
  /// Parquet type / Arrow type combinations like BYTE_ARRAY /
  /// arrow::BinaryArray
  virtual void Update(const ::arrow::Array& values) = 0;

  /// \brief Set min and max values to particular values
  /// \param[in] min the new minimum value
  /// \param[in] max the new maximum value
  virtual void SetMinMax(const T& min, const T& max) = 0;
};
285
// Shorthand aliases for TypedStatistics instantiated with each of the data
// type tags named on the right-hand side (the tags are declared outside this
// header, presumably in parquet/types.h).
using BoolStatistics = TypedStatistics<BooleanType>;
using Int32Statistics = TypedStatistics<Int32Type>;
using Int64Statistics = TypedStatistics<Int64Type>;
using FloatStatistics = TypedStatistics<FloatType>;
using DoubleStatistics = TypedStatistics<DoubleType>;
using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
using FLBAStatistics = TypedStatistics<FLBAType>;
293
294/// \brief Typed version of Statistics::Make
295template <typename DType>
296std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
297 const ColumnDescriptor* descr,
298 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
299 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
300}
301
302/// \brief Create Statistics initialized to a particular state
303/// \param[in] min the minimum value
304/// \param[in] max the minimum value
305/// \param[in] num_values number of values
306/// \param[in] null_count number of null values
307/// \param[in] distinct_count number of distinct values
308template <typename DType>
309std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
310 const typename DType::c_type& max,
311 int64_t num_values,
312 int64_t null_count,
313 int64_t distinct_count) {
314 return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
315 DType::type_num, &min, &max, num_values, null_count, distinct_count));
316}
317
318/// \brief Typed version of Statistics::Make
319template <typename DType>
320std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
321 const ColumnDescriptor* descr, const std::string& encoded_min,
322 const std::string& encoded_max, int64_t num_values, int64_t null_count,
323 int64_t distinct_count, bool has_min_max,
324 ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
325 return std::static_pointer_cast<TypedStatistics<DType>>(
326 Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
327 distinct_count, has_min_max, pool));
328}
329
330} // namespace parquet
331