1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_COLUMN_STATISTICS_H
19#define PARQUET_COLUMN_STATISTICS_H
20
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
25
26#include "parquet/schema.h"
27#include "parquet/types.h"
28#include "parquet/util/comparison.h"
29#include "parquet/util/macros.h"
30#include "parquet/util/memory.h"
31#include "parquet/util/visibility.h"
32
33namespace parquet {
34
// Holder for encoded min/max byte strings plus null/distinct counters, each
// paired with a has_* flag recording whether the value was explicitly set.
//
// The min/max strings are held through std::shared_ptr and no copy operations
// are user-defined, so copying an EncodedStatistics copies the pointers only:
// a copy and its source refer to the same std::string objects, and a later
// set_min()/set_max() on one of them is visible through the other.
class PARQUET_EXPORT EncodedStatistics {
  std::shared_ptr<std::string> max_, min_;

 public:
  // Starts with empty (but allocated) min/max strings and all flags cleared.
  EncodedStatistics()
      : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}

  // Encoded maximum / minimum; empty strings until set_max() / set_min() is called.
  const std::string& max() const { return *max_; }
  const std::string& min() const { return *min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  // Presence flags; each one is raised by the matching set_* call below.
  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // True as soon as at least one of the four statistics has been set.
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  // Length in bytes of the longer of the encoded max and min values.
  // Const-qualified: it only reads state, and must be callable on const
  // objects and through const references.
  size_t max_stat_length() const { return std::max(max_->length(), min_->length()); }

  // The setters store the value, raise the matching has_* flag and return
  // *this so that calls can be chained.
  EncodedStatistics& set_max(const std::string& value) {
    *max_ = value;
    has_max = true;
    return *this;
  }

  EncodedStatistics& set_min(const std::string& value) {
    *min_ = value;
    has_min = true;
    return *this;
  }

  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
84
// Forward declaration of the typed statistics class template; the full
// definition appears further down in this header.
template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics;
87
88class PARQUET_EXPORT RowGroupStatistics
89 : public std::enable_shared_from_this<RowGroupStatistics> {
90 public:
91 int64_t null_count() const { return statistics_.null_count; }
92 int64_t distinct_count() const { return statistics_.distinct_count; }
93 int64_t num_values() const { return num_values_; }
94
95 virtual bool HasMinMax() const = 0;
96 virtual void Reset() = 0;
97
98 // Plain-encoded minimum value
99 virtual std::string EncodeMin() = 0;
100
101 // Plain-encoded maximum value
102 virtual std::string EncodeMax() = 0;
103
104 virtual EncodedStatistics Encode() = 0;
105
106 // Set the Corresponding Comparator
107 virtual void SetComparator() = 0;
108
109 virtual ~RowGroupStatistics() {}
110
111 Type::type physical_type() const { return descr_->physical_type(); }
112
113 protected:
114 const ColumnDescriptor* descr() const { return descr_; }
115 void SetDescr(const ColumnDescriptor* schema) {
116 descr_ = schema;
117 SetComparator();
118 }
119
120 void IncrementNullCount(int64_t n) { statistics_.null_count += n; }
121
122 void IncrementNumValues(int64_t n) { num_values_ += n; }
123
124 void IncrementDistinctCount(int64_t n) { statistics_.distinct_count += n; }
125
126 void MergeCounts(const RowGroupStatistics& other) {
127 this->statistics_.null_count += other.statistics_.null_count;
128 this->statistics_.distinct_count += other.statistics_.distinct_count;
129 this->num_values_ += other.num_values_;
130 }
131
132 void ResetCounts() {
133 this->statistics_.null_count = 0;
134 this->statistics_.distinct_count = 0;
135 this->num_values_ = 0;
136 }
137
138 const ColumnDescriptor* descr_ = NULLPTR;
139 int64_t num_values_ = 0;
140 EncodedStatistics statistics_;
141};
142
// Statistics for one physical type. DType supplies the in-memory value type
// through DType::c_type. Apart from Copy (defined further down in this
// header), the member functions declared here are defined outside this view,
// so the notes below describe only what the declarations themselves show.
template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics : public RowGroupStatistics {
 public:
  // In-memory value type for this column type.
  using T = typename DType::c_type;

  // Construct from a column descriptor; the memory pool defaults to
  // ::arrow::default_memory_pool().
  TypedRowGroupStatistics(const ColumnDescriptor* schema,
                          ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  // Construct from already-typed min/max values and counters. No descriptor
  // and no pool are passed to this overload.
  TypedRowGroupStatistics(const T& min, const T& max, int64_t num_values,
                          int64_t null_count, int64_t distinct_count);

  // Construct from encoded min/max strings and counters. has_min_max
  // presumably tells whether the encoded strings are meaningful — confirm
  // against the definition.
  TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min,
                          const std::string& encoded_max, int64_t num_values,
                          int64_t null_count, int64_t distinct_count, bool has_min_max,
                          ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  // Overrides of the RowGroupStatistics interface.
  bool HasMinMax() const override;
  void Reset() override;
  void SetComparator() override;

  // Combine with another statistics object of the same DType.
  void Merge(const TypedRowGroupStatistics<DType>& other);

  // Feed a batch of values. UpdateSpaced additionally takes a validity bitmap
  // (valid_bits) and an int64_t named valid_bits_spaced.
  // NOTE(review): exact meaning of valid_bits_spaced is not visible here — confirm.
  void Update(const T* values, int64_t num_not_null, int64_t num_null);
  void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced,
                    int64_t num_not_null, int64_t num_null);
  void SetMinMax(const T& min, const T& max);

  // Typed min/max accessors.
  const T& min() const;
  const T& max() const;

  // Overrides of the encoding half of the RowGroupStatistics interface.
  std::string EncodeMin() override;
  std::string EncodeMax() override;
  EncodedStatistics Encode() override;

 private:
  bool has_min_max_ = false;
  // min_, max_ and pool_ have no in-class initializer; any initialization
  // happens in the constructors, which are defined outside this view.
  T min_;
  T max_;
  ::arrow::MemoryPool* pool_;
  std::shared_ptr<CompareDefault<DType> > comparator_;

  // Conversions between a typed value and its std::string encoding.
  void PlainEncode(const T& src, std::string* dst);
  void PlainDecode(const std::string& src, T* dst);

  // Copies src into *dst. The FLBAType and ByteArrayType specializations
  // below use `buffer` as backing storage for the copied bytes; the generic
  // version ignores it.
  void Copy(const T& src, T* dst, ResizableBuffer* buffer);

  // Buffers passed to Copy() as backing storage — presumably one for min_
  // and one for max_; confirm against the definitions.
  std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
};
189
190template <typename DType>
191inline void TypedRowGroupStatistics<DType>::Copy(const T& src, T* dst, ResizableBuffer*) {
192 *dst = src;
193}
194
195template <>
196inline void TypedRowGroupStatistics<FLBAType>::Copy(const FLBA& src, FLBA* dst,
197 ResizableBuffer* buffer) {
198 if (dst->ptr == src.ptr) return;
199 uint32_t len = descr_->type_length();
200 PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
201 std::memcpy(buffer->mutable_data(), src.ptr, len);
202 *dst = FLBA(buffer->data());
203}
204
205template <>
206inline void TypedRowGroupStatistics<ByteArrayType>::Copy(const ByteArray& src,
207 ByteArray* dst,
208 ResizableBuffer* buffer) {
209 if (dst->ptr == src.ptr) return;
210 PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
211 std::memcpy(buffer->mutable_data(), src.ptr, src.len);
212 *dst = ByteArray(src.len, buffer->data());
213}
214
// Declarations of the ByteArrayType explicit specializations of PlainEncode
// and PlainDecode. Only the declarations live in this header; the definitions
// are provided elsewhere.
template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(const T& src, std::string* dst);

template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(const std::string& src, T* dst);
220
221typedef TypedRowGroupStatistics<BooleanType> BoolStatistics;
222typedef TypedRowGroupStatistics<Int32Type> Int32Statistics;
223typedef TypedRowGroupStatistics<Int64Type> Int64Statistics;
224typedef TypedRowGroupStatistics<Int96Type> Int96Statistics;
225typedef TypedRowGroupStatistics<FloatType> FloatStatistics;
226typedef TypedRowGroupStatistics<DoubleType> DoubleStatistics;
227typedef TypedRowGroupStatistics<ByteArrayType> ByteArrayStatistics;
228typedef TypedRowGroupStatistics<FLBAType> FLBAStatistics;
229
// One PARQUET_EXTERN_TEMPLATE line per data type, mirroring the aliases above.
// NOTE(review): the macro presumably expands to an `extern template class`
// declaration so that the instantiations are emitted in a single translation
// unit — confirm against its definition.
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<BooleanType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int32Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int64Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int96Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<FloatType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<DoubleType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<ByteArrayType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<FLBAType>;
238
239} // namespace parquet
240
241#endif // PARQUET_COLUMN_STATISTICS_H
242