1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_COLUMN_STATISTICS_H |
19 | #define PARQUET_COLUMN_STATISTICS_H |
20 | |
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/comparison.h"
#include "parquet/util/macros.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
32 | |
33 | namespace parquet { |
34 | |
// Holder for column statistics: the min and max values as byte strings, the
// null and distinct counts, and one has_* flag per field recording whether
// that field has been set.
//
// The min/max strings are stored by value, so every copy of an
// EncodedStatistics is fully independent: calling set_min()/set_max() on a
// copy never changes the object it was copied from, and a moved-from object
// still holds valid strings.
class PARQUET_EXPORT EncodedStatistics {
  std::string max_, min_;

 public:
  EncodedStatistics() = default;

  // Stored max / min value; an empty string until the matching setter is called.
  const std::string& max() const { return max_; }
  const std::string& min() const { return min_; }

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  // Each flag is switched on by the corresponding set_*() method below.
  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // True as soon as at least one of the four fields has been set.
  inline bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  // Length of the longer of the stored max and min strings. This only reads
  // state, so it is const and can be used through const references.
  inline size_t max_stat_length() const {
    return std::max(max_.length(), min_.length());
  }

  // The setters store the value, raise the matching has_* flag and return
  // *this so that calls can be chained.
  inline EncodedStatistics& set_max(const std::string& value) {
    max_ = value;
    has_max = true;
    return *this;
  }

  inline EncodedStatistics& set_min(const std::string& value) {
    min_ = value;
    has_min = true;
    return *this;
  }

  inline EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  inline EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
84 | |
// Forward declaration of the typed statistics class template; the full
// definition appears further down in this header.
template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics;
87 | |
88 | class PARQUET_EXPORT RowGroupStatistics |
89 | : public std::enable_shared_from_this<RowGroupStatistics> { |
90 | public: |
91 | int64_t null_count() const { return statistics_.null_count; } |
92 | int64_t distinct_count() const { return statistics_.distinct_count; } |
93 | int64_t num_values() const { return num_values_; } |
94 | |
95 | virtual bool HasMinMax() const = 0; |
96 | virtual void Reset() = 0; |
97 | |
98 | // Plain-encoded minimum value |
99 | virtual std::string EncodeMin() = 0; |
100 | |
101 | // Plain-encoded maximum value |
102 | virtual std::string EncodeMax() = 0; |
103 | |
104 | virtual EncodedStatistics Encode() = 0; |
105 | |
106 | // Set the Corresponding Comparator |
107 | virtual void SetComparator() = 0; |
108 | |
109 | virtual ~RowGroupStatistics() {} |
110 | |
111 | Type::type physical_type() const { return descr_->physical_type(); } |
112 | |
113 | protected: |
114 | const ColumnDescriptor* descr() const { return descr_; } |
115 | void SetDescr(const ColumnDescriptor* schema) { |
116 | descr_ = schema; |
117 | SetComparator(); |
118 | } |
119 | |
120 | void IncrementNullCount(int64_t n) { statistics_.null_count += n; } |
121 | |
122 | void IncrementNumValues(int64_t n) { num_values_ += n; } |
123 | |
124 | void IncrementDistinctCount(int64_t n) { statistics_.distinct_count += n; } |
125 | |
126 | void MergeCounts(const RowGroupStatistics& other) { |
127 | this->statistics_.null_count += other.statistics_.null_count; |
128 | this->statistics_.distinct_count += other.statistics_.distinct_count; |
129 | this->num_values_ += other.num_values_; |
130 | } |
131 | |
132 | void ResetCounts() { |
133 | this->statistics_.null_count = 0; |
134 | this->statistics_.distinct_count = 0; |
135 | this->num_values_ = 0; |
136 | } |
137 | |
138 | const ColumnDescriptor* descr_ = NULLPTR; |
139 | int64_t num_values_ = 0; |
140 | EncodedStatistics statistics_; |
141 | }; |
142 | |
// Statistics for a column whose values have the C++ type DType::c_type
// (aliased as T below). Implements the pure virtual interface of
// RowGroupStatistics. Only declarations appear in this class body; apart from
// the Copy() definitions further down in this header, the member definitions
// are not part of this header section.
template <typename DType>
class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics : public RowGroupStatistics {
 public:
  using T = typename DType::c_type;

  // Constructs statistics for the column described by `schema`. `pool`
  // defaults to the Arrow default memory pool (presumably it backs the
  // min/max buffers -- confirm against the definition).
  TypedRowGroupStatistics(const ColumnDescriptor* schema,
                          ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  // Constructs statistics from already-typed min/max values and counts.
  // NOTE(review): no ColumnDescriptor is passed here, so descr_ may remain
  // NULLPTR for objects built this way -- confirm against the definition.
  TypedRowGroupStatistics(const T& min, const T& max, int64_t num_values,
                          int64_t null_count, int64_t distinct_count);

  // Constructs statistics from min/max values given as strings (presumably
  // plain-encoded, to be decoded with PlainDecode -- confirm), plus counts and
  // a flag telling whether the min/max values are meaningful.
  TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min,
                          const std::string& encoded_max, int64_t num_values,
                          int64_t null_count, int64_t distinct_count, bool has_min_max,
                          ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  // Overrides of the RowGroupStatistics interface.
  bool HasMinMax() const override;
  void Reset() override;
  void SetComparator() override;
  // Combines the statistics of `other` into this object (name-based reading;
  // the definition is not visible here).
  void Merge(const TypedRowGroupStatistics<DType>& other);

  // Feed new values into the statistics. `num_not_null` / `num_null` are the
  // counts of non-null and null entries supplied by the caller.
  void Update(const T* values, int64_t num_not_null, int64_t num_null);
  // Variant taking a validity bitmap. NOTE(review): the exact meaning of
  // `valid_bits_spaced` (looks like an offset or length related to
  // `valid_bits`) is not visible here -- confirm against the definition.
  void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced,
                    int64_t num_not_null, int64_t num_null);
  void SetMinMax(const T& min, const T& max);

  // Typed accessors for the current minimum and maximum.
  const T& min() const;
  const T& max() const;

  // Overrides returning the statistics in encoded form.
  std::string EncodeMin() override;
  std::string EncodeMax() override;
  EncodedStatistics Encode() override;

 private:
  bool has_min_max_ = false;
  // min_, max_ and pool_ have no default member initializers; they rely on the
  // constructors (defined elsewhere) to set them.
  T min_;
  T max_;
  ::arrow::MemoryPool* pool_;
  std::shared_ptr<CompareDefault<DType> > comparator_;

  void PlainEncode(const T& src, std::string* dst);
  void PlainDecode(const std::string& src, T* dst);
  // Copies src into *dst. The generic definition in this header is a plain
  // assignment; the ByteArray and FLBA specializations copy the referenced
  // bytes into `buffer` and point *dst at that copy.
  void Copy(const T& src, T* dst, ResizableBuffer* buffer);

  // Scratch buffers; presumably the `buffer` arguments handed to Copy() for
  // min_ and max_ respectively -- confirm against the definitions.
  std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
};
189 | |
190 | template <typename DType> |
191 | inline void TypedRowGroupStatistics<DType>::Copy(const T& src, T* dst, ResizableBuffer*) { |
192 | *dst = src; |
193 | } |
194 | |
195 | template <> |
196 | inline void TypedRowGroupStatistics<FLBAType>::Copy(const FLBA& src, FLBA* dst, |
197 | ResizableBuffer* buffer) { |
198 | if (dst->ptr == src.ptr) return; |
199 | uint32_t len = descr_->type_length(); |
200 | PARQUET_THROW_NOT_OK(buffer->Resize(len, false)); |
201 | std::memcpy(buffer->mutable_data(), src.ptr, len); |
202 | *dst = FLBA(buffer->data()); |
203 | } |
204 | |
205 | template <> |
206 | inline void TypedRowGroupStatistics<ByteArrayType>::Copy(const ByteArray& src, |
207 | ByteArray* dst, |
208 | ResizableBuffer* buffer) { |
209 | if (dst->ptr == src.ptr) return; |
210 | PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false)); |
211 | std::memcpy(buffer->mutable_data(), src.ptr, src.len); |
212 | *dst = ByteArray(src.len, buffer->data()); |
213 | } |
214 | |
// Declares that PlainEncode has an explicit specialization for ByteArrayType.
// The definition is not in this header section; declaring it here keeps other
// translation units from instantiating the generic version for this type.
template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(const T& src, std::string* dst);
217 | |
// Declares that PlainDecode has an explicit specialization for ByteArrayType.
// The definition is not in this header section; declaring it here keeps other
// translation units from instantiating the generic version for this type.
template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(const std::string& src, T* dst);
220 | |
221 | typedef TypedRowGroupStatistics<BooleanType> BoolStatistics; |
222 | typedef TypedRowGroupStatistics<Int32Type> Int32Statistics; |
223 | typedef TypedRowGroupStatistics<Int64Type> Int64Statistics; |
224 | typedef TypedRowGroupStatistics<Int96Type> Int96Statistics; |
225 | typedef TypedRowGroupStatistics<FloatType> FloatStatistics; |
226 | typedef TypedRowGroupStatistics<DoubleType> DoubleStatistics; |
227 | typedef TypedRowGroupStatistics<ByteArrayType> ByteArrayStatistics; |
228 | typedef TypedRowGroupStatistics<FLBAType> FLBAStatistics; |
229 | |
// One PARQUET_EXTERN_TEMPLATE line per supported data type. The macro is
// defined outside this file; presumably it expands to an explicit
// instantiation declaration (`extern template class ...`) so that including
// translation units do not instantiate these classes themselves -- confirm
// against the macro definition.
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<BooleanType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int32Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int64Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<Int96Type>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<FloatType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<DoubleType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<ByteArrayType>;
PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics<FLBAType>;
238 | |
239 | } // namespace parquet |
240 | |
241 | #endif // PARQUET_COLUMN_STATISTICS_H |
242 | |