1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
#include "orc/Exceptions.hh"
#include "RLE.hh"
#include "Statistics.hh"

#include "wrap/coded-stream-wrapper.h"

#include <limits>
24
25namespace orc {
26
27 ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
28 const StatContext& statContext) {
29 if (s.has_intstatistics()) {
30 return new IntegerColumnStatisticsImpl(s);
31 } else if (s.has_doublestatistics()) {
32 return new DoubleColumnStatisticsImpl(s);
33 } else if (s.has_stringstatistics()) {
34 return new StringColumnStatisticsImpl(s, statContext);
35 } else if (s.has_bucketstatistics()) {
36 return new BooleanColumnStatisticsImpl(s, statContext);
37 } else if (s.has_decimalstatistics()) {
38 return new DecimalColumnStatisticsImpl(s, statContext);
39 } else if (s.has_timestampstatistics()) {
40 return new TimestampColumnStatisticsImpl(s, statContext);
41 } else if (s.has_datestatistics()) {
42 return new DateColumnStatisticsImpl(s, statContext);
43 } else if (s.has_binarystatistics()) {
44 return new BinaryColumnStatisticsImpl(s, statContext);
45 } else {
46 return new ColumnStatisticsImpl(s);
47 }
48 }
49
50 StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats,
51 const StatContext& statContext) {
52 for(int i = 0; i < stripeStats.colstats_size(); i++) {
53 colStats.push_back(
54 convertColumnStatistics(stripeStats.colstats(i), statContext));
55 }
56 }
57
58 StatisticsImpl::StatisticsImpl(const proto::Footer& footer,
59 const StatContext& statContext) {
60 for(int i = 0; i < footer.statistics_size(); i++) {
61 colStats.push_back(
62 convertColumnStatistics(footer.statistics(i), statContext));
63 }
64 }
65
66 StatisticsImpl::~StatisticsImpl() {
67 for(std::list<ColumnStatistics*>::iterator ptr = colStats.begin();
68 ptr != colStats.end();
69 ++ptr) {
70 delete *ptr;
71 }
72 }
73
74 Statistics::~Statistics() {
75 // PASS
76 }
77
78 StripeStatistics::~StripeStatistics() {
79 // PASS
80 }
81
82 StripeStatisticsImpl::~StripeStatisticsImpl() {
83 // PASS
84 }
85
86 StripeStatisticsImpl::StripeStatisticsImpl(
87 const proto::StripeStatistics& stripeStats,
88 std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
89 const StatContext& statContext) {
90 columnStats.reset(new StatisticsImpl(stripeStats, statContext));
91 rowIndexStats.resize(indexStats.size());
92 for(size_t i = 0; i < rowIndexStats.size(); i++) {
93 for(size_t j = 0; j < indexStats[i].size(); j++) {
94 rowIndexStats[i].push_back(
95 std::shared_ptr<const ColumnStatistics>(
96 convertColumnStatistics(indexStats[i][j], statContext)));
97 }
98 }
99 }
100
101
102 ColumnStatistics::~ColumnStatistics() {
103 // PASS
104 }
105
106 BinaryColumnStatistics::~BinaryColumnStatistics() {
107 // PASS
108 }
109
110 BooleanColumnStatistics::~BooleanColumnStatistics() {
111 // PASS
112 }
113
114 DateColumnStatistics::~DateColumnStatistics() {
115 // PASS
116 }
117
118 DecimalColumnStatistics::~DecimalColumnStatistics() {
119 // PASS
120 }
121
122 DoubleColumnStatistics::~DoubleColumnStatistics() {
123 // PASS
124 }
125
126 IntegerColumnStatistics::~IntegerColumnStatistics() {
127 // PASS
128 }
129
130 StringColumnStatistics::~StringColumnStatistics() {
131 // PASS
132 }
133
134 TimestampColumnStatistics::~TimestampColumnStatistics() {
135 // PASS
136 }
137
138 MutableColumnStatistics::~MutableColumnStatistics() {
139 // PASS
140 }
141
142 ColumnStatisticsImpl::~ColumnStatisticsImpl() {
143 // PASS
144 }
145
146 BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() {
147 // PASS
148 }
149
150 BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() {
151 // PASS
152 }
153
154 DateColumnStatisticsImpl::~DateColumnStatisticsImpl() {
155 // PASS
156 }
157
158 DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() {
159 // PASS
160 }
161
162 DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() {
163 // PASS
164 }
165
166 IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() {
167 // PASS
168 }
169
170 void IntegerColumnStatisticsImpl::update(int64_t value, int repetitions) {
171 _stats.updateMinMax(value);
172
173 if (_stats.hasSum()) {
174 bool wasPositive = _stats.getSum() >= 0;
175 _stats.setSum(value * repetitions + _stats.getSum());
176 if ((value >= 0) == wasPositive) {
177 _stats.setHasSum((_stats.getSum() >= 0) == wasPositive);
178 }
179 }
180 }
181
182 StringColumnStatisticsImpl::~StringColumnStatisticsImpl() {
183 // PASS
184 }
185
186 TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() {
187 // PASS
188 }
189
190 ColumnStatisticsImpl::ColumnStatisticsImpl
191 (const proto::ColumnStatistics& pb) {
192 _stats.setNumberOfValues(pb.numberofvalues());
193 _stats.setHasNull(pb.hasnull());
194 }
195
196 BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl
197 (const proto::ColumnStatistics& pb, const StatContext& statContext){
198 _stats.setNumberOfValues(pb.numberofvalues());
199 _stats.setHasNull(pb.hasnull());
200 if (pb.has_binarystatistics() && statContext.correctStats) {
201 _stats.setHasTotalLength(pb.binarystatistics().has_sum());
202 _stats.setTotalLength(
203 static_cast<uint64_t>(pb.binarystatistics().sum()));
204 }
205 }
206
207 BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl
208 (const proto::ColumnStatistics& pb, const StatContext& statContext){
209 _stats.setNumberOfValues(pb.numberofvalues());
210 _stats.setHasNull(pb.hasnull());
211 if (pb.has_bucketstatistics() && statContext.correctStats) {
212 _hasCount = true;
213 _trueCount = pb.bucketstatistics().count(0);
214 } else {
215 _hasCount = false;
216 _trueCount = 0;
217 }
218 }
219
220 DateColumnStatisticsImpl::DateColumnStatisticsImpl
221 (const proto::ColumnStatistics& pb, const StatContext& statContext){
222 _stats.setNumberOfValues(pb.numberofvalues());
223 _stats.setHasNull(pb.hasnull());
224 if (!pb.has_datestatistics() || !statContext.correctStats) {
225 // hasMinimum_ is false by default;
226 // hasMaximum_ is false by default;
227 _stats.setMinimum(0);
228 _stats.setMaximum(0);
229 } else {
230 _stats.setHasMinimum(pb.datestatistics().has_minimum());
231 _stats.setHasMaximum(pb.datestatistics().has_maximum());
232 _stats.setMinimum(pb.datestatistics().minimum());
233 _stats.setMaximum(pb.datestatistics().maximum());
234 }
235 }
236
237 DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl
238 (const proto::ColumnStatistics& pb, const StatContext& statContext){
239 _stats.setNumberOfValues(pb.numberofvalues());
240 _stats.setHasNull(pb.hasnull());
241 if (pb.has_decimalstatistics() && statContext.correctStats) {
242 const proto::DecimalStatistics& stats = pb.decimalstatistics();
243 _stats.setHasMinimum(stats.has_minimum());
244 _stats.setHasMaximum(stats.has_maximum());
245 _stats.setHasSum(stats.has_sum());
246
247 _stats.setMinimum(Decimal(stats.minimum()));
248 _stats.setMaximum(Decimal(stats.maximum()));
249 _stats.setSum(Decimal(stats.sum()));
250 }
251 }
252
253 DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl
254 (const proto::ColumnStatistics& pb){
255 _stats.setNumberOfValues(pb.numberofvalues());
256 _stats.setHasNull(pb.hasnull());
257 if (!pb.has_doublestatistics()) {
258 _stats.setMinimum(0);
259 _stats.setMaximum(0);
260 _stats.setSum(0);
261 }else{
262 const proto::DoubleStatistics& stats = pb.doublestatistics();
263 _stats.setHasMinimum(stats.has_minimum());
264 _stats.setHasMaximum(stats.has_maximum());
265 _stats.setHasSum(stats.has_sum());
266
267 _stats.setMinimum(stats.minimum());
268 _stats.setMaximum(stats.maximum());
269 _stats.setSum(stats.sum());
270 }
271 }
272
273 IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl
274 (const proto::ColumnStatistics& pb){
275 _stats.setNumberOfValues(pb.numberofvalues());
276 _stats.setHasNull(pb.hasnull());
277 if (!pb.has_intstatistics()) {
278 _stats.setMinimum(0);
279 _stats.setMaximum(0);
280 _stats.setSum(0);
281 }else{
282 const proto::IntegerStatistics& stats = pb.intstatistics();
283 _stats.setHasMinimum(stats.has_minimum());
284 _stats.setHasMaximum(stats.has_maximum());
285 _stats.setHasSum(stats.has_sum());
286
287 _stats.setMinimum(stats.minimum());
288 _stats.setMaximum(stats.maximum());
289 _stats.setSum(stats.sum());
290 }
291 }
292
293 StringColumnStatisticsImpl::StringColumnStatisticsImpl
294 (const proto::ColumnStatistics& pb, const StatContext& statContext){
295 _stats.setNumberOfValues(pb.numberofvalues());
296 _stats.setHasNull(pb.hasnull());
297 if (!pb.has_stringstatistics() || !statContext.correctStats) {
298 _stats.setTotalLength(0);
299 }else{
300 const proto::StringStatistics& stats = pb.stringstatistics();
301 _stats.setHasMinimum(stats.has_minimum());
302 _stats.setHasMaximum(stats.has_maximum());
303 _stats.setHasTotalLength(stats.has_sum());
304
305 _stats.setMinimum(stats.minimum());
306 _stats.setMaximum(stats.maximum());
307 _stats.setTotalLength(static_cast<uint64_t>(stats.sum()));
308 }
309 }
310
311 TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl
312 (const proto::ColumnStatistics& pb, const StatContext& statContext) {
313 _stats.setNumberOfValues(pb.numberofvalues());
314 _stats.setHasNull(pb.hasnull());
315 if (!pb.has_timestampstatistics() || !statContext.correctStats) {
316 _stats.setMinimum(0);
317 _stats.setMaximum(0);
318 _lowerBound = 0;
319 _upperBound = 0;
320 }else{
321 const proto::TimestampStatistics& stats = pb.timestampstatistics();
322 _stats.setHasMinimum(
323 stats.has_minimumutc() ||
324 (stats.has_minimum() && (statContext.writerTimezone != nullptr)));
325 _stats.setHasMaximum(
326 stats.has_maximumutc() ||
327 (stats.has_maximum() && (statContext.writerTimezone != nullptr)));
328 _hasLowerBound = stats.has_minimumutc() || stats.has_minimum();
329 _hasUpperBound = stats.has_maximumutc() || stats.has_maximum();
330
331 // Timestamp stats are stored in milliseconds
332 if (stats.has_minimumutc()) {
333 int64_t minimum = stats.minimumutc();
334 _stats.setMinimum(minimum);
335 _lowerBound = minimum;
336 } else if (statContext.writerTimezone) {
337 int64_t writerTimeSec = stats.minimum() / 1000;
338 // multiply the offset by 1000 to convert to millisecond
339 int64_t minimum =
340 stats.minimum() +
341 (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
342 * 1000;
343 _stats.setMinimum(minimum);
344 _lowerBound = minimum;
345 } else {
346 _stats.setMinimum(0);
347 // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown
348 // TZ and daylight savings
349 _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000);
350 }
351
352 // Timestamp stats are stored in milliseconds
353 if (stats.has_maximumutc()) {
354 int64_t maximum = stats.maximumutc();
355 _stats.setMaximum(maximum);
356 _upperBound = maximum;
357 } else if (statContext.writerTimezone) {
358 int64_t writerTimeSec = stats.maximum() / 1000;
359 // multiply the offset by 1000 to convert to millisecond
360 int64_t maximum = stats.maximum() +
361 (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset)
362 * 1000;
363 _stats.setMaximum(maximum);
364 _upperBound = maximum;
365 } else {
366 _stats.setMaximum(0);
367 // add 1 day 1 hour (25 hours) in milliseconds to handle unknown
368 // TZ and daylight savings
369 _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000);
370 }
371 // Add 1 millisecond to account for microsecond precision of values
372 _upperBound += 1;
373 }
374 }
375
376 std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
377 const Type& type) {
378 switch (static_cast<int64_t>(type.getKind())) {
379 case BOOLEAN:
380 return std::unique_ptr<MutableColumnStatistics>(
381 new BooleanColumnStatisticsImpl());
382 case BYTE:
383 case INT:
384 case LONG:
385 case SHORT:
386 return std::unique_ptr<MutableColumnStatistics>(
387 new IntegerColumnStatisticsImpl());
388 case STRUCT:
389 case MAP:
390 case LIST:
391 case UNION:
392 return std::unique_ptr<MutableColumnStatistics>(
393 new ColumnStatisticsImpl());
394 case FLOAT:
395 case DOUBLE:
396 return std::unique_ptr<MutableColumnStatistics>(
397 new DoubleColumnStatisticsImpl());
398 case BINARY:
399 return std::unique_ptr<MutableColumnStatistics>(
400 new BinaryColumnStatisticsImpl());
401 case STRING:
402 case CHAR:
403 case VARCHAR:
404 return std::unique_ptr<MutableColumnStatistics>(
405 new StringColumnStatisticsImpl());
406 case DATE:
407 return std::unique_ptr<MutableColumnStatistics>(
408 new DateColumnStatisticsImpl());
409 case TIMESTAMP:
410 return std::unique_ptr<MutableColumnStatistics>(
411 new TimestampColumnStatisticsImpl());
412 case DECIMAL:
413 return std::unique_ptr<MutableColumnStatistics>(
414 new DecimalColumnStatisticsImpl());
415 default:
416 throw NotImplementedYet("Not supported type: " + type.toString());
417 }
418 }
419
420}// namespace
421