1/**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#ifndef ORC_STATISTICS_IMPL_HH
20#define ORC_STATISTICS_IMPL_HH
21
#include "orc/Common.hh"
#include "orc/Int128.hh"
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"

#include "Timezone.hh"
#include "TypeImpl.hh"

#include <limits>
29
30namespace orc {
31
32/**
33 * StatContext contains fields required to compute statistics
34 */
35
36 struct StatContext {
37 const bool correctStats;
38 const Timezone* const writerTimezone;
39 StatContext() : correctStats(false), writerTimezone(nullptr) {}
40 StatContext(bool cStat, const Timezone* const timezone = nullptr) :
41 correctStats(cStat), writerTimezone(timezone) {}
42 };
43
44/**
45 * Internal Statistics Implementation
46 */
47
48 template <typename T>
49 class InternalStatisticsImpl {
50 private:
51 bool _hasNull;
52 bool _hasMinimum;
53 bool _hasMaximum;
54 bool _hasSum;
55 bool _hasTotalLength;
56 uint64_t _totalLength;
57 uint64_t _valueCount;
58 T _minimum;
59 T _maximum;
60 T _sum;
61 public:
62 InternalStatisticsImpl() {
63 _hasNull = false;
64 _hasMinimum = false;
65 _hasMaximum = false;
66 _hasSum = false;
67 _hasTotalLength = false;
68 _totalLength = 0;
69 _valueCount = 0;
70 }
71
72 ~InternalStatisticsImpl() {}
73
74 // GET / SET _totalLength
75 bool hasTotalLength() const { return _hasTotalLength; }
76
77 void setHasTotalLength(bool hasTotalLength) {
78 _hasTotalLength = hasTotalLength;
79 }
80
81 uint64_t getTotalLength() const { return _totalLength; }
82
83 void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; }
84
85 // GET / SET _sum
86 bool hasSum() const { return _hasSum; }
87
88 void setHasSum(bool hasSum) { _hasSum = hasSum; }
89
90 T getSum() const { return _sum; }
91
92 void setSum(T sum) { _sum = sum; }
93
94 // GET / SET _maximum
95 bool hasMaximum() const { return _hasMaximum; }
96
97 T getMaximum() const { return _maximum; }
98
99 void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; }
100
101 void setMaximum(T max) { _maximum = max; }
102
103 // GET / SET _minimum
104 bool hasMinimum() const { return _hasMinimum; }
105
106 void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; }
107
108 T getMinimum() const { return _minimum; }
109
110 void setMinimum(T min) { _minimum = min; }
111
112 // GET / SET _valueCount
113 uint64_t getNumberOfValues() const { return _valueCount; }
114
115 void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; }
116
117 // GET / SET _hasNullValue
118 bool hasNull() const { return _hasNull; }
119
120 void setHasNull(bool hasNull) { _hasNull = hasNull; }
121
122 void reset() {
123 _hasNull = false;
124 _hasMinimum = false;
125 _hasMaximum = false;
126 _hasSum = false;
127 _hasTotalLength = false;
128 _totalLength = 0;
129 _valueCount = 0;
130 }
131
132 void updateMinMax(T value) {
133 if (!_hasMinimum) {
134 _hasMinimum = _hasMaximum = true;
135 _minimum = _maximum = value;
136 } else if (compare(value, _minimum)) {
137 _minimum = value;
138 } else if (compare(_maximum, value)) {
139 _maximum = value;
140 }
141 }
142
143 // sum is not merged here as we need to check overflow
144 void merge(const InternalStatisticsImpl& other) {
145 _hasNull = _hasNull || other._hasNull;
146 _valueCount += other._valueCount;
147
148 if (other._hasMinimum) {
149 if (!_hasMinimum) {
150 _hasMinimum = _hasMaximum = true;
151 _minimum = other._minimum;
152 _maximum = other._maximum;
153 } else {
154 // all template types should support operator<
155 if (compare(_maximum, other._maximum)) {
156 _maximum = other._maximum;
157 }
158 if (compare(other._minimum, _minimum)) {
159 _minimum = other._minimum;
160 }
161 }
162 }
163
164 _hasTotalLength = _hasTotalLength && other._hasTotalLength;
165 _totalLength += other._totalLength;
166 }
167 };
168
169 typedef InternalStatisticsImpl<char> InternalCharStatistics;
170 typedef InternalStatisticsImpl<uint64_t> InternalBooleanStatistics;
171 typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics;
172 typedef InternalStatisticsImpl<int32_t> InternalDateStatistics;
173 typedef InternalStatisticsImpl<double> InternalDoubleStatistics;
174 typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics;
175 typedef InternalStatisticsImpl<std::string> InternalStringStatistics;
176
177 /**
178 * Mutable column statistics for use by the writer.
179 */
180 class MutableColumnStatistics {
181 public:
182 virtual ~MutableColumnStatistics();
183
184 virtual void increase(uint64_t count) = 0;
185
186 virtual void setNumberOfValues(uint64_t value) = 0;
187
188 virtual void setHasNull(bool hasNull) = 0;
189
190 virtual void merge(const MutableColumnStatistics& other) = 0;
191
192 virtual void reset() = 0;
193
194 virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0;
195 };
196
197/**
198 * ColumnStatistics Implementation
199 */
200
201 class ColumnStatisticsImpl: public ColumnStatistics,
202 public MutableColumnStatistics {
203 private:
204 InternalCharStatistics _stats;
205 public:
206 ColumnStatisticsImpl() { reset(); }
207 ColumnStatisticsImpl(const proto::ColumnStatistics& stats);
208 virtual ~ColumnStatisticsImpl() override;
209
210 uint64_t getNumberOfValues() const override {
211 return _stats.getNumberOfValues();
212 }
213
214 void setNumberOfValues(uint64_t value) override {
215 _stats.setNumberOfValues(value);
216 }
217
218 void increase(uint64_t count) override {
219 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
220 }
221
222 bool hasNull() const override {
223 return _stats.hasNull();
224 }
225
226 void setHasNull(bool hasNull) override {
227 _stats.setHasNull(hasNull);
228 }
229
230 void merge(const MutableColumnStatistics& other) override {
231 _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats);
232 }
233
234 void reset() override {
235 _stats.reset();
236 }
237
238 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
239 pbStats.set_hasnull(_stats.hasNull());
240 pbStats.set_numberofvalues(_stats.getNumberOfValues());
241 }
242
243 std::string toString() const override {
244 std::ostringstream buffer;
245 buffer << "Column has " << getNumberOfValues() << " values"
246 << " and has null value: " << (hasNull() ? "yes" : "no")
247 << std::endl;
248 return buffer.str();
249 }
250 };
251
252 class BinaryColumnStatisticsImpl: public BinaryColumnStatistics,
253 public MutableColumnStatistics {
254 private:
255 InternalCharStatistics _stats;
256 public:
257 BinaryColumnStatisticsImpl() { reset(); }
258 BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
259 const StatContext& statContext);
260 virtual ~BinaryColumnStatisticsImpl() override;
261
262 uint64_t getNumberOfValues() const override {
263 return _stats.getNumberOfValues();
264 }
265
266 void setNumberOfValues(uint64_t value) override {
267 _stats.setNumberOfValues(value);
268 }
269
270 void increase(uint64_t count) override {
271 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
272 }
273
274 bool hasNull() const override {
275 return _stats.hasNull();
276 }
277
278 void setHasNull(bool hasNull) override {
279 _stats.setHasNull(hasNull);
280 }
281
282 bool hasTotalLength() const override {
283 return _stats.hasTotalLength();
284 }
285
286 uint64_t getTotalLength() const override {
287 if(hasTotalLength()){
288 return _stats.getTotalLength();
289 }else{
290 throw ParseError("Total length is not defined.");
291 }
292 }
293
294 void setTotalLength(uint64_t length) {
295 _stats.setHasTotalLength(true);
296 _stats.setTotalLength(length);
297 }
298
299 void update(size_t length) {
300 _stats.setTotalLength(_stats.getTotalLength() + length);
301 }
302
303 void merge(const MutableColumnStatistics& other) override {
304 const BinaryColumnStatisticsImpl& binStats =
305 dynamic_cast<const BinaryColumnStatisticsImpl&>(other);
306 _stats.merge(binStats._stats);
307 }
308
309 void reset() override {
310 _stats.reset();
311 setTotalLength(0);
312 }
313
314 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
315 pbStats.set_hasnull(_stats.hasNull());
316 pbStats.set_numberofvalues(_stats.getNumberOfValues());
317
318 proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics();
319 binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
320 }
321
322 std::string toString() const override {
323 std::ostringstream buffer;
324 buffer << "Data type: Binary" << std::endl
325 << "Values: " << getNumberOfValues() << std::endl
326 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
327 if(hasTotalLength()){
328 buffer << "Total length: " << getTotalLength() << std::endl;
329 }else{
330 buffer << "Total length: not defined" << std::endl;
331 }
332 return buffer.str();
333 }
334 };
335
336 class BooleanColumnStatisticsImpl: public BooleanColumnStatistics,
337 public MutableColumnStatistics {
338 private:
339 InternalBooleanStatistics _stats;
340 bool _hasCount;
341 uint64_t _trueCount;
342
343 public:
344 BooleanColumnStatisticsImpl() { reset(); }
345 BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
346 const StatContext& statContext);
347 virtual ~BooleanColumnStatisticsImpl() override;
348
349 bool hasCount() const override {
350 return _hasCount;
351 }
352
353 void increase(uint64_t count) override {
354 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
355 _hasCount = true;
356 }
357
358 uint64_t getNumberOfValues() const override {
359 return _stats.getNumberOfValues();
360 }
361
362 void setNumberOfValues(uint64_t value) override {
363 _stats.setNumberOfValues(value);
364 }
365
366 bool hasNull() const override {
367 return _stats.hasNull();
368 }
369
370 void setHasNull(bool hasNull) override {
371 _stats.setHasNull(hasNull);
372 }
373
374 uint64_t getFalseCount() const override {
375 if(hasCount()){
376 return getNumberOfValues() - _trueCount;
377 }else{
378 throw ParseError("False count is not defined.");
379 }
380 }
381
382 uint64_t getTrueCount() const override {
383 if(hasCount()){
384 return _trueCount;
385 }else{
386 throw ParseError("True count is not defined.");
387 }
388 }
389
390 void setTrueCount(uint64_t trueCount) {
391 _hasCount = true;
392 _trueCount = trueCount;
393 }
394
395 void update(bool value, size_t repetitions) {
396 if (value) {
397 _trueCount += repetitions;
398 }
399 }
400
401 void merge(const MutableColumnStatistics& other) override {
402 const BooleanColumnStatisticsImpl& boolStats =
403 dynamic_cast<const BooleanColumnStatisticsImpl&>(other);
404 _stats.merge(boolStats._stats);
405 _hasCount = _hasCount && boolStats._hasCount;
406 _trueCount += boolStats._trueCount;
407 }
408
409 void reset() override {
410 _stats.reset();
411 setTrueCount(0);
412 }
413
414 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
415 pbStats.set_hasnull(_stats.hasNull());
416 pbStats.set_numberofvalues(_stats.getNumberOfValues());
417
418 proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics();
419 if (_hasCount) {
420 bucketStats->add_count(_trueCount);
421 }
422 }
423
424 std::string toString() const override {
425 std::ostringstream buffer;
426 buffer << "Data type: Boolean" << std::endl
427 << "Values: " << getNumberOfValues() << std::endl
428 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
429 if(hasCount()){
430 buffer << "(true: " << getTrueCount() << "; false: "
431 << getFalseCount() << ")" << std::endl;
432 } else {
433 buffer << "(true: not defined; false: not defined)" << std::endl;
434 buffer << "True and false count are not defined" << std::endl;
435 }
436 return buffer.str();
437 }
438 };
439
440 class DateColumnStatisticsImpl: public DateColumnStatistics,
441 public MutableColumnStatistics{
442 private:
443 InternalDateStatistics _stats;
444 public:
445 DateColumnStatisticsImpl() { reset(); }
446 DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
447 const StatContext& statContext);
448 virtual ~DateColumnStatisticsImpl() override;
449
450 bool hasMinimum() const override {
451 return _stats.hasMinimum();
452 }
453
454 bool hasMaximum() const override {
455 return _stats.hasMaximum();
456 }
457
458 void increase(uint64_t count) override {
459 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
460 }
461
462 uint64_t getNumberOfValues() const override {
463 return _stats.getNumberOfValues();
464 }
465
466 void setNumberOfValues(uint64_t value) override {
467 _stats.setNumberOfValues(value);
468 }
469
470 bool hasNull() const override {
471 return _stats.hasNull();
472 }
473
474 void setHasNull(bool hasNull) override {
475 _stats.setHasNull(hasNull);
476 }
477
478 int32_t getMinimum() const override {
479 if(hasMinimum()){
480 return _stats.getMinimum();
481 }else{
482 throw ParseError("Minimum is not defined.");
483 }
484 }
485
486 int32_t getMaximum() const override {
487 if(hasMaximum()){
488 return _stats.getMaximum();
489 }else{
490 throw ParseError("Maximum is not defined.");
491 }
492 }
493
494 void setMinimum(int32_t minimum) {
495 _stats.setHasMinimum(true);
496 _stats.setMinimum(minimum);
497 }
498
499 void setMaximum(int32_t maximum) {
500 _stats.setHasMaximum(true);
501 _stats.setMaximum(maximum);
502 }
503
504 void update(int32_t value) {
505 _stats.updateMinMax(value);
506 }
507
508 void merge(const MutableColumnStatistics& other) override {
509 const DateColumnStatisticsImpl& dateStats =
510 dynamic_cast<const DateColumnStatisticsImpl&>(other);
511 _stats.merge(dateStats._stats);
512 }
513
514 void reset() override {
515 _stats.reset();
516 }
517
518 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
519 pbStats.set_hasnull(_stats.hasNull());
520 pbStats.set_numberofvalues(_stats.getNumberOfValues());
521
522 if (_stats.hasMinimum()) {
523 proto::DateStatistics* dateStatistics =
524 pbStats.mutable_datestatistics();
525 dateStatistics->set_maximum(_stats.getMaximum());
526 dateStatistics->set_minimum(_stats.getMinimum());
527 }
528 }
529
530 std::string toString() const override {
531 std::ostringstream buffer;
532 buffer << "Data type: Date" << std::endl
533 << "Values: " << getNumberOfValues() << std::endl
534 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
535 if(hasMinimum()){
536 buffer << "Minimum: " << getMinimum() << std::endl;
537 }else{
538 buffer << "Minimum: not defined" << std::endl;
539 }
540
541 if(hasMaximum()){
542 buffer << "Maximum: " << getMaximum() << std::endl;
543 }else{
544 buffer << "Maximum: not defined" << std::endl;
545 }
546 return buffer.str();
547 }
548 };
549
550 class DecimalColumnStatisticsImpl: public DecimalColumnStatistics,
551 public MutableColumnStatistics {
552 private:
553 InternalDecimalStatistics _stats;
554
555 public:
556 DecimalColumnStatisticsImpl() { reset(); }
557 DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
558 const StatContext& statContext);
559 virtual ~DecimalColumnStatisticsImpl() override;
560
561 bool hasMinimum() const override {
562 return _stats.hasMinimum();
563 }
564
565 bool hasMaximum() const override {
566 return _stats.hasMaximum();
567 }
568
569 bool hasSum() const override {
570 return _stats.hasSum();
571 }
572
573 void increase(uint64_t count) override {
574 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
575 }
576
577 uint64_t getNumberOfValues() const override {
578 return _stats.getNumberOfValues();
579 }
580
581 void setNumberOfValues(uint64_t value) override {
582 _stats.setNumberOfValues(value);
583 }
584
585 bool hasNull() const override {
586 return _stats.hasNull();
587 }
588
589 void setHasNull(bool hasNull) override {
590 _stats.setHasNull(hasNull);
591 }
592
593 Decimal getMinimum() const override {
594 if(hasMinimum()){
595 return _stats.getMinimum();
596 }else{
597 throw ParseError("Minimum is not defined.");
598 }
599 }
600
601 Decimal getMaximum() const override {
602 if(hasMaximum()){
603 return _stats.getMaximum();
604 }else{
605 throw ParseError("Maximum is not defined.");
606 }
607 }
608
609 void setMinimum(Decimal minimum) {
610 _stats.setHasMinimum(true);
611 _stats.setMinimum(minimum);
612 }
613
614 void setMaximum(Decimal maximum) {
615 _stats.setHasMaximum(true);
616 _stats.setMaximum(maximum);
617 }
618
619 Decimal getSum() const override {
620 if(hasSum()){
621 return _stats.getSum();
622 }else{
623 throw ParseError("Sum is not defined.");
624 }
625 }
626
627 void setSum(Decimal sum) {
628 _stats.setHasSum(true);
629 _stats.setSum(sum);
630 }
631
632 void update(const Decimal& value) {
633 _stats.updateMinMax(value);
634
635 if (_stats.hasSum()) {
636 updateSum(value);
637 }
638 }
639
640 void merge(const MutableColumnStatistics& other) override {
641 const DecimalColumnStatisticsImpl& decStats =
642 dynamic_cast<const DecimalColumnStatisticsImpl&>(other);
643
644 _stats.merge(decStats._stats);
645
646 _stats.setHasSum(_stats.hasSum() && decStats.hasSum());
647 if (_stats.hasSum()) {
648 updateSum(decStats.getSum());
649 }
650 }
651
652 void reset() override {
653 _stats.reset();
654 setSum(Decimal());
655 }
656
657 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
658 pbStats.set_hasnull(_stats.hasNull());
659 pbStats.set_numberofvalues(_stats.getNumberOfValues());
660
661 proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics();
662 if (_stats.hasMinimum()) {
663 decStats->set_minimum(_stats.getMinimum().toString());
664 decStats->set_maximum(_stats.getMaximum().toString());
665 }
666 if (_stats.hasSum()) {
667 decStats->set_sum(_stats.getSum().toString());
668 }
669 }
670
671 std::string toString() const override {
672 std::ostringstream buffer;
673 buffer << "Data type: Decimal" << std::endl
674 << "Values: " << getNumberOfValues() << std::endl
675 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
676 if(hasMinimum()){
677 buffer << "Minimum: " << getMinimum().toString() << std::endl;
678 }else{
679 buffer << "Minimum: not defined" << std::endl;
680 }
681
682 if(hasMaximum()){
683 buffer << "Maximum: " << getMaximum().toString() << std::endl;
684 }else{
685 buffer << "Maximum: not defined" << std::endl;
686 }
687
688 if(hasSum()){
689 buffer << "Sum: " << getSum().toString() << std::endl;
690 }else{
691 buffer << "Sum: not defined" << std::endl;
692 }
693
694 return buffer.str();
695 }
696
697 private:
698 void updateSum(Decimal value) {
699 if (_stats.hasSum()) {
700 bool overflow = false;
701 Decimal sum = _stats.getSum();
702 if (sum.scale > value.scale) {
703 value.value = scaleUpInt128ByPowerOfTen(value.value,
704 sum.scale - value.scale,
705 overflow);
706 } else if (sum.scale < value.scale) {
707 sum.value = scaleUpInt128ByPowerOfTen(sum.value,
708 value.scale - sum.scale,
709 overflow);
710 sum.scale = value.scale;
711 }
712
713 if (!overflow) {
714 bool wasPositive = sum.value >= 0;
715 sum.value += value.value;
716 if ((value.value >= 0) == wasPositive) {
717 _stats.setHasSum((sum.value >= 0) == wasPositive);
718 }
719 } else {
720 _stats.setHasSum(false);
721 }
722
723 if (_stats.hasSum()) {
724 _stats.setSum(sum);
725 }
726 }
727 }
728 };
729
730 class DoubleColumnStatisticsImpl: public DoubleColumnStatistics,
731 public MutableColumnStatistics {
732 private:
733 InternalDoubleStatistics _stats;
734 public:
735 DoubleColumnStatisticsImpl() { reset(); }
736 DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats);
737 virtual ~DoubleColumnStatisticsImpl() override;
738
739 bool hasMinimum() const override {
740 return _stats.hasMinimum();
741 }
742
743 bool hasMaximum() const override {
744 return _stats.hasMaximum();
745 }
746
747 bool hasSum() const override {
748 return _stats.hasSum();
749 }
750
751 void increase(uint64_t count) override {
752 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
753 }
754
755 uint64_t getNumberOfValues() const override {
756 return _stats.getNumberOfValues();
757 }
758
759 void setNumberOfValues(uint64_t value) override {
760 _stats.setNumberOfValues(value);
761 }
762
763 bool hasNull() const override {
764 return _stats.hasNull();
765 }
766
767 void setHasNull(bool hasNull) override {
768 _stats.setHasNull(hasNull);
769 }
770
771 double getMinimum() const override {
772 if(hasMinimum()){
773 return _stats.getMinimum();
774 }else{
775 throw ParseError("Minimum is not defined.");
776 }
777 }
778
779 double getMaximum() const override {
780 if(hasMaximum()){
781 return _stats.getMaximum();
782 }else{
783 throw ParseError("Maximum is not defined.");
784 }
785 }
786
787 void setMinimum(double minimum) {
788 _stats.setHasMinimum(true);
789 _stats.setMinimum(minimum);
790 }
791
792 void setMaximum(double maximum) {
793 _stats.setHasMaximum(true);
794 _stats.setMaximum(maximum);
795 }
796
797 double getSum() const override {
798 if(hasSum()){
799 return _stats.getSum();
800 }else{
801 throw ParseError("Sum is not defined.");
802 }
803 }
804
805 void setSum(double sum) {
806 _stats.setHasSum(true);
807 _stats.setSum(sum);
808 }
809
810 void update(double value) {
811 _stats.updateMinMax(value);
812 _stats.setSum(_stats.getSum() + value);
813 }
814
815 void merge(const MutableColumnStatistics& other) override {
816 const DoubleColumnStatisticsImpl& doubleStats =
817 dynamic_cast<const DoubleColumnStatisticsImpl&>(other);
818 _stats.merge(doubleStats._stats);
819
820 _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum());
821 if (_stats.hasSum()) {
822 _stats.setSum(_stats.getSum() + doubleStats.getSum());
823 }
824 }
825
826 void reset() override {
827 _stats.reset();
828 setSum(0.0);
829 }
830
831 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
832 pbStats.set_hasnull(_stats.hasNull());
833 pbStats.set_numberofvalues(_stats.getNumberOfValues());
834
835 proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics();
836 if (_stats.hasMinimum()) {
837 doubleStats->set_minimum(_stats.getMinimum());
838 doubleStats->set_maximum(_stats.getMaximum());
839 }
840 if (_stats.hasSum()) {
841 doubleStats->set_sum(_stats.getSum());
842 }
843 }
844
845 std::string toString() const override {
846 std::ostringstream buffer;
847 buffer << "Data type: Double" << std::endl
848 << "Values: " << getNumberOfValues() << std::endl
849 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
850 if(hasMinimum()){
851 buffer << "Minimum: " << getMinimum() << std::endl;
852 }else{
853 buffer << "Minimum: not defined" << std::endl;
854 }
855
856 if(hasMaximum()){
857 buffer << "Maximum: " << getMaximum() << std::endl;
858 }else{
859 buffer << "Maximum: not defined" << std::endl;
860 }
861
862 if(hasSum()){
863 buffer << "Sum: " << getSum() << std::endl;
864 }else{
865 buffer << "Sum: not defined" << std::endl;
866 }
867 return buffer.str();
868 }
869 };
870
871 class IntegerColumnStatisticsImpl: public IntegerColumnStatistics,
872 public MutableColumnStatistics {
873 private:
874 InternalIntegerStatistics _stats;
875 public:
876 IntegerColumnStatisticsImpl() { reset(); }
877 IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats);
878 virtual ~IntegerColumnStatisticsImpl() override;
879
880 bool hasMinimum() const override {
881 return _stats.hasMinimum();
882 }
883
884 bool hasMaximum() const override {
885 return _stats.hasMaximum();
886 }
887
888 bool hasSum() const override {
889 return _stats.hasSum();
890 }
891
892 void increase(uint64_t count) override {
893 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
894 }
895
896 uint64_t getNumberOfValues() const override {
897 return _stats.getNumberOfValues();
898 }
899
900 void setNumberOfValues(uint64_t value) override {
901 _stats.setNumberOfValues(value);
902 }
903
904 bool hasNull() const override {
905 return _stats.hasNull();
906 }
907
908 void setHasNull(bool hasNull) override {
909 _stats.setHasNull(hasNull);
910 }
911
912 int64_t getMinimum() const override {
913 if(hasMinimum()){
914 return _stats.getMinimum();
915 }else{
916 throw ParseError("Minimum is not defined.");
917 }
918 }
919
920 int64_t getMaximum() const override {
921 if(hasMaximum()){
922 return _stats.getMaximum();
923 }else{
924 throw ParseError("Maximum is not defined.");
925 }
926 }
927
928 void setMinimum(int64_t minimum) {
929 _stats.setHasMinimum(true);
930 _stats.setMinimum(minimum);
931 }
932
933 void setMaximum(int64_t maximum) {
934 _stats.setHasMaximum(true);
935 _stats.setMaximum(maximum);
936 }
937
938 int64_t getSum() const override {
939 if(hasSum()){
940 return _stats.getSum();
941 }else{
942 throw ParseError("Sum is not defined.");
943 }
944 }
945
946 void setSum(int64_t sum) {
947 _stats.setHasSum(true);
948 _stats.setSum(sum);
949 }
950
951 void update(int64_t value, int repetitions);
952
953 void merge(const MutableColumnStatistics& other) override {
954 const IntegerColumnStatisticsImpl& intStats =
955 dynamic_cast<const IntegerColumnStatisticsImpl&>(other);
956
957 _stats.merge(intStats._stats);
958
959 // update sum and check overflow
960 _stats.setHasSum(_stats.hasSum() && intStats.hasSum());
961 if (_stats.hasSum()) {
962 bool wasPositive = _stats.getSum() >= 0;
963 _stats.setSum(_stats.getSum() + intStats.getSum());
964 if ((intStats.getSum() >= 0) == wasPositive) {
965 _stats.setHasSum((_stats.getSum() >= 0) == wasPositive);
966 }
967 }
968 }
969
970 void reset() override {
971 _stats.reset();
972 setSum(0);
973 }
974
975 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
976 pbStats.set_hasnull(_stats.hasNull());
977 pbStats.set_numberofvalues(_stats.getNumberOfValues());
978
979 proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics();
980 if (_stats.hasMinimum()) {
981 intStats->set_minimum(_stats.getMinimum());
982 intStats->set_maximum(_stats.getMaximum());
983 }
984 if (_stats.hasSum()) {
985 intStats->set_sum(_stats.getSum());
986 }
987 }
988
989 std::string toString() const override {
990 std::ostringstream buffer;
991 buffer << "Data type: Integer" << std::endl
992 << "Values: " << getNumberOfValues() << std::endl
993 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
994 if(hasMinimum()){
995 buffer << "Minimum: " << getMinimum() << std::endl;
996 }else{
997 buffer << "Minimum: not defined" << std::endl;
998 }
999
1000 if(hasMaximum()){
1001 buffer << "Maximum: " << getMaximum() << std::endl;
1002 }else{
1003 buffer << "Maximum: not defined" << std::endl;
1004 }
1005
1006 if(hasSum()){
1007 buffer << "Sum: " << getSum() << std::endl;
1008 }else{
1009 buffer << "Sum: not defined" << std::endl;
1010 }
1011 return buffer.str();
1012 }
1013 };
1014
1015 class StringColumnStatisticsImpl: public StringColumnStatistics,
1016 public MutableColumnStatistics{
1017 private:
1018 InternalStringStatistics _stats;
1019
1020 public:
1021 StringColumnStatisticsImpl() {
1022 reset();
1023 }
1024 StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
1025 const StatContext& statContext);
1026 virtual ~StringColumnStatisticsImpl() override;
1027
1028 bool hasMinimum() const override {
1029 return _stats.hasMinimum();
1030 }
1031
1032 bool hasMaximum() const override {
1033 return _stats.hasMaximum();
1034 }
1035
1036 bool hasTotalLength() const override {
1037 return _stats.hasTotalLength();
1038 }
1039
1040 void increase(uint64_t count) override {
1041 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
1042 }
1043
1044 uint64_t getNumberOfValues() const override {
1045 return _stats.getNumberOfValues();
1046 }
1047
1048 void setNumberOfValues(uint64_t value) override {
1049 _stats.setNumberOfValues(value);
1050 }
1051
1052 bool hasNull() const override {
1053 return _stats.hasNull();
1054 }
1055
1056 void setHasNull(bool hasNull) override {
1057 _stats.setHasNull(hasNull);
1058 }
1059
1060 std::string getMinimum() const override {
1061 if(hasMinimum()){
1062 return _stats.getMinimum();
1063 }else{
1064 throw ParseError("Minimum is not defined.");
1065 }
1066 }
1067
1068 std::string getMaximum() const override {
1069 if(hasMaximum()){
1070 return _stats.getMaximum();
1071 }else{
1072 throw ParseError("Maximum is not defined.");
1073 }
1074 }
1075
1076 void setMinimum(std::string minimum) {
1077 _stats.setHasMinimum(true);
1078 _stats.setMinimum(minimum);
1079 }
1080
1081 void setMaximum(std::string maximum) {
1082 _stats.setHasMaximum(true);
1083 _stats.setMaximum(maximum);
1084 }
1085
1086 uint64_t getTotalLength() const override {
1087 if(hasTotalLength()){
1088 return _stats.getTotalLength();
1089 }else{
1090 throw ParseError("Total length is not defined.");
1091 }
1092 }
1093
1094 void setTotalLength(uint64_t length) {
1095 _stats.setHasTotalLength(true);
1096 _stats.setTotalLength(length);
1097 }
1098
1099 void update(const char* value, size_t length) {
1100 if (value != nullptr) {
1101 if (!_stats.hasMinimum()) {
1102 setMinimum(std::string(value, value + length));
1103 setMaximum(std::string(value, value + length));
1104 } else {
1105 // update min
1106 int minCmp = strncmp(_stats.getMinimum().c_str(),
1107 value,
1108 std::min(_stats.getMinimum().length(), length));
1109 if (minCmp > 0 ||
1110 (minCmp == 0 && length < _stats.getMinimum().length())) {
1111 setMinimum(std::string(value, value + length));
1112 }
1113
1114 // update max
1115 int maxCmp = strncmp(_stats.getMaximum().c_str(),
1116 value,
1117 std::min(_stats.getMaximum().length(), length));
1118 if (maxCmp < 0 ||
1119 (maxCmp == 0 && length > _stats.getMaximum().length())) {
1120 setMaximum(std::string(value, value + length));
1121 }
1122 }
1123 }
1124
1125 _stats.setTotalLength(_stats.getTotalLength() + length);
1126 }
1127
1128 void update(std::string value) {
1129 update(value.c_str(), value.length());
1130 }
1131
1132 void merge(const MutableColumnStatistics& other) override {
1133 const StringColumnStatisticsImpl& strStats =
1134 dynamic_cast<const StringColumnStatisticsImpl&>(other);
1135 _stats.merge(strStats._stats);
1136 }
1137
1138 void reset() override {
1139 _stats.reset();
1140 setTotalLength(0);
1141 }
1142
1143 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
1144 pbStats.set_hasnull(_stats.hasNull());
1145 pbStats.set_numberofvalues(_stats.getNumberOfValues());
1146
1147 proto::StringStatistics* strStats = pbStats.mutable_stringstatistics();
1148 if (_stats.hasMinimum()) {
1149 strStats->set_minimum(_stats.getMinimum());
1150 strStats->set_maximum(_stats.getMaximum());
1151 }
1152 if (_stats.hasTotalLength()) {
1153 strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength()));
1154 }
1155 }
1156
1157 std::string toString() const override {
1158 std::ostringstream buffer;
1159 buffer << "Data type: String" << std::endl
1160 << "Values: " << getNumberOfValues() << std::endl
1161 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
1162 if(hasMinimum()){
1163 buffer << "Minimum: " << getMinimum() << std::endl;
1164 }else{
1165 buffer << "Minimum is not defined" << std::endl;
1166 }
1167
1168 if(hasMaximum()){
1169 buffer << "Maximum: " << getMaximum() << std::endl;
1170 }else{
1171 buffer << "Maximum is not defined" << std::endl;
1172 }
1173
1174 if(hasTotalLength()){
1175 buffer << "Total length: " << getTotalLength() << std::endl;
1176 }else{
1177 buffer << "Total length is not defined" << std::endl;
1178 }
1179 return buffer.str();
1180 }
1181 };
1182
1183 class TimestampColumnStatisticsImpl: public TimestampColumnStatistics,
1184 public MutableColumnStatistics {
1185 private:
1186 InternalIntegerStatistics _stats;
1187 bool _hasLowerBound;
1188 bool _hasUpperBound;
1189 int64_t _lowerBound;
1190 int64_t _upperBound;
1191
1192 public:
1193 TimestampColumnStatisticsImpl() { reset(); }
1194 TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
1195 const StatContext& statContext);
1196 virtual ~TimestampColumnStatisticsImpl() override;
1197
1198 bool hasMinimum() const override {
1199 return _stats.hasMinimum();
1200 }
1201
1202 bool hasMaximum() const override {
1203 return _stats.hasMaximum();
1204 }
1205
1206 uint64_t getNumberOfValues() const override {
1207 return _stats.getNumberOfValues();
1208 }
1209
1210 void setNumberOfValues(uint64_t value) override {
1211 _stats.setNumberOfValues(value);
1212 }
1213
1214 void increase(uint64_t count) override {
1215 _stats.setNumberOfValues(_stats.getNumberOfValues() + count);
1216 }
1217
1218 bool hasNull() const override {
1219 return _stats.hasNull();
1220 }
1221
1222 void setHasNull(bool hasNull) override {
1223 _stats.setHasNull(hasNull);
1224 }
1225
1226 int64_t getMinimum() const override {
1227 if(hasMinimum()){
1228 return _stats.getMinimum();
1229 }else{
1230 throw ParseError("Minimum is not defined.");
1231 }
1232 }
1233
1234 int64_t getMaximum() const override {
1235 if(hasMaximum()){
1236 return _stats.getMaximum();
1237 }else{
1238 throw ParseError("Maximum is not defined.");
1239 }
1240 }
1241
1242 void setMinimum(int64_t minimum) {
1243 _stats.setHasMinimum(true);
1244 _stats.setMinimum(minimum);
1245 }
1246
1247 void setMaximum(int64_t maximum) {
1248 _stats.setHasMaximum(true);
1249 _stats.setMaximum(maximum);
1250 }
1251
1252 void update(int64_t value) {
1253 _stats.updateMinMax(value);
1254 }
1255
1256 void merge(const MutableColumnStatistics& other) override {
1257 const TimestampColumnStatisticsImpl& tsStats =
1258 dynamic_cast<const TimestampColumnStatisticsImpl&>(other);
1259 _stats.merge(tsStats._stats);
1260 }
1261
1262 void reset() override {
1263 _stats.reset();
1264 }
1265
1266 void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
1267 pbStats.set_hasnull(_stats.hasNull());
1268 pbStats.set_numberofvalues(_stats.getNumberOfValues());
1269
1270 if (_stats.hasMinimum()) {
1271 proto::TimestampStatistics* tsStats =
1272 pbStats.mutable_timestampstatistics();
1273 tsStats->set_minimumutc(_stats.getMinimum());
1274 tsStats->set_maximumutc(_stats.getMaximum());
1275 }
1276 }
1277
1278 std::string toString() const override {
1279 std::ostringstream buffer;
1280 struct tm tmValue;
1281 char timeBuffer[20];
1282 time_t secs = 0;
1283
1284 buffer << "Data type: Timestamp" << std::endl
1285 << "Values: " << getNumberOfValues() << std::endl
1286 << "Has null: " << (hasNull() ? "yes" : "no") << std::endl;
1287 if(hasMinimum()){
1288 secs = static_cast<time_t>(getMinimum() / 1000);
1289 gmtime_r(&secs, &tmValue);
1290 strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
1291 buffer << "Minimum: " << timeBuffer << "."
1292 << (getMinimum() % 1000) << std::endl;
1293 }else{
1294 buffer << "Minimum is not defined" << std::endl;
1295 }
1296
1297 if(hasLowerBound()){
1298 secs = static_cast<time_t>(getLowerBound() / 1000);
1299 gmtime_r(&secs, &tmValue);
1300 strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
1301 buffer << "LowerBound: " << timeBuffer << "."
1302 << (getLowerBound() % 1000) << std::endl;
1303 }else{
1304 buffer << "LowerBound is not defined" << std::endl;
1305 }
1306
1307 if(hasMaximum()){
1308 secs = static_cast<time_t>(getMaximum()/1000);
1309 gmtime_r(&secs, &tmValue);
1310 strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
1311 buffer << "Maximum: " << timeBuffer << "."
1312 << (getMaximum() % 1000) << std::endl;
1313 }else{
1314 buffer << "Maximum is not defined" << std::endl;
1315 }
1316
1317 if(hasUpperBound()){
1318 secs = static_cast<time_t>(getUpperBound() / 1000);
1319 gmtime_r(&secs, &tmValue);
1320 strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
1321 buffer << "UpperBound: " << timeBuffer << "."
1322 << (getUpperBound() % 1000) << std::endl;
1323 }else{
1324 buffer << "UpperBound is not defined" << std::endl;
1325 }
1326
1327 return buffer.str();
1328 }
1329
1330 bool hasLowerBound() const override {
1331 return _hasLowerBound;
1332 }
1333
1334 bool hasUpperBound() const override {
1335 return _hasUpperBound;
1336 }
1337
1338 int64_t getLowerBound() const override {
1339 if(hasLowerBound()){
1340 return _lowerBound;
1341 }else{
1342 throw ParseError("LowerBound is not defined.");
1343 }
1344 }
1345
1346 int64_t getUpperBound() const override {
1347 if(hasUpperBound()){
1348 return _upperBound;
1349 }else{
1350 throw ParseError("UpperBound is not defined.");
1351 }
1352 }
1353 };
1354
/**
 * Converts a protobuf ColumnStatistics message into a ColumnStatistics
 * object. Only declared here; the definition is not part of this header.
 * @param s the protobuf column statistics to convert
 * @param statContext context passed along for interpreting the statistics
 * @return pointer to the resulting ColumnStatistics
 *         (NOTE(review): presumably heap-allocated and owned by the
 *         caller -- confirm against the definition)
 */
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
                                          const StatContext& statContext);
1357
1358 class StatisticsImpl: public Statistics {
1359 private:
1360 std::list<ColumnStatistics*> colStats;
1361
1362 // DELIBERATELY NOT IMPLEMENTED
1363 StatisticsImpl(const StatisticsImpl&);
1364 StatisticsImpl& operator=(const StatisticsImpl&);
1365
1366 public:
1367 StatisticsImpl(const proto::StripeStatistics& stripeStats,
1368 const StatContext& statContext);
1369
1370 StatisticsImpl(const proto::Footer& footer, const StatContext& statContext);
1371
1372 virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
1373 ) const override {
1374 std::list<ColumnStatistics*>::const_iterator it = colStats.begin();
1375 std::advance(it, static_cast<int64_t>(columnId));
1376 return *it;
1377 }
1378
1379 virtual ~StatisticsImpl() override;
1380
1381 uint32_t getNumberOfColumns() const override {
1382 return static_cast<uint32_t>(colStats.size());
1383 }
1384 };
1385
1386 class StripeStatisticsImpl: public StripeStatistics {
1387 private:
1388 std::unique_ptr<StatisticsImpl> columnStats;
1389 std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > >
1390 rowIndexStats;
1391
1392 // DELIBERATELY NOT IMPLEMENTED
1393 StripeStatisticsImpl(const StripeStatisticsImpl&);
1394 StripeStatisticsImpl& operator=(const StripeStatisticsImpl&);
1395
1396 public:
1397 StripeStatisticsImpl(
1398 const proto::StripeStatistics& stripeStats,
1399 std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
1400 const StatContext& statContext);
1401
1402 virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId
1403 ) const override {
1404 return columnStats->getColumnStatistics(columnId);
1405 }
1406
1407 uint32_t getNumberOfColumns() const override {
1408 return columnStats->getNumberOfColumns();
1409 }
1410
1411 virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
1412 uint32_t rowIndex
1413 ) const override {
1414 // check id indices are valid
1415 return rowIndexStats[columnId][rowIndex].get();
1416 }
1417
1418 virtual ~StripeStatisticsImpl() override;
1419
1420 uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
1421 return static_cast<uint32_t>(rowIndexStats[columnId].size());
1422 }
1423 };
1424
/**
 * Create ColumnStatistics for writers.
 * Only declared here; the definition is not part of this header.
 * @param type the type of the column to create statistics for
 * @return a MutableColumnStatistics instance, owned by the returned
 *         unique_ptr
 */
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(
                                                    const Type& type);
1432
1433}// namespace
1434
1435#endif
1436