1 | /** |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, software |
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | * See the License for the specific language governing permissions and |
16 | * limitations under the License. |
17 | */ |
18 | |
#include "orc/Exceptions.hh"
#include "RLE.hh"
#include "Statistics.hh"

#include "wrap/coded-stream-wrapper.h"

#include <limits>
24 | |
25 | namespace orc { |
26 | |
27 | ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, |
28 | const StatContext& statContext) { |
29 | if (s.has_intstatistics()) { |
30 | return new IntegerColumnStatisticsImpl(s); |
31 | } else if (s.has_doublestatistics()) { |
32 | return new DoubleColumnStatisticsImpl(s); |
33 | } else if (s.has_stringstatistics()) { |
34 | return new StringColumnStatisticsImpl(s, statContext); |
35 | } else if (s.has_bucketstatistics()) { |
36 | return new BooleanColumnStatisticsImpl(s, statContext); |
37 | } else if (s.has_decimalstatistics()) { |
38 | return new DecimalColumnStatisticsImpl(s, statContext); |
39 | } else if (s.has_timestampstatistics()) { |
40 | return new TimestampColumnStatisticsImpl(s, statContext); |
41 | } else if (s.has_datestatistics()) { |
42 | return new DateColumnStatisticsImpl(s, statContext); |
43 | } else if (s.has_binarystatistics()) { |
44 | return new BinaryColumnStatisticsImpl(s, statContext); |
45 | } else { |
46 | return new ColumnStatisticsImpl(s); |
47 | } |
48 | } |
49 | |
50 | StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, |
51 | const StatContext& statContext) { |
52 | for(int i = 0; i < stripeStats.colstats_size(); i++) { |
53 | colStats.push_back( |
54 | convertColumnStatistics(stripeStats.colstats(i), statContext)); |
55 | } |
56 | } |
57 | |
58 | StatisticsImpl::(const proto::Footer& , |
59 | const StatContext& statContext) { |
60 | for(int i = 0; i < footer.statistics_size(); i++) { |
61 | colStats.push_back( |
62 | convertColumnStatistics(footer.statistics(i), statContext)); |
63 | } |
64 | } |
65 | |
66 | StatisticsImpl::~StatisticsImpl() { |
67 | for(std::list<ColumnStatistics*>::iterator ptr = colStats.begin(); |
68 | ptr != colStats.end(); |
69 | ++ptr) { |
70 | delete *ptr; |
71 | } |
72 | } |
73 | |
74 | Statistics::~Statistics() { |
75 | // PASS |
76 | } |
77 | |
78 | StripeStatistics::~StripeStatistics() { |
79 | // PASS |
80 | } |
81 | |
82 | StripeStatisticsImpl::~StripeStatisticsImpl() { |
83 | // PASS |
84 | } |
85 | |
86 | StripeStatisticsImpl::StripeStatisticsImpl( |
87 | const proto::StripeStatistics& stripeStats, |
88 | std::vector<std::vector<proto::ColumnStatistics> >& indexStats, |
89 | const StatContext& statContext) { |
90 | columnStats.reset(new StatisticsImpl(stripeStats, statContext)); |
91 | rowIndexStats.resize(indexStats.size()); |
92 | for(size_t i = 0; i < rowIndexStats.size(); i++) { |
93 | for(size_t j = 0; j < indexStats[i].size(); j++) { |
94 | rowIndexStats[i].push_back( |
95 | std::shared_ptr<const ColumnStatistics>( |
96 | convertColumnStatistics(indexStats[i][j], statContext))); |
97 | } |
98 | } |
99 | } |
100 | |
101 | |
102 | ColumnStatistics::~ColumnStatistics() { |
103 | // PASS |
104 | } |
105 | |
106 | BinaryColumnStatistics::~BinaryColumnStatistics() { |
107 | // PASS |
108 | } |
109 | |
110 | BooleanColumnStatistics::~BooleanColumnStatistics() { |
111 | // PASS |
112 | } |
113 | |
114 | DateColumnStatistics::~DateColumnStatistics() { |
115 | // PASS |
116 | } |
117 | |
118 | DecimalColumnStatistics::~DecimalColumnStatistics() { |
119 | // PASS |
120 | } |
121 | |
122 | DoubleColumnStatistics::~DoubleColumnStatistics() { |
123 | // PASS |
124 | } |
125 | |
126 | IntegerColumnStatistics::~IntegerColumnStatistics() { |
127 | // PASS |
128 | } |
129 | |
130 | StringColumnStatistics::~StringColumnStatistics() { |
131 | // PASS |
132 | } |
133 | |
134 | TimestampColumnStatistics::~TimestampColumnStatistics() { |
135 | // PASS |
136 | } |
137 | |
138 | MutableColumnStatistics::~MutableColumnStatistics() { |
139 | // PASS |
140 | } |
141 | |
142 | ColumnStatisticsImpl::~ColumnStatisticsImpl() { |
143 | // PASS |
144 | } |
145 | |
146 | BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() { |
147 | // PASS |
148 | } |
149 | |
150 | BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() { |
151 | // PASS |
152 | } |
153 | |
154 | DateColumnStatisticsImpl::~DateColumnStatisticsImpl() { |
155 | // PASS |
156 | } |
157 | |
158 | DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() { |
159 | // PASS |
160 | } |
161 | |
162 | DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() { |
163 | // PASS |
164 | } |
165 | |
166 | IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() { |
167 | // PASS |
168 | } |
169 | |
170 | void IntegerColumnStatisticsImpl::update(int64_t value, int repetitions) { |
171 | _stats.updateMinMax(value); |
172 | |
173 | if (_stats.hasSum()) { |
174 | bool wasPositive = _stats.getSum() >= 0; |
175 | _stats.setSum(value * repetitions + _stats.getSum()); |
176 | if ((value >= 0) == wasPositive) { |
177 | _stats.setHasSum((_stats.getSum() >= 0) == wasPositive); |
178 | } |
179 | } |
180 | } |
181 | |
182 | StringColumnStatisticsImpl::~StringColumnStatisticsImpl() { |
183 | // PASS |
184 | } |
185 | |
186 | TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() { |
187 | // PASS |
188 | } |
189 | |
190 | ColumnStatisticsImpl::ColumnStatisticsImpl |
191 | (const proto::ColumnStatistics& pb) { |
192 | _stats.setNumberOfValues(pb.numberofvalues()); |
193 | _stats.setHasNull(pb.hasnull()); |
194 | } |
195 | |
196 | BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl |
197 | (const proto::ColumnStatistics& pb, const StatContext& statContext){ |
198 | _stats.setNumberOfValues(pb.numberofvalues()); |
199 | _stats.setHasNull(pb.hasnull()); |
200 | if (pb.has_binarystatistics() && statContext.correctStats) { |
201 | _stats.setHasTotalLength(pb.binarystatistics().has_sum()); |
202 | _stats.setTotalLength( |
203 | static_cast<uint64_t>(pb.binarystatistics().sum())); |
204 | } |
205 | } |
206 | |
207 | BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl |
208 | (const proto::ColumnStatistics& pb, const StatContext& statContext){ |
209 | _stats.setNumberOfValues(pb.numberofvalues()); |
210 | _stats.setHasNull(pb.hasnull()); |
211 | if (pb.has_bucketstatistics() && statContext.correctStats) { |
212 | _hasCount = true; |
213 | _trueCount = pb.bucketstatistics().count(0); |
214 | } else { |
215 | _hasCount = false; |
216 | _trueCount = 0; |
217 | } |
218 | } |
219 | |
220 | DateColumnStatisticsImpl::DateColumnStatisticsImpl |
221 | (const proto::ColumnStatistics& pb, const StatContext& statContext){ |
222 | _stats.setNumberOfValues(pb.numberofvalues()); |
223 | _stats.setHasNull(pb.hasnull()); |
224 | if (!pb.has_datestatistics() || !statContext.correctStats) { |
225 | // hasMinimum_ is false by default; |
226 | // hasMaximum_ is false by default; |
227 | _stats.setMinimum(0); |
228 | _stats.setMaximum(0); |
229 | } else { |
230 | _stats.setHasMinimum(pb.datestatistics().has_minimum()); |
231 | _stats.setHasMaximum(pb.datestatistics().has_maximum()); |
232 | _stats.setMinimum(pb.datestatistics().minimum()); |
233 | _stats.setMaximum(pb.datestatistics().maximum()); |
234 | } |
235 | } |
236 | |
237 | DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl |
238 | (const proto::ColumnStatistics& pb, const StatContext& statContext){ |
239 | _stats.setNumberOfValues(pb.numberofvalues()); |
240 | _stats.setHasNull(pb.hasnull()); |
241 | if (pb.has_decimalstatistics() && statContext.correctStats) { |
242 | const proto::DecimalStatistics& stats = pb.decimalstatistics(); |
243 | _stats.setHasMinimum(stats.has_minimum()); |
244 | _stats.setHasMaximum(stats.has_maximum()); |
245 | _stats.setHasSum(stats.has_sum()); |
246 | |
247 | _stats.setMinimum(Decimal(stats.minimum())); |
248 | _stats.setMaximum(Decimal(stats.maximum())); |
249 | _stats.setSum(Decimal(stats.sum())); |
250 | } |
251 | } |
252 | |
253 | DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl |
254 | (const proto::ColumnStatistics& pb){ |
255 | _stats.setNumberOfValues(pb.numberofvalues()); |
256 | _stats.setHasNull(pb.hasnull()); |
257 | if (!pb.has_doublestatistics()) { |
258 | _stats.setMinimum(0); |
259 | _stats.setMaximum(0); |
260 | _stats.setSum(0); |
261 | }else{ |
262 | const proto::DoubleStatistics& stats = pb.doublestatistics(); |
263 | _stats.setHasMinimum(stats.has_minimum()); |
264 | _stats.setHasMaximum(stats.has_maximum()); |
265 | _stats.setHasSum(stats.has_sum()); |
266 | |
267 | _stats.setMinimum(stats.minimum()); |
268 | _stats.setMaximum(stats.maximum()); |
269 | _stats.setSum(stats.sum()); |
270 | } |
271 | } |
272 | |
273 | IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl |
274 | (const proto::ColumnStatistics& pb){ |
275 | _stats.setNumberOfValues(pb.numberofvalues()); |
276 | _stats.setHasNull(pb.hasnull()); |
277 | if (!pb.has_intstatistics()) { |
278 | _stats.setMinimum(0); |
279 | _stats.setMaximum(0); |
280 | _stats.setSum(0); |
281 | }else{ |
282 | const proto::IntegerStatistics& stats = pb.intstatistics(); |
283 | _stats.setHasMinimum(stats.has_minimum()); |
284 | _stats.setHasMaximum(stats.has_maximum()); |
285 | _stats.setHasSum(stats.has_sum()); |
286 | |
287 | _stats.setMinimum(stats.minimum()); |
288 | _stats.setMaximum(stats.maximum()); |
289 | _stats.setSum(stats.sum()); |
290 | } |
291 | } |
292 | |
293 | StringColumnStatisticsImpl::StringColumnStatisticsImpl |
294 | (const proto::ColumnStatistics& pb, const StatContext& statContext){ |
295 | _stats.setNumberOfValues(pb.numberofvalues()); |
296 | _stats.setHasNull(pb.hasnull()); |
297 | if (!pb.has_stringstatistics() || !statContext.correctStats) { |
298 | _stats.setTotalLength(0); |
299 | }else{ |
300 | const proto::StringStatistics& stats = pb.stringstatistics(); |
301 | _stats.setHasMinimum(stats.has_minimum()); |
302 | _stats.setHasMaximum(stats.has_maximum()); |
303 | _stats.setHasTotalLength(stats.has_sum()); |
304 | |
305 | _stats.setMinimum(stats.minimum()); |
306 | _stats.setMaximum(stats.maximum()); |
307 | _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); |
308 | } |
309 | } |
310 | |
311 | TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl |
312 | (const proto::ColumnStatistics& pb, const StatContext& statContext) { |
313 | _stats.setNumberOfValues(pb.numberofvalues()); |
314 | _stats.setHasNull(pb.hasnull()); |
315 | if (!pb.has_timestampstatistics() || !statContext.correctStats) { |
316 | _stats.setMinimum(0); |
317 | _stats.setMaximum(0); |
318 | _lowerBound = 0; |
319 | _upperBound = 0; |
320 | }else{ |
321 | const proto::TimestampStatistics& stats = pb.timestampstatistics(); |
322 | _stats.setHasMinimum( |
323 | stats.has_minimumutc() || |
324 | (stats.has_minimum() && (statContext.writerTimezone != nullptr))); |
325 | _stats.setHasMaximum( |
326 | stats.has_maximumutc() || |
327 | (stats.has_maximum() && (statContext.writerTimezone != nullptr))); |
328 | _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); |
329 | _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); |
330 | |
331 | // Timestamp stats are stored in milliseconds |
332 | if (stats.has_minimumutc()) { |
333 | int64_t minimum = stats.minimumutc(); |
334 | _stats.setMinimum(minimum); |
335 | _lowerBound = minimum; |
336 | } else if (statContext.writerTimezone) { |
337 | int64_t writerTimeSec = stats.minimum() / 1000; |
338 | // multiply the offset by 1000 to convert to millisecond |
339 | int64_t minimum = |
340 | stats.minimum() + |
341 | (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) |
342 | * 1000; |
343 | _stats.setMinimum(minimum); |
344 | _lowerBound = minimum; |
345 | } else { |
346 | _stats.setMinimum(0); |
347 | // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown |
348 | // TZ and daylight savings |
349 | _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); |
350 | } |
351 | |
352 | // Timestamp stats are stored in milliseconds |
353 | if (stats.has_maximumutc()) { |
354 | int64_t maximum = stats.maximumutc(); |
355 | _stats.setMaximum(maximum); |
356 | _upperBound = maximum; |
357 | } else if (statContext.writerTimezone) { |
358 | int64_t writerTimeSec = stats.maximum() / 1000; |
359 | // multiply the offset by 1000 to convert to millisecond |
360 | int64_t maximum = stats.maximum() + |
361 | (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) |
362 | * 1000; |
363 | _stats.setMaximum(maximum); |
364 | _upperBound = maximum; |
365 | } else { |
366 | _stats.setMaximum(0); |
367 | // add 1 day 1 hour (25 hours) in milliseconds to handle unknown |
368 | // TZ and daylight savings |
369 | _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); |
370 | } |
371 | // Add 1 millisecond to account for microsecond precision of values |
372 | _upperBound += 1; |
373 | } |
374 | } |
375 | |
376 | std::unique_ptr<MutableColumnStatistics> createColumnStatistics( |
377 | const Type& type) { |
378 | switch (static_cast<int64_t>(type.getKind())) { |
379 | case BOOLEAN: |
380 | return std::unique_ptr<MutableColumnStatistics>( |
381 | new BooleanColumnStatisticsImpl()); |
382 | case BYTE: |
383 | case INT: |
384 | case LONG: |
385 | case SHORT: |
386 | return std::unique_ptr<MutableColumnStatistics>( |
387 | new IntegerColumnStatisticsImpl()); |
388 | case STRUCT: |
389 | case MAP: |
390 | case LIST: |
391 | case UNION: |
392 | return std::unique_ptr<MutableColumnStatistics>( |
393 | new ColumnStatisticsImpl()); |
394 | case FLOAT: |
395 | case DOUBLE: |
396 | return std::unique_ptr<MutableColumnStatistics>( |
397 | new DoubleColumnStatisticsImpl()); |
398 | case BINARY: |
399 | return std::unique_ptr<MutableColumnStatistics>( |
400 | new BinaryColumnStatisticsImpl()); |
401 | case STRING: |
402 | case CHAR: |
403 | case VARCHAR: |
404 | return std::unique_ptr<MutableColumnStatistics>( |
405 | new StringColumnStatisticsImpl()); |
406 | case DATE: |
407 | return std::unique_ptr<MutableColumnStatistics>( |
408 | new DateColumnStatisticsImpl()); |
409 | case TIMESTAMP: |
410 | return std::unique_ptr<MutableColumnStatistics>( |
411 | new TimestampColumnStatisticsImpl()); |
412 | case DECIMAL: |
413 | return std::unique_ptr<MutableColumnStatistics>( |
414 | new DecimalColumnStatisticsImpl()); |
415 | default: |
416 | throw NotImplementedYet("Not supported type: " + type.toString()); |
417 | } |
418 | } |
419 | |
420 | }// namespace |
421 | |