1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
// This header provides test-only helpers for typed Parquet tests: InitValues
// specializations that fill input vectors with generated data, a helper that
// names test columns, and the PrimitiveTypedTest fixture.
21 | |
22 | #pragma once |
23 | |
24 | #include <algorithm> |
25 | #include <limits> |
26 | #include <sstream> |
27 | #include <string> |
28 | #include <vector> |
29 | |
30 | #include "parquet/test-util.h" |
31 | |
32 | namespace parquet { |
33 | |
34 | namespace test { |
35 | |
36 | template <> |
37 | void inline InitValues<bool>(int num_values, vector<bool>& values, |
38 | vector<uint8_t>& buffer) { |
39 | values = flip_coins(num_values, 0); |
40 | } |
41 | |
42 | template <> |
43 | void inline InitValues<ByteArray>(int num_values, vector<ByteArray>& values, |
44 | vector<uint8_t>& buffer) { |
45 | int max_byte_array_len = 12; |
46 | int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t)); |
47 | size_t nbytes = num_values * num_bytes; |
48 | buffer.resize(nbytes); |
49 | random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len); |
50 | } |
51 | |
52 | void inline InitWideByteArrayValues(int num_values, vector<ByteArray>& values, |
53 | vector<uint8_t>& buffer, int min_len, int max_len) { |
54 | int num_bytes = static_cast<int>(max_len + sizeof(uint32_t)); |
55 | size_t nbytes = num_values * num_bytes; |
56 | buffer.resize(nbytes); |
57 | random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len); |
58 | } |
59 | |
60 | template <> |
61 | void inline InitValues<FLBA>(int num_values, vector<FLBA>& values, |
62 | vector<uint8_t>& buffer) { |
63 | size_t nbytes = num_values * FLBA_LENGTH; |
64 | buffer.resize(nbytes); |
65 | random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data()); |
66 | } |
67 | |
68 | template <> |
69 | void inline InitValues<Int96>(int num_values, vector<Int96>& values, |
70 | vector<uint8_t>& buffer) { |
71 | random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(), |
72 | std::numeric_limits<int32_t>::max(), values.data()); |
73 | } |
74 | |
// Returns the name used for the i-th test column: "column_" followed by the
// decimal rendering of `i` (for example "column_0").
inline std::string TestColumnName(int i) {
  std::ostringstream name_builder;
  name_builder << "column_";
  name_builder << i;
  return name_builder.str();
}
80 | |
81 | // This class lives here because of its dependency on the InitValues specializations. |
82 | template <typename TestType> |
83 | class PrimitiveTypedTest : public ::testing::Test { |
84 | public: |
85 | typedef typename TestType::c_type T; |
86 | |
87 | void SetUpSchema(Repetition::type repetition, int num_columns = 1) { |
88 | std::vector<schema::NodePtr> fields; |
89 | |
90 | for (int i = 0; i < num_columns; ++i) { |
91 | std::string name = TestColumnName(i); |
92 | fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num, |
93 | LogicalType::NONE, FLBA_LENGTH)); |
94 | } |
95 | node_ = schema::GroupNode::Make("schema" , Repetition::REQUIRED, fields); |
96 | schema_.Init(node_); |
97 | } |
98 | |
99 | void GenerateData(int64_t num_values); |
100 | void SetupValuesOut(int64_t num_values); |
101 | void SyncValuesOut(); |
102 | |
103 | protected: |
104 | schema::NodePtr node_; |
105 | SchemaDescriptor schema_; |
106 | |
107 | // Input buffers |
108 | std::vector<T> values_; |
109 | |
110 | std::vector<int16_t> def_levels_; |
111 | |
112 | std::vector<uint8_t> buffer_; |
113 | // Pointer to the values, needed as we cannot use vector<bool>::data() |
114 | T* values_ptr_; |
115 | std::vector<uint8_t> bool_buffer_; |
116 | |
117 | // Output buffers |
118 | std::vector<T> values_out_; |
119 | std::vector<uint8_t> bool_buffer_out_; |
120 | T* values_out_ptr_; |
121 | }; |
122 | |
123 | template <typename TestType> |
124 | void PrimitiveTypedTest<TestType>::SyncValuesOut() {} |
125 | |
126 | template <> |
127 | void PrimitiveTypedTest<BooleanType>::SyncValuesOut() { |
128 | std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin(); |
129 | std::vector<T>::iterator destination_iterator = values_out_.begin(); |
130 | while (source_iterator != bool_buffer_out_.end()) { |
131 | *destination_iterator++ = *source_iterator++ != 0; |
132 | } |
133 | } |
134 | |
135 | template <typename TestType> |
136 | void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) { |
137 | values_out_.clear(); |
138 | values_out_.resize(num_values); |
139 | values_out_ptr_ = values_out_.data(); |
140 | } |
141 | |
142 | template <> |
143 | void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) { |
144 | values_out_.clear(); |
145 | values_out_.resize(num_values); |
146 | |
147 | bool_buffer_out_.clear(); |
148 | bool_buffer_out_.resize(num_values); |
149 | // Write once to all values so we can copy it without getting Valgrind errors |
150 | // about uninitialised values. |
151 | std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true); |
152 | values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data()); |
153 | } |
154 | |
155 | template <typename TestType> |
156 | void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values) { |
157 | def_levels_.resize(num_values); |
158 | values_.resize(num_values); |
159 | |
160 | InitValues<T>(static_cast<int>(num_values), values_, buffer_); |
161 | values_ptr_ = values_.data(); |
162 | |
163 | std::fill(def_levels_.begin(), def_levels_.end(), 1); |
164 | } |
165 | |
166 | template <> |
167 | void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values) { |
168 | def_levels_.resize(num_values); |
169 | values_.resize(num_values); |
170 | |
171 | InitValues<T>(static_cast<int>(num_values), values_, buffer_); |
172 | bool_buffer_.resize(num_values); |
173 | std::copy(values_.begin(), values_.end(), bool_buffer_.begin()); |
174 | values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data()); |
175 | |
176 | std::fill(def_levels_.begin(), def_levels_.end(), 1); |
177 | } |
178 | } // namespace test |
179 | |
180 | } // namespace parquet |
181 | |