1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "parquet/metadata.h" |
19 | |
20 | #include <gtest/gtest.h> |
21 | |
22 | #include "parquet/schema.h" |
23 | #include "parquet/statistics.h" |
24 | #include "parquet/thrift.h" |
25 | #include "parquet/types.h" |
26 | |
27 | namespace parquet { |
28 | |
29 | namespace metadata { |
30 | |
31 | TEST(Metadata, TestBuildAccess) { |
32 | parquet::schema::NodeVector fields; |
33 | parquet::schema::NodePtr root; |
34 | parquet::SchemaDescriptor schema; |
35 | |
36 | WriterProperties::Builder prop_builder; |
37 | |
38 | std::shared_ptr<WriterProperties> props = |
39 | prop_builder.version(ParquetVersion::PARQUET_2_0)->build(); |
40 | |
41 | fields.push_back(parquet::schema::Int32("int_col" , Repetition::REQUIRED)); |
42 | fields.push_back(parquet::schema::Float("float_col" , Repetition::REQUIRED)); |
43 | root = parquet::schema::GroupNode::Make("schema" , Repetition::REPEATED, fields); |
44 | schema.Init(root); |
45 | |
46 | int64_t nrows = 1000; |
47 | int32_t int_min = 100, int_max = 200; |
48 | EncodedStatistics stats_int; |
49 | stats_int.set_null_count(0) |
50 | .set_distinct_count(nrows) |
51 | .set_min(std::string(reinterpret_cast<const char*>(&int_min), 4)) |
52 | .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4)); |
53 | EncodedStatistics stats_float; |
54 | float float_min = 100.100f, float_max = 200.200f; |
55 | stats_float.set_null_count(0) |
56 | .set_distinct_count(nrows) |
57 | .set_min(std::string(reinterpret_cast<const char*>(&float_min), 4)) |
58 | .set_max(std::string(reinterpret_cast<const char*>(&float_max), 4)); |
59 | |
60 | auto f_builder = FileMetaDataBuilder::Make(&schema, props); |
61 | auto rg1_builder = f_builder->AppendRowGroup(); |
62 | |
63 | // Write the metadata |
64 | // rowgroup1 metadata |
65 | auto col1_builder = rg1_builder->NextColumnChunk(); |
66 | auto col2_builder = rg1_builder->NextColumnChunk(); |
67 | // column metadata |
68 | col1_builder->SetStatistics(true, stats_int); |
69 | col2_builder->SetStatistics(true, stats_float); |
70 | col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false); |
71 | col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false); |
72 | |
73 | rg1_builder->set_num_rows(nrows / 2); |
74 | rg1_builder->Finish(1024); |
75 | |
76 | // rowgroup2 metadata |
77 | auto rg2_builder = f_builder->AppendRowGroup(); |
78 | col1_builder = rg2_builder->NextColumnChunk(); |
79 | col2_builder = rg2_builder->NextColumnChunk(); |
80 | // column metadata |
81 | col1_builder->SetStatistics(true, stats_int); |
82 | col2_builder->SetStatistics(true, stats_float); |
83 | col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false); |
84 | col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false); |
85 | |
86 | rg2_builder->set_num_rows(nrows / 2); |
87 | rg2_builder->Finish(1024); |
88 | |
89 | // Read the metadata |
90 | auto f_accessor = f_builder->Finish(); |
91 | |
92 | // file metadata |
93 | ASSERT_EQ(nrows, f_accessor->num_rows()); |
94 | ASSERT_LE(0, static_cast<int>(f_accessor->size())); |
95 | ASSERT_EQ(2, f_accessor->num_row_groups()); |
96 | ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version()); |
97 | ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); |
98 | ASSERT_EQ(3, f_accessor->num_schema_elements()); |
99 | |
100 | // row group1 metadata |
101 | auto rg1_accessor = f_accessor->RowGroup(0); |
102 | ASSERT_EQ(2, rg1_accessor->num_columns()); |
103 | ASSERT_EQ(nrows / 2, rg1_accessor->num_rows()); |
104 | ASSERT_EQ(1024, rg1_accessor->total_byte_size()); |
105 | |
106 | auto rg1_column1 = rg1_accessor->ColumnChunk(0); |
107 | auto rg1_column2 = rg1_accessor->ColumnChunk(1); |
108 | ASSERT_EQ(true, rg1_column1->is_stats_set()); |
109 | ASSERT_EQ(true, rg1_column2->is_stats_set()); |
110 | ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin()); |
111 | ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax()); |
112 | ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); |
113 | ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); |
114 | ASSERT_EQ(0, rg1_column1->statistics()->null_count()); |
115 | ASSERT_EQ(0, rg1_column2->statistics()->null_count()); |
116 | ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count()); |
117 | ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count()); |
118 | ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression()); |
119 | ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression()); |
120 | ASSERT_EQ(nrows / 2, rg1_column1->num_values()); |
121 | ASSERT_EQ(nrows / 2, rg1_column2->num_values()); |
122 | ASSERT_EQ(3, rg1_column1->encodings().size()); |
123 | ASSERT_EQ(3, rg1_column2->encodings().size()); |
124 | ASSERT_EQ(512, rg1_column1->total_compressed_size()); |
125 | ASSERT_EQ(512, rg1_column2->total_compressed_size()); |
126 | ASSERT_EQ(600, rg1_column1->total_uncompressed_size()); |
127 | ASSERT_EQ(600, rg1_column2->total_uncompressed_size()); |
128 | ASSERT_EQ(4, rg1_column1->dictionary_page_offset()); |
129 | ASSERT_EQ(24, rg1_column2->dictionary_page_offset()); |
130 | ASSERT_EQ(10, rg1_column1->data_page_offset()); |
131 | ASSERT_EQ(30, rg1_column2->data_page_offset()); |
132 | |
133 | auto rg2_accessor = f_accessor->RowGroup(1); |
134 | ASSERT_EQ(2, rg2_accessor->num_columns()); |
135 | ASSERT_EQ(nrows / 2, rg2_accessor->num_rows()); |
136 | ASSERT_EQ(1024, rg2_accessor->total_byte_size()); |
137 | |
138 | auto rg2_column1 = rg2_accessor->ColumnChunk(0); |
139 | auto rg2_column2 = rg2_accessor->ColumnChunk(1); |
140 | ASSERT_EQ(true, rg2_column1->is_stats_set()); |
141 | ASSERT_EQ(true, rg2_column2->is_stats_set()); |
142 | ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin()); |
143 | ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax()); |
144 | ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin()); |
145 | ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax()); |
146 | ASSERT_EQ(0, rg2_column1->statistics()->null_count()); |
147 | ASSERT_EQ(0, rg2_column2->statistics()->null_count()); |
148 | ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count()); |
149 | ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count()); |
150 | ASSERT_EQ(nrows / 2, rg2_column1->num_values()); |
151 | ASSERT_EQ(nrows / 2, rg2_column2->num_values()); |
152 | ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression()); |
153 | ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression()); |
154 | ASSERT_EQ(3, rg2_column1->encodings().size()); |
155 | ASSERT_EQ(3, rg2_column2->encodings().size()); |
156 | ASSERT_EQ(512, rg2_column1->total_compressed_size()); |
157 | ASSERT_EQ(512, rg2_column2->total_compressed_size()); |
158 | ASSERT_EQ(600, rg2_column1->total_uncompressed_size()); |
159 | ASSERT_EQ(600, rg2_column2->total_uncompressed_size()); |
160 | ASSERT_EQ(6, rg2_column1->dictionary_page_offset()); |
161 | ASSERT_EQ(16, rg2_column2->dictionary_page_offset()); |
162 | ASSERT_EQ(10, rg2_column1->data_page_offset()); |
163 | ASSERT_EQ(26, rg2_column2->data_page_offset()); |
164 | } |
165 | |
166 | TEST(Metadata, TestV1Version) { |
167 | // PARQUET-839 |
168 | parquet::schema::NodeVector fields; |
169 | parquet::schema::NodePtr root; |
170 | parquet::SchemaDescriptor schema; |
171 | |
172 | WriterProperties::Builder prop_builder; |
173 | |
174 | std::shared_ptr<WriterProperties> props = |
175 | prop_builder.version(ParquetVersion::PARQUET_1_0)->build(); |
176 | |
177 | fields.push_back(parquet::schema::Int32("int_col" , Repetition::REQUIRED)); |
178 | fields.push_back(parquet::schema::Float("float_col" , Repetition::REQUIRED)); |
179 | root = parquet::schema::GroupNode::Make("schema" , Repetition::REPEATED, fields); |
180 | schema.Init(root); |
181 | |
182 | auto f_builder = FileMetaDataBuilder::Make(&schema, props); |
183 | |
184 | // Read the metadata |
185 | auto f_accessor = f_builder->Finish(); |
186 | |
187 | // file metadata |
188 | ASSERT_EQ(ParquetVersion::PARQUET_1_0, f_accessor->version()); |
189 | } |
190 | |
191 | TEST(ApplicationVersion, Basics) { |
192 | ApplicationVersion version("parquet-mr version 1.7.9" ); |
193 | ApplicationVersion version1("parquet-mr version 1.8.0" ); |
194 | ApplicationVersion version2("parquet-cpp version 1.0.0" ); |
195 | ApplicationVersion version3("" ); |
196 | ApplicationVersion version4("parquet-mr version 1.5.0ab-cdh5.5.0+cd (build abcd)" ); |
197 | ApplicationVersion version5("parquet-mr" ); |
198 | |
199 | ASSERT_EQ("parquet-mr" , version.application_); |
200 | ASSERT_EQ(1, version.version.major); |
201 | ASSERT_EQ(7, version.version.minor); |
202 | ASSERT_EQ(9, version.version.patch); |
203 | |
204 | ASSERT_EQ("parquet-cpp" , version2.application_); |
205 | ASSERT_EQ(1, version2.version.major); |
206 | ASSERT_EQ(0, version2.version.minor); |
207 | ASSERT_EQ(0, version2.version.patch); |
208 | |
209 | ASSERT_EQ("parquet-mr" , version4.application_); |
210 | ASSERT_EQ("abcd" , version4.build_); |
211 | ASSERT_EQ(1, version4.version.major); |
212 | ASSERT_EQ(5, version4.version.minor); |
213 | ASSERT_EQ(0, version4.version.patch); |
214 | ASSERT_EQ("ab" , version4.version.unknown); |
215 | ASSERT_EQ("cdh5.5.0" , version4.version.pre_release); |
216 | ASSERT_EQ("cd" , version4.version.build_info); |
217 | |
218 | ASSERT_EQ("parquet-mr" , version5.application_); |
219 | ASSERT_EQ(0, version5.version.major); |
220 | ASSERT_EQ(0, version5.version.minor); |
221 | ASSERT_EQ(0, version5.version.patch); |
222 | |
223 | ASSERT_EQ(true, version.VersionLt(version1)); |
224 | |
225 | EncodedStatistics stats; |
226 | ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, stats, SortOrder::UNKNOWN)); |
227 | ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, stats, SortOrder::SIGNED)); |
228 | ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED)); |
229 | ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED)); |
230 | ASSERT_FALSE( |
231 | version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::UNSIGNED)); |
232 | ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, stats, |
233 | SortOrder::SIGNED)); |
234 | |
235 | // Check that the old stats are correct if min and max are the same |
236 | // regardless of sort order |
237 | EncodedStatistics stats_str; |
238 | stats_str.set_min("a" ).set_max("b" ); |
239 | ASSERT_FALSE( |
240 | version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED)); |
241 | stats_str.set_max("a" ); |
242 | ASSERT_TRUE( |
243 | version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED)); |
244 | |
245 | // Check that the same holds true for ints |
246 | int32_t int_min = 100, int_max = 200; |
247 | EncodedStatistics stats_int; |
248 | stats_int.set_min(std::string(reinterpret_cast<const char*>(&int_min), 4)) |
249 | .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4)); |
250 | ASSERT_FALSE( |
251 | version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED)); |
252 | stats_int.set_max(std::string(reinterpret_cast<const char*>(&int_min), 4)); |
253 | ASSERT_TRUE( |
254 | version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED)); |
255 | } |
256 | |
257 | } // namespace metadata |
258 | } // namespace parquet |
259 | |