1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "parquet/metadata.h"
19
20#include <gtest/gtest.h>
21
22#include "parquet/schema.h"
23#include "parquet/statistics.h"
24#include "parquet/thrift.h"
25#include "parquet/types.h"
26
27namespace parquet {
28
29namespace metadata {
30
31TEST(Metadata, TestBuildAccess) {
32 parquet::schema::NodeVector fields;
33 parquet::schema::NodePtr root;
34 parquet::SchemaDescriptor schema;
35
36 WriterProperties::Builder prop_builder;
37
38 std::shared_ptr<WriterProperties> props =
39 prop_builder.version(ParquetVersion::PARQUET_2_0)->build();
40
41 fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
42 fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
43 root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
44 schema.Init(root);
45
46 int64_t nrows = 1000;
47 int32_t int_min = 100, int_max = 200;
48 EncodedStatistics stats_int;
49 stats_int.set_null_count(0)
50 .set_distinct_count(nrows)
51 .set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
52 .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
53 EncodedStatistics stats_float;
54 float float_min = 100.100f, float_max = 200.200f;
55 stats_float.set_null_count(0)
56 .set_distinct_count(nrows)
57 .set_min(std::string(reinterpret_cast<const char*>(&float_min), 4))
58 .set_max(std::string(reinterpret_cast<const char*>(&float_max), 4));
59
60 auto f_builder = FileMetaDataBuilder::Make(&schema, props);
61 auto rg1_builder = f_builder->AppendRowGroup();
62
63 // Write the metadata
64 // rowgroup1 metadata
65 auto col1_builder = rg1_builder->NextColumnChunk();
66 auto col2_builder = rg1_builder->NextColumnChunk();
67 // column metadata
68 col1_builder->SetStatistics(true, stats_int);
69 col2_builder->SetStatistics(true, stats_float);
70 col1_builder->Finish(nrows / 2, 4, 0, 10, 512, 600, true, false);
71 col2_builder->Finish(nrows / 2, 24, 0, 30, 512, 600, true, false);
72
73 rg1_builder->set_num_rows(nrows / 2);
74 rg1_builder->Finish(1024);
75
76 // rowgroup2 metadata
77 auto rg2_builder = f_builder->AppendRowGroup();
78 col1_builder = rg2_builder->NextColumnChunk();
79 col2_builder = rg2_builder->NextColumnChunk();
80 // column metadata
81 col1_builder->SetStatistics(true, stats_int);
82 col2_builder->SetStatistics(true, stats_float);
83 col1_builder->Finish(nrows / 2, 6, 0, 10, 512, 600, true, false);
84 col2_builder->Finish(nrows / 2, 16, 0, 26, 512, 600, true, false);
85
86 rg2_builder->set_num_rows(nrows / 2);
87 rg2_builder->Finish(1024);
88
89 // Read the metadata
90 auto f_accessor = f_builder->Finish();
91
92 // file metadata
93 ASSERT_EQ(nrows, f_accessor->num_rows());
94 ASSERT_LE(0, static_cast<int>(f_accessor->size()));
95 ASSERT_EQ(2, f_accessor->num_row_groups());
96 ASSERT_EQ(ParquetVersion::PARQUET_2_0, f_accessor->version());
97 ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by());
98 ASSERT_EQ(3, f_accessor->num_schema_elements());
99
100 // row group1 metadata
101 auto rg1_accessor = f_accessor->RowGroup(0);
102 ASSERT_EQ(2, rg1_accessor->num_columns());
103 ASSERT_EQ(nrows / 2, rg1_accessor->num_rows());
104 ASSERT_EQ(1024, rg1_accessor->total_byte_size());
105
106 auto rg1_column1 = rg1_accessor->ColumnChunk(0);
107 auto rg1_column2 = rg1_accessor->ColumnChunk(1);
108 ASSERT_EQ(true, rg1_column1->is_stats_set());
109 ASSERT_EQ(true, rg1_column2->is_stats_set());
110 ASSERT_EQ(stats_float.min(), rg1_column2->statistics()->EncodeMin());
111 ASSERT_EQ(stats_float.max(), rg1_column2->statistics()->EncodeMax());
112 ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
113 ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
114 ASSERT_EQ(0, rg1_column1->statistics()->null_count());
115 ASSERT_EQ(0, rg1_column2->statistics()->null_count());
116 ASSERT_EQ(nrows, rg1_column1->statistics()->distinct_count());
117 ASSERT_EQ(nrows, rg1_column2->statistics()->distinct_count());
118 ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column1->compression());
119 ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg1_column2->compression());
120 ASSERT_EQ(nrows / 2, rg1_column1->num_values());
121 ASSERT_EQ(nrows / 2, rg1_column2->num_values());
122 ASSERT_EQ(3, rg1_column1->encodings().size());
123 ASSERT_EQ(3, rg1_column2->encodings().size());
124 ASSERT_EQ(512, rg1_column1->total_compressed_size());
125 ASSERT_EQ(512, rg1_column2->total_compressed_size());
126 ASSERT_EQ(600, rg1_column1->total_uncompressed_size());
127 ASSERT_EQ(600, rg1_column2->total_uncompressed_size());
128 ASSERT_EQ(4, rg1_column1->dictionary_page_offset());
129 ASSERT_EQ(24, rg1_column2->dictionary_page_offset());
130 ASSERT_EQ(10, rg1_column1->data_page_offset());
131 ASSERT_EQ(30, rg1_column2->data_page_offset());
132
133 auto rg2_accessor = f_accessor->RowGroup(1);
134 ASSERT_EQ(2, rg2_accessor->num_columns());
135 ASSERT_EQ(nrows / 2, rg2_accessor->num_rows());
136 ASSERT_EQ(1024, rg2_accessor->total_byte_size());
137
138 auto rg2_column1 = rg2_accessor->ColumnChunk(0);
139 auto rg2_column2 = rg2_accessor->ColumnChunk(1);
140 ASSERT_EQ(true, rg2_column1->is_stats_set());
141 ASSERT_EQ(true, rg2_column2->is_stats_set());
142 ASSERT_EQ(stats_float.min(), rg2_column2->statistics()->EncodeMin());
143 ASSERT_EQ(stats_float.max(), rg2_column2->statistics()->EncodeMax());
144 ASSERT_EQ(stats_int.min(), rg1_column1->statistics()->EncodeMin());
145 ASSERT_EQ(stats_int.max(), rg1_column1->statistics()->EncodeMax());
146 ASSERT_EQ(0, rg2_column1->statistics()->null_count());
147 ASSERT_EQ(0, rg2_column2->statistics()->null_count());
148 ASSERT_EQ(nrows, rg2_column1->statistics()->distinct_count());
149 ASSERT_EQ(nrows, rg2_column2->statistics()->distinct_count());
150 ASSERT_EQ(nrows / 2, rg2_column1->num_values());
151 ASSERT_EQ(nrows / 2, rg2_column2->num_values());
152 ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column1->compression());
153 ASSERT_EQ(DEFAULT_COMPRESSION_TYPE, rg2_column2->compression());
154 ASSERT_EQ(3, rg2_column1->encodings().size());
155 ASSERT_EQ(3, rg2_column2->encodings().size());
156 ASSERT_EQ(512, rg2_column1->total_compressed_size());
157 ASSERT_EQ(512, rg2_column2->total_compressed_size());
158 ASSERT_EQ(600, rg2_column1->total_uncompressed_size());
159 ASSERT_EQ(600, rg2_column2->total_uncompressed_size());
160 ASSERT_EQ(6, rg2_column1->dictionary_page_offset());
161 ASSERT_EQ(16, rg2_column2->dictionary_page_offset());
162 ASSERT_EQ(10, rg2_column1->data_page_offset());
163 ASSERT_EQ(26, rg2_column2->data_page_offset());
164}
165
166TEST(Metadata, TestV1Version) {
167 // PARQUET-839
168 parquet::schema::NodeVector fields;
169 parquet::schema::NodePtr root;
170 parquet::SchemaDescriptor schema;
171
172 WriterProperties::Builder prop_builder;
173
174 std::shared_ptr<WriterProperties> props =
175 prop_builder.version(ParquetVersion::PARQUET_1_0)->build();
176
177 fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
178 fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
179 root = parquet::schema::GroupNode::Make("schema", Repetition::REPEATED, fields);
180 schema.Init(root);
181
182 auto f_builder = FileMetaDataBuilder::Make(&schema, props);
183
184 // Read the metadata
185 auto f_accessor = f_builder->Finish();
186
187 // file metadata
188 ASSERT_EQ(ParquetVersion::PARQUET_1_0, f_accessor->version());
189}
190
191TEST(ApplicationVersion, Basics) {
192 ApplicationVersion version("parquet-mr version 1.7.9");
193 ApplicationVersion version1("parquet-mr version 1.8.0");
194 ApplicationVersion version2("parquet-cpp version 1.0.0");
195 ApplicationVersion version3("");
196 ApplicationVersion version4("parquet-mr version 1.5.0ab-cdh5.5.0+cd (build abcd)");
197 ApplicationVersion version5("parquet-mr");
198
199 ASSERT_EQ("parquet-mr", version.application_);
200 ASSERT_EQ(1, version.version.major);
201 ASSERT_EQ(7, version.version.minor);
202 ASSERT_EQ(9, version.version.patch);
203
204 ASSERT_EQ("parquet-cpp", version2.application_);
205 ASSERT_EQ(1, version2.version.major);
206 ASSERT_EQ(0, version2.version.minor);
207 ASSERT_EQ(0, version2.version.patch);
208
209 ASSERT_EQ("parquet-mr", version4.application_);
210 ASSERT_EQ("abcd", version4.build_);
211 ASSERT_EQ(1, version4.version.major);
212 ASSERT_EQ(5, version4.version.minor);
213 ASSERT_EQ(0, version4.version.patch);
214 ASSERT_EQ("ab", version4.version.unknown);
215 ASSERT_EQ("cdh5.5.0", version4.version.pre_release);
216 ASSERT_EQ("cd", version4.version.build_info);
217
218 ASSERT_EQ("parquet-mr", version5.application_);
219 ASSERT_EQ(0, version5.version.major);
220 ASSERT_EQ(0, version5.version.minor);
221 ASSERT_EQ(0, version5.version.patch);
222
223 ASSERT_EQ(true, version.VersionLt(version1));
224
225 EncodedStatistics stats;
226 ASSERT_FALSE(version1.HasCorrectStatistics(Type::INT96, stats, SortOrder::UNKNOWN));
227 ASSERT_TRUE(version.HasCorrectStatistics(Type::INT32, stats, SortOrder::SIGNED));
228 ASSERT_FALSE(version.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
229 ASSERT_TRUE(version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
230 ASSERT_FALSE(
231 version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::UNSIGNED));
232 ASSERT_TRUE(version3.HasCorrectStatistics(Type::FIXED_LEN_BYTE_ARRAY, stats,
233 SortOrder::SIGNED));
234
235 // Check that the old stats are correct if min and max are the same
236 // regardless of sort order
237 EncodedStatistics stats_str;
238 stats_str.set_min("a").set_max("b");
239 ASSERT_FALSE(
240 version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED));
241 stats_str.set_max("a");
242 ASSERT_TRUE(
243 version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_str, SortOrder::UNSIGNED));
244
245 // Check that the same holds true for ints
246 int32_t int_min = 100, int_max = 200;
247 EncodedStatistics stats_int;
248 stats_int.set_min(std::string(reinterpret_cast<const char*>(&int_min), 4))
249 .set_max(std::string(reinterpret_cast<const char*>(&int_max), 4));
250 ASSERT_FALSE(
251 version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
252 stats_int.set_max(std::string(reinterpret_cast<const char*>(&int_min), 4));
253 ASSERT_TRUE(
254 version1.HasCorrectStatistics(Type::BYTE_ARRAY, stats_int, SortOrder::UNSIGNED));
255}
256
257} // namespace metadata
258} // namespace parquet
259