1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "parquet/printer.h" |
19 | |
20 | #include <cstdint> |
21 | #include <cstdio> |
22 | #include <memory> |
23 | #include <ostream> |
24 | #include <string> |
25 | #include <vector> |
26 | |
27 | #include "arrow/util/key_value_metadata.h" |
28 | |
29 | #include "parquet/column_scanner.h" |
30 | #include "parquet/exception.h" |
31 | #include "parquet/file_reader.h" |
32 | #include "parquet/metadata.h" |
33 | #include "parquet/schema.h" |
34 | #include "parquet/statistics.h" |
35 | #include "parquet/types.h" |
36 | |
37 | namespace parquet { |
38 | |
39 | class ColumnReader; |
40 | |
41 | // ---------------------------------------------------------------------- |
42 | // ParquetFilePrinter::DebugPrint |
43 | |
// Fixed display width (in characters) of each column in the tabular value
// dump; the initial size is just an example. A typed constant instead of a
// macro keeps it scoped to this namespace and visible to the debugger.
constexpr int COL_WIDTH = 30;
46 | |
47 | void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, |
48 | bool print_values, bool format_dump, |
49 | bool print_key_value_metadata, const char* filename) { |
50 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
51 | |
52 | stream << "File Name: " << filename << "\n" ; |
53 | stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n" ; |
54 | stream << "Created By: " << file_metadata->created_by() << "\n" ; |
55 | stream << "Total rows: " << file_metadata->num_rows() << "\n" ; |
56 | |
57 | if (print_key_value_metadata && file_metadata->key_value_metadata()) { |
58 | auto key_value_metadata = file_metadata->key_value_metadata(); |
59 | int64_t size_of_key_value_metadata = key_value_metadata->size(); |
60 | stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n" ; |
61 | for (int64_t i = 0; i < size_of_key_value_metadata; i++) { |
62 | stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " |
63 | << key_value_metadata->value(i) << "\n" ; |
64 | } |
65 | } |
66 | |
67 | stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n" ; |
68 | stream << "Number of Real Columns: " |
69 | << file_metadata->schema()->group_node()->field_count() << "\n" ; |
70 | |
71 | if (selected_columns.size() == 0) { |
72 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
73 | selected_columns.push_back(i); |
74 | } |
75 | } else { |
76 | for (auto i : selected_columns) { |
77 | if (i < 0 || i >= file_metadata->num_columns()) { |
78 | throw ParquetException("Selected column is out of range" ); |
79 | } |
80 | } |
81 | } |
82 | |
83 | stream << "Number of Columns: " << file_metadata->num_columns() << "\n" ; |
84 | stream << "Number of Selected Columns: " << selected_columns.size() << "\n" ; |
85 | for (auto i : selected_columns) { |
86 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
87 | stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" |
88 | << TypeToString(descr->physical_type()); |
89 | if (descr->converted_type() != ConvertedType::NONE) { |
90 | stream << "/" << ConvertedTypeToString(descr->converted_type()); |
91 | } |
92 | if (descr->converted_type() == ConvertedType::DECIMAL) { |
93 | stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")" ; |
94 | } |
95 | stream << ")" << std::endl; |
96 | } |
97 | |
98 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
99 | stream << "--- Row Group: " << r << " ---\n" ; |
100 | |
101 | auto group_reader = fileReader->RowGroup(r); |
102 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
103 | |
104 | stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n" ; |
105 | stream << "--- Rows: " << group_metadata->num_rows() << " ---\n" ; |
106 | |
107 | // Print column metadata |
108 | for (auto i : selected_columns) { |
109 | auto column_chunk = group_metadata->ColumnChunk(i); |
110 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
111 | |
112 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
113 | stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); |
114 | if (column_chunk->is_stats_set()) { |
115 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
116 | stream << ", Null Values: " << stats->null_count() |
117 | << ", Distinct Values: " << stats->distinct_count() << std::endl |
118 | << " Max: " << FormatStatValue(descr->physical_type(), max) |
119 | << ", Min: " << FormatStatValue(descr->physical_type(), min); |
120 | } else { |
121 | stream << " Statistics Not Set" ; |
122 | } |
123 | stream << std::endl |
124 | << " Compression: " << Codec::GetCodecAsString(column_chunk->compression()) |
125 | << ", Encodings:" ; |
126 | for (auto encoding : column_chunk->encodings()) { |
127 | stream << " " << EncodingToString(encoding); |
128 | } |
129 | stream << std::endl |
130 | << " Uncompressed Size: " << column_chunk->total_uncompressed_size() |
131 | << ", Compressed Size: " << column_chunk->total_compressed_size() |
132 | << std::endl; |
133 | } |
134 | |
135 | if (!print_values) { |
136 | continue; |
137 | } |
138 | stream << "--- Values ---\n" ; |
139 | |
140 | static constexpr int bufsize = COL_WIDTH + 1; |
141 | char buffer[bufsize]; |
142 | |
143 | // Create readers for selected columns and print contents |
144 | std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); |
145 | int j = 0; |
146 | for (auto i : selected_columns) { |
147 | std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); |
148 | // This is OK in this method as long as the RowGroupReader does not get |
149 | // deleted |
150 | auto& scanner = scanners[j++] = Scanner::Make(col_reader); |
151 | |
152 | if (format_dump) { |
153 | stream << "Column " << i << std::endl; |
154 | while (scanner->HasNext()) { |
155 | scanner->PrintNext(stream, 0, true); |
156 | stream << "\n" ; |
157 | } |
158 | continue; |
159 | } |
160 | |
161 | snprintf(buffer, bufsize, "%-*s" , COL_WIDTH, |
162 | file_metadata->schema()->Column(i)->name().c_str()); |
163 | stream << buffer << '|'; |
164 | } |
165 | if (format_dump) { |
166 | continue; |
167 | } |
168 | stream << "\n" ; |
169 | |
170 | bool hasRow; |
171 | do { |
172 | hasRow = false; |
173 | for (auto scanner : scanners) { |
174 | if (scanner->HasNext()) { |
175 | hasRow = true; |
176 | scanner->PrintNext(stream, COL_WIDTH); |
177 | stream << '|'; |
178 | } |
179 | } |
180 | stream << "\n" ; |
181 | } while (hasRow); |
182 | } |
183 | } |
184 | |
185 | void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, |
186 | const char* filename) { |
187 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
188 | stream << "{\n" ; |
189 | stream << " \"FileName\": \"" << filename << "\",\n" ; |
190 | stream << " \"Version\": \"" << file_metadata->version() << "\",\n" ; |
191 | stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n" ; |
192 | stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n" ; |
193 | stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n" ; |
194 | stream << " \"NumberOfRealColumns\": \"" |
195 | << file_metadata->schema()->group_node()->field_count() << "\",\n" ; |
196 | stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n" ; |
197 | |
198 | if (selected_columns.size() == 0) { |
199 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
200 | selected_columns.push_back(i); |
201 | } |
202 | } else { |
203 | for (auto i : selected_columns) { |
204 | if (i < 0 || i >= file_metadata->num_columns()) { |
205 | throw ParquetException("Selected column is out of range" ); |
206 | } |
207 | } |
208 | } |
209 | |
210 | stream << " \"Columns\": [\n" ; |
211 | int c = 0; |
212 | for (auto i : selected_columns) { |
213 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
214 | stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\"," |
215 | << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," |
216 | << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) |
217 | << "\"," |
218 | << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }" ; |
219 | c++; |
220 | if (c != static_cast<int>(selected_columns.size())) { |
221 | stream << ",\n" ; |
222 | } |
223 | } |
224 | |
225 | stream << "\n ],\n \"RowGroups\": [\n" ; |
226 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
227 | stream << " {\n \"Id\": \"" << r << "\", " ; |
228 | |
229 | auto group_reader = fileReader->RowGroup(r); |
230 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
231 | |
232 | stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", " ; |
233 | stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n" ; |
234 | |
235 | // Print column metadata |
236 | stream << " \"ColumnChunks\": [\n" ; |
237 | int c1 = 0; |
238 | for (auto i : selected_columns) { |
239 | auto column_chunk = group_metadata->ColumnChunk(i); |
240 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
241 | |
242 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
243 | stream << " {\"Id\": \"" << i << "\", \"Values\": \"" |
244 | << column_chunk->num_values() << "\", " |
245 | << "\"StatsSet\": " ; |
246 | if (column_chunk->is_stats_set()) { |
247 | stream << "\"True\", \"Stats\": {" ; |
248 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
249 | stream << "\"NumNulls\": \"" << stats->null_count() << "\", " |
250 | << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " |
251 | << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " |
252 | << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) |
253 | << "\" }," ; |
254 | } else { |
255 | stream << "\"False\"," ; |
256 | } |
257 | stream << "\n \"Compression\": \"" |
258 | << Codec::GetCodecAsString(column_chunk->compression()) |
259 | << "\", \"Encodings\": \"" ; |
260 | for (auto encoding : column_chunk->encodings()) { |
261 | stream << EncodingToString(encoding) << " " ; |
262 | } |
263 | stream << "\", " |
264 | << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() |
265 | << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); |
266 | |
267 | // end of a ColumnChunk |
268 | stream << "\" }" ; |
269 | c1++; |
270 | if (c1 != static_cast<int>(selected_columns.size())) { |
271 | stream << ",\n" ; |
272 | } |
273 | } |
274 | |
275 | stream << "\n ]\n }" ; |
276 | if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { |
277 | stream << ",\n" ; |
278 | } |
279 | } |
280 | stream << "\n ]\n}\n" ; |
281 | } |
282 | |
283 | } // namespace parquet |
284 | |