1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "parquet/printer.h" |
19 | |
20 | #include <string> |
21 | #include <vector> |
22 | |
23 | #include "parquet/column_scanner.h" |
24 | |
25 | using std::string; |
26 | using std::vector; |
27 | |
28 | namespace parquet { |
29 | // ---------------------------------------------------------------------- |
30 | // ParquetFilePrinter::DebugPrint |
31 | |
// Field width, kept as a string literal so that DebugPrint can splice it into
// the printf-style format ("%-<COL_WIDTH>s") used for each column-name header.
// DebugPrint formats every header into a 25-byte buffer, so at most 24
// characters of the padded field are actually emitted.
#define COL_WIDTH "30"
34 | |
35 | void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, |
36 | bool print_values, bool print_key_value_metadata, |
37 | const char* filename) { |
38 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
39 | |
40 | stream << "File Name: " << filename << "\n" ; |
41 | stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n" ; |
42 | stream << "Created By: " << file_metadata->created_by() << "\n" ; |
43 | stream << "Total rows: " << file_metadata->num_rows() << "\n" ; |
44 | |
45 | if (print_key_value_metadata) { |
46 | auto key_value_metadata = file_metadata->key_value_metadata(); |
47 | int64_t size_of_key_value_metadata = key_value_metadata->size(); |
48 | stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n" ; |
49 | for (int64_t i = 0; i < size_of_key_value_metadata; i++) { |
50 | stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " |
51 | << key_value_metadata->value(i) << "\n" ; |
52 | } |
53 | } |
54 | |
55 | stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n" ; |
56 | stream << "Number of Real Columns: " |
57 | << file_metadata->schema()->group_node()->field_count() << "\n" ; |
58 | |
59 | if (selected_columns.size() == 0) { |
60 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
61 | selected_columns.push_back(i); |
62 | } |
63 | } else { |
64 | for (auto i : selected_columns) { |
65 | if (i < 0 || i >= file_metadata->num_columns()) { |
66 | throw ParquetException("Selected column is out of range" ); |
67 | } |
68 | } |
69 | } |
70 | |
71 | stream << "Number of Columns: " << file_metadata->num_columns() << "\n" ; |
72 | stream << "Number of Selected Columns: " << selected_columns.size() << "\n" ; |
73 | for (auto i : selected_columns) { |
74 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
75 | stream << "Column " << i << ": " << descr->name() << " (" |
76 | << TypeToString(descr->physical_type()) << ")" << std::endl; |
77 | } |
78 | |
79 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
80 | stream << "--- Row Group " << r << " ---\n" ; |
81 | |
82 | auto group_reader = fileReader->RowGroup(r); |
83 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
84 | |
85 | stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n" ; |
86 | stream << " Rows: " << group_metadata->num_rows() << "---\n" ; |
87 | |
88 | // Print column metadata |
89 | for (auto i : selected_columns) { |
90 | auto column_chunk = group_metadata->ColumnChunk(i); |
91 | std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics(); |
92 | |
93 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
94 | stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values(); |
95 | if (column_chunk->is_stats_set()) { |
96 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
97 | stream << ", Null Values: " << stats->null_count() |
98 | << ", Distinct Values: " << stats->distinct_count() << std::endl |
99 | << " Max: " << FormatStatValue(descr->physical_type(), max) |
100 | << ", Min: " << FormatStatValue(descr->physical_type(), min); |
101 | } else { |
102 | stream << " Statistics Not Set" ; |
103 | } |
104 | stream << std::endl |
105 | << " Compression: " << CompressionToString(column_chunk->compression()) |
106 | << ", Encodings: " ; |
107 | for (auto encoding : column_chunk->encodings()) { |
108 | stream << EncodingToString(encoding) << " " ; |
109 | } |
110 | stream << std::endl |
111 | << " Uncompressed Size: " << column_chunk->total_uncompressed_size() |
112 | << ", Compressed Size: " << column_chunk->total_compressed_size() |
113 | << std::endl; |
114 | } |
115 | |
116 | if (!print_values) { |
117 | continue; |
118 | } |
119 | |
120 | static constexpr int bufsize = 25; |
121 | char buffer[bufsize]; |
122 | |
123 | // Create readers for selected columns and print contents |
124 | vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); |
125 | int j = 0; |
126 | for (auto i : selected_columns) { |
127 | std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); |
128 | |
129 | std::stringstream ss; |
130 | ss << "%-" << COL_WIDTH << "s" ; |
131 | std::string fmt = ss.str(); |
132 | |
133 | snprintf(buffer, bufsize, fmt.c_str(), |
134 | file_metadata->schema()->Column(i)->name().c_str()); |
135 | stream << buffer; |
136 | |
137 | // This is OK in this method as long as the RowGroupReader does not get |
138 | // deleted |
139 | scanners[j++] = Scanner::Make(col_reader); |
140 | } |
141 | stream << "\n" ; |
142 | |
143 | bool hasRow; |
144 | do { |
145 | hasRow = false; |
146 | for (auto scanner : scanners) { |
147 | if (scanner->HasNext()) { |
148 | hasRow = true; |
149 | scanner->PrintNext(stream, 27); |
150 | } |
151 | } |
152 | stream << "\n" ; |
153 | } while (hasRow); |
154 | } |
155 | } |
156 | |
157 | void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, |
158 | const char* filename) { |
159 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
160 | stream << "{\n" ; |
161 | stream << " \"FileName\": \"" << filename << "\",\n" ; |
162 | stream << " \"Version\": \"" << file_metadata->version() << "\",\n" ; |
163 | stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n" ; |
164 | stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n" ; |
165 | stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n" ; |
166 | stream << " \"NumberOfRealColumns\": \"" |
167 | << file_metadata->schema()->group_node()->field_count() << "\",\n" ; |
168 | stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n" ; |
169 | |
170 | if (selected_columns.size() == 0) { |
171 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
172 | selected_columns.push_back(i); |
173 | } |
174 | } else { |
175 | for (auto i : selected_columns) { |
176 | if (i < 0 || i >= file_metadata->num_columns()) { |
177 | throw ParquetException("Selected column is out of range" ); |
178 | } |
179 | } |
180 | } |
181 | |
182 | stream << " \"Columns\": [\n" ; |
183 | int c = 0; |
184 | for (auto i : selected_columns) { |
185 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
186 | stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\"," |
187 | << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," |
188 | << " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type()) |
189 | << "\" }" ; |
190 | c++; |
191 | if (c != static_cast<int>(selected_columns.size())) { |
192 | stream << ",\n" ; |
193 | } |
194 | } |
195 | |
196 | stream << "\n ],\n \"RowGroups\": [\n" ; |
197 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
198 | stream << " {\n \"Id\": \"" << r << "\", " ; |
199 | |
200 | auto group_reader = fileReader->RowGroup(r); |
201 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
202 | |
203 | stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", " ; |
204 | stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n" ; |
205 | |
206 | // Print column metadata |
207 | stream << " \"ColumnChunks\": [\n" ; |
208 | int c1 = 0; |
209 | for (auto i : selected_columns) { |
210 | auto column_chunk = group_metadata->ColumnChunk(i); |
211 | std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics(); |
212 | |
213 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
214 | stream << " {\"Id\": \"" << i << "\", \"Values\": \"" |
215 | << column_chunk->num_values() << "\", " |
216 | << "\"StatsSet\": " ; |
217 | if (column_chunk->is_stats_set()) { |
218 | stream << "\"True\", \"Stats\": {" ; |
219 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
220 | stream << "\"NumNulls\": \"" << stats->null_count() << "\", " |
221 | << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " |
222 | << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " |
223 | << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) |
224 | << "\" }," ; |
225 | } else { |
226 | stream << "\"False\"," ; |
227 | } |
228 | stream << "\n \"Compression\": \"" |
229 | << CompressionToString(column_chunk->compression()) |
230 | << "\", \"Encodings\": \"" ; |
231 | for (auto encoding : column_chunk->encodings()) { |
232 | stream << EncodingToString(encoding) << " " ; |
233 | } |
234 | stream << "\", " |
235 | << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() |
236 | << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); |
237 | |
238 | // end of a ColumnChunk |
239 | stream << "\" }" ; |
240 | c1++; |
241 | if (c1 != static_cast<int>(selected_columns.size())) { |
242 | stream << ",\n" ; |
243 | } |
244 | } |
245 | |
246 | stream << "\n ]\n }" ; |
247 | if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { |
248 | stream << ",\n" ; |
249 | } |
250 | } |
251 | stream << "\n ]\n}\n" ; |
252 | } |
253 | |
254 | } // namespace parquet |
255 | |