1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "parquet/printer.h"
19
20#include <string>
21#include <vector>
22
23#include "parquet/column_scanner.h"
24
25using std::string;
26using std::vector;
27
28namespace parquet {
29// ----------------------------------------------------------------------
30// ParquetFilePrinter::DebugPrint
31
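// Field width, in characters, spliced into the printf-style format that
// DebugPrint uses for its left-justified column-name headers. It is kept as a
// string literal for that reason; the specific value is only an example default.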
33#define COL_WIDTH "30"
34
35void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
36 bool print_values, bool print_key_value_metadata,
37 const char* filename) {
38 const FileMetaData* file_metadata = fileReader->metadata().get();
39
40 stream << "File Name: " << filename << "\n";
41 stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
42 stream << "Created By: " << file_metadata->created_by() << "\n";
43 stream << "Total rows: " << file_metadata->num_rows() << "\n";
44
45 if (print_key_value_metadata) {
46 auto key_value_metadata = file_metadata->key_value_metadata();
47 int64_t size_of_key_value_metadata = key_value_metadata->size();
48 stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
49 for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
50 stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
51 << key_value_metadata->value(i) << "\n";
52 }
53 }
54
55 stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
56 stream << "Number of Real Columns: "
57 << file_metadata->schema()->group_node()->field_count() << "\n";
58
59 if (selected_columns.size() == 0) {
60 for (int i = 0; i < file_metadata->num_columns(); i++) {
61 selected_columns.push_back(i);
62 }
63 } else {
64 for (auto i : selected_columns) {
65 if (i < 0 || i >= file_metadata->num_columns()) {
66 throw ParquetException("Selected column is out of range");
67 }
68 }
69 }
70
71 stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
72 stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
73 for (auto i : selected_columns) {
74 const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
75 stream << "Column " << i << ": " << descr->name() << " ("
76 << TypeToString(descr->physical_type()) << ")" << std::endl;
77 }
78
79 for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
80 stream << "--- Row Group " << r << " ---\n";
81
82 auto group_reader = fileReader->RowGroup(r);
83 std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
84
85 stream << "--- Total Bytes " << group_metadata->total_byte_size() << " ---\n";
86 stream << " Rows: " << group_metadata->num_rows() << "---\n";
87
88 // Print column metadata
89 for (auto i : selected_columns) {
90 auto column_chunk = group_metadata->ColumnChunk(i);
91 std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
92
93 const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
94 stream << "Column " << i << std::endl << ", Values: " << column_chunk->num_values();
95 if (column_chunk->is_stats_set()) {
96 std::string min = stats->EncodeMin(), max = stats->EncodeMax();
97 stream << ", Null Values: " << stats->null_count()
98 << ", Distinct Values: " << stats->distinct_count() << std::endl
99 << " Max: " << FormatStatValue(descr->physical_type(), max)
100 << ", Min: " << FormatStatValue(descr->physical_type(), min);
101 } else {
102 stream << " Statistics Not Set";
103 }
104 stream << std::endl
105 << " Compression: " << CompressionToString(column_chunk->compression())
106 << ", Encodings: ";
107 for (auto encoding : column_chunk->encodings()) {
108 stream << EncodingToString(encoding) << " ";
109 }
110 stream << std::endl
111 << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
112 << ", Compressed Size: " << column_chunk->total_compressed_size()
113 << std::endl;
114 }
115
116 if (!print_values) {
117 continue;
118 }
119
120 static constexpr int bufsize = 25;
121 char buffer[bufsize];
122
123 // Create readers for selected columns and print contents
124 vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
125 int j = 0;
126 for (auto i : selected_columns) {
127 std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
128
129 std::stringstream ss;
130 ss << "%-" << COL_WIDTH << "s";
131 std::string fmt = ss.str();
132
133 snprintf(buffer, bufsize, fmt.c_str(),
134 file_metadata->schema()->Column(i)->name().c_str());
135 stream << buffer;
136
137 // This is OK in this method as long as the RowGroupReader does not get
138 // deleted
139 scanners[j++] = Scanner::Make(col_reader);
140 }
141 stream << "\n";
142
143 bool hasRow;
144 do {
145 hasRow = false;
146 for (auto scanner : scanners) {
147 if (scanner->HasNext()) {
148 hasRow = true;
149 scanner->PrintNext(stream, 27);
150 }
151 }
152 stream << "\n";
153 } while (hasRow);
154 }
155}
156
157void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
158 const char* filename) {
159 const FileMetaData* file_metadata = fileReader->metadata().get();
160 stream << "{\n";
161 stream << " \"FileName\": \"" << filename << "\",\n";
162 stream << " \"Version\": \"" << file_metadata->version() << "\",\n";
163 stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
164 stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
165 stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
166 stream << " \"NumberOfRealColumns\": \""
167 << file_metadata->schema()->group_node()->field_count() << "\",\n";
168 stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
169
170 if (selected_columns.size() == 0) {
171 for (int i = 0; i < file_metadata->num_columns(); i++) {
172 selected_columns.push_back(i);
173 }
174 } else {
175 for (auto i : selected_columns) {
176 if (i < 0 || i >= file_metadata->num_columns()) {
177 throw ParquetException("Selected column is out of range");
178 }
179 }
180 }
181
182 stream << " \"Columns\": [\n";
183 int c = 0;
184 for (auto i : selected_columns) {
185 const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
186 stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
187 << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
188 << " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type())
189 << "\" }";
190 c++;
191 if (c != static_cast<int>(selected_columns.size())) {
192 stream << ",\n";
193 }
194 }
195
196 stream << "\n ],\n \"RowGroups\": [\n";
197 for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
198 stream << " {\n \"Id\": \"" << r << "\", ";
199
200 auto group_reader = fileReader->RowGroup(r);
201 std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
202
203 stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
204 stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
205
206 // Print column metadata
207 stream << " \"ColumnChunks\": [\n";
208 int c1 = 0;
209 for (auto i : selected_columns) {
210 auto column_chunk = group_metadata->ColumnChunk(i);
211 std::shared_ptr<RowGroupStatistics> stats = column_chunk->statistics();
212
213 const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
214 stream << " {\"Id\": \"" << i << "\", \"Values\": \""
215 << column_chunk->num_values() << "\", "
216 << "\"StatsSet\": ";
217 if (column_chunk->is_stats_set()) {
218 stream << "\"True\", \"Stats\": {";
219 std::string min = stats->EncodeMin(), max = stats->EncodeMax();
220 stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
221 << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
222 << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
223 << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
224 << "\" },";
225 } else {
226 stream << "\"False\",";
227 }
228 stream << "\n \"Compression\": \""
229 << CompressionToString(column_chunk->compression())
230 << "\", \"Encodings\": \"";
231 for (auto encoding : column_chunk->encodings()) {
232 stream << EncodingToString(encoding) << " ";
233 }
234 stream << "\", "
235 << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
236 << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
237
238 // end of a ColumnChunk
239 stream << "\" }";
240 c1++;
241 if (c1 != static_cast<int>(selected_columns.size())) {
242 stream << ",\n";
243 }
244 }
245
246 stream << "\n ]\n }";
247 if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
248 stream << ",\n";
249 }
250 }
251 stream << "\n ]\n}\n";
252}
253
254} // namespace parquet
255