printer.cc source code [arrow/parquet/printer.cc]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#include "parquet/printer.h"
19
20	#include <string>
21	#include <vector>
22
23	#include "parquet/column_scanner.h"
24
25	using std::string;
26	using std::vector;
27
28	namespace parquet {
29	// ----------------------------------------------------------------------
30	// ParquetFilePrinter::DebugPrint
31
32	// the fixed initial size is just for an example
33	#define COL_WIDTH "30"
34
35	void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
36	bool print_values, bool print_key_value_metadata,
37	const char* filename) {
38	const FileMetaData* file_metadata = fileReader->metadata().get();
39
40	stream << "File Name: " << filename << "\n";
41	stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
42	stream << "Created By: " << file_metadata->created_by() << "\n";
43	stream << "Total rows: " << file_metadata->num_rows() << "\n";
44
45	if (print_key_value_metadata) {
46	auto key_value_metadata = file_metadata->key_value_metadata();
47	int64_t size_of_key_value_metadata = key_value_metadata ->size();
48	stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
49	for (int64_t i = `0`; i < size_of_key_value_metadata; i++) {
50	stream << " Key nr " << i << " " << key_value_metadata ->key(i) << ": "
51	<< key_value_metadata ->value(i) << "\n";
52	}
53	}
54
55	stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
56	stream << "Number of Real Columns: "
57	<< file_metadata->schema()->group_node()->field_count() << "\n";
58
59	if (selected_columns.size() == `0`) {
60	for (int i = `0`; i < file_metadata->num_columns(); i++) {
61	selected_columns.push_back(i);
62	}
63	} else {
64	for (auto i : selected_columns) {
65	if (i < `0` \|\| i >= file_metadata->num_columns()) {
66	throw ParquetException ("Selected column is out of range");
67	}
68	}
69	}
70
71	stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
72	stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
73	for (auto i : selected_columns) {
74	const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
75	stream << "Column " << i << ": " << descr->name() << " ("
76	<< TypeToString(descr->physical_type()) << ")" << std::endl;
77	}
78
79	for (int r = `0`; r < file_metadata->num_row_groups(); ++r) {
80	stream << "--- Row Group " << r << " ---\n";
81
82	auto group_reader = fileReader->RowGroup(r);
83	std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
84
85	stream << "--- Total Bytes " << group_metadata ->total_byte_size() << " ---\n";
86	stream << " Rows: " << group_metadata ->num_rows() << "---\n";
87
88	// Print column metadata
89	for (auto i : selected_columns) {
90	auto column_chunk = group_metadata ->ColumnChunk(i);
91	std::shared_ptr<RowGroupStatistics> stats = column_chunk ->statistics();
92
93	const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
94	stream << "Column " << i << std::endl << ", Values: " << column_chunk ->num_values();
95	if (column_chunk ->is_stats_set()) {
96	std::string min = stats ->EncodeMin(), max = stats ->EncodeMax();
97	stream << ", Null Values: " << stats ->null_count()
98	<< ", Distinct Values: " << stats ->distinct_count() << std::endl
99	<< " Max: " << FormatStatValue(descr->physical_type(), max)
100	<< ", Min: " << FormatStatValue(descr->physical_type(), min);
101	} else {
102	stream << " Statistics Not Set";
103	}
104	stream << std::endl
105	<< " Compression: " << CompressionToString(column_chunk ->compression())
106	<< ", Encodings: ";
107	for (auto encoding : column_chunk ->encodings()) {
108	stream << EncodingToString(encoding) << " ";
109	}
110	stream << std::endl
111	<< " Uncompressed Size: " << column_chunk ->total_uncompressed_size()
112	<< ", Compressed Size: " << column_chunk ->total_compressed_size()
113	<< std::endl;
114	}
115
116	if (!print_values) {
117	continue;
118	}
119
120	static constexpr int bufsize = `25`;
121	char buffer[bufsize];
122
123	// Create readers for selected columns and print contents
124	vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
125	int j = `0`;
126	for (auto i : selected_columns) {
127	std::shared_ptr<ColumnReader> col_reader = group_reader ->Column(i);
128
129	std::stringstream ss;
130	ss << "%-" << COL_WIDTH << "s";
131	std::string fmt = ss.str();
132
133	snprintf(buffer, bufsize, fmt.c_str(),
134	file_metadata->schema()->Column(i)->name().c_str());
135	stream << buffer;
136
137	// This is OK in this method as long as the RowGroupReader does not get
138	// deleted
139	scanners [j++] = Scanner::Make(col_reader);
140	}
141	stream << "\n";
142
143	bool hasRow;
144	do {
145	hasRow = false;
146	for (auto scanner : scanners) {
147	if (scanner ->HasNext()) {
148	hasRow = true;
149	scanner ->PrintNext(stream, `27`);
150	}
151	}
152	stream << "\n";
153	} while (hasRow);
154	}
155	}
156
157	void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
158	const char* filename) {
159	const FileMetaData* file_metadata = fileReader->metadata().get();
160	stream << "{\n";
161	stream << " \"FileName\": \"" << filename << "\",\n";
162	stream << " \"Version\": \"" << file_metadata->version() << "\",\n";
163	stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
164	stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
165	stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
166	stream << " \"NumberOfRealColumns\": \""
167	<< file_metadata->schema()->group_node()->field_count() << "\",\n";
168	stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
169
170	if (selected_columns.size() == `0`) {
171	for (int i = `0`; i < file_metadata->num_columns(); i++) {
172	selected_columns.push_back(i);
173	}
174	} else {
175	for (auto i : selected_columns) {
176	if (i < `0` \|\| i >= file_metadata->num_columns()) {
177	throw ParquetException ("Selected column is out of range");
178	}
179	}
180	}
181
182	stream << " \"Columns\": [\n";
183	int c = `0`;
184	for (auto i : selected_columns) {
185	const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
186	stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
187	<< " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
188	<< " \"LogicalType\": \"" << LogicalTypeToString(descr->logical_type())
189	<< "\" }";
190	c++;
191	if (c != static_cast<int>(selected_columns.size())) {
192	stream << ",\n";
193	}
194	}
195
196	stream << "\n ],\n \"RowGroups\": [\n";
197	for (int r = `0`; r < file_metadata->num_row_groups(); ++r) {
198	stream << " {\n \"Id\": \"" << r << "\", ";
199
200	auto group_reader = fileReader->RowGroup(r);
201	std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
202
203	stream << " \"TotalBytes\": \"" << group_metadata ->total_byte_size() << "\", ";
204	stream << " \"Rows\": \"" << group_metadata ->num_rows() << "\",\n";
205
206	// Print column metadata
207	stream << " \"ColumnChunks\": [\n";
208	int c1 = `0`;
209	for (auto i : selected_columns) {
210	auto column_chunk = group_metadata ->ColumnChunk(i);
211	std::shared_ptr<RowGroupStatistics> stats = column_chunk ->statistics();
212
213	const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
214	stream << " {\"Id\": \"" << i << "\", \"Values\": \""
215	<< column_chunk ->num_values() << "\", "
216	<< "\"StatsSet\": ";
217	if (column_chunk ->is_stats_set()) {
218	stream << "\"True\", \"Stats\": {";
219	std::string min = stats ->EncodeMin(), max = stats ->EncodeMax();
220	stream << "\"NumNulls\": \"" << stats ->null_count() << "\", "
221	<< "\"DistinctValues\": \"" << stats ->distinct_count() << "\", "
222	<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
223	<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
224	<< "\" },";
225	} else {
226	stream << "\"False\",";
227	}
228	stream << "\n \"Compression\": \""
229	<< CompressionToString(column_chunk ->compression())
230	<< "\", \"Encodings\": \"";
231	for (auto encoding : column_chunk ->encodings()) {
232	stream << EncodingToString(encoding) << " ";
233	}
234	stream << "\", "
235	<< "\"UncompressedSize\": \"" << column_chunk ->total_uncompressed_size()
236	<< "\", \"CompressedSize\": \"" << column_chunk ->total_compressed_size();
237
238	// end of a ColumnChunk
239	stream << "\" }";
240	c1++;
241	if (c1 != static_cast<int>(selected_columns.size())) {
242	stream << ",\n";
243	}
244	}
245
246	stream << "\n ]\n }";
247	if ((r + `1`) != static_cast<int>(file_metadata->num_row_groups())) {
248	stream << ",\n";
249	}
250	}
251	stream << "\n ]\n}\n";
252	}
253
254	} // namespace parquet
255

Browse the source code of arrow/parquet/printer.cc