1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "parquet/printer.h" |
19 | |
20 | #include <cstdint> |
21 | #include <cstdio> |
22 | #include <memory> |
23 | #include <ostream> |
24 | #include <string> |
25 | #include <vector> |
26 | |
27 | #include "arrow/util/key_value_metadata.h" |
28 | |
29 | #include "parquet/column_scanner.h" |
30 | #include "parquet/exception.h" |
31 | #include "parquet/file_reader.h" |
32 | #include "parquet/metadata.h" |
33 | #include "parquet/schema.h" |
34 | #include "parquet/statistics.h" |
35 | #include "parquet/types.h" |
36 | |
37 | namespace parquet { |
38 | |
39 | class ColumnReader; |
40 | |
41 | // ---------------------------------------------------------------------- |
42 | // ParquetFilePrinter::DebugPrint |
43 | |
// Fixed display width (in characters) of each column in the tabular value
// dump; the initial size is just an example. A typed constant instead of a
// macro keeps it scoped to this namespace and visible to the debugger.
constexpr int COL_WIDTH = 30;
46 | |
47 | void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, |
48 | bool print_values, bool format_dump, |
49 | bool print_key_value_metadata, const char* filename) { |
50 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
51 | |
52 | stream << "File Name: " << filename << "\n" ; |
53 | stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n" ; |
54 | stream << "Created By: " << file_metadata->created_by() << "\n" ; |
55 | stream << "Total rows: " << file_metadata->num_rows() << "\n" ; |
56 | |
57 | if (print_key_value_metadata && file_metadata->key_value_metadata()) { |
58 | auto key_value_metadata = file_metadata->key_value_metadata(); |
59 | int64_t size_of_key_value_metadata = key_value_metadata->size(); |
60 | stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n" ; |
61 | for (int64_t i = 0; i < size_of_key_value_metadata; i++) { |
62 | stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " |
63 | << key_value_metadata->value(i) << "\n" ; |
64 | } |
65 | } |
66 | |
67 | stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n" ; |
68 | stream << "Number of Real Columns: " |
69 | << file_metadata->schema()->group_node()->field_count() << "\n" ; |
70 | |
71 | if (selected_columns.size() == 0) { |
72 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
73 | selected_columns.push_back(i); |
74 | } |
75 | } else { |
76 | for (auto i : selected_columns) { |
77 | if (i < 0 || i >= file_metadata->num_columns()) { |
78 | throw ParquetException("Selected column is out of range" ); |
79 | } |
80 | } |
81 | } |
82 | |
83 | stream << "Number of Columns: " << file_metadata->num_columns() << "\n" ; |
84 | stream << "Number of Selected Columns: " << selected_columns.size() << "\n" ; |
85 | for (auto i : selected_columns) { |
86 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
87 | stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" |
88 | << TypeToString(descr->physical_type()); |
89 | if (descr->converted_type() != ConvertedType::NONE) { |
90 | stream << "/" << ConvertedTypeToString(descr->converted_type()); |
91 | } |
92 | if (descr->converted_type() == ConvertedType::DECIMAL) { |
93 | stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")" ; |
94 | } |
95 | stream << ")" << std::endl; |
96 | } |
97 | |
98 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
99 | stream << "--- Row Group: " << r << " ---\n" ; |
100 | |
101 | auto group_reader = fileReader->RowGroup(r); |
102 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
103 | |
104 | stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n" ; |
105 | stream << "--- Rows: " << group_metadata->num_rows() << " ---\n" ; |
106 | |
107 | // Print column metadata |
108 | for (auto i : selected_columns) { |
109 | auto column_chunk = group_metadata->ColumnChunk(i); |
110 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
111 | |
112 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
113 | stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); |
114 | if (column_chunk->is_stats_set()) { |
115 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
116 | stream << ", Null Values: " << stats->null_count() |
117 | << ", Distinct Values: " << stats->distinct_count() << std::endl |
118 | << " Max: " << FormatStatValue(descr->physical_type(), max) |
119 | << ", Min: " << FormatStatValue(descr->physical_type(), min); |
120 | } else { |
121 | stream << " Statistics Not Set" ; |
122 | } |
123 | stream << std::endl |
124 | << " Compression: " << Codec::GetCodecAsString(column_chunk->compression()) |
125 | << ", Encodings:" ; |
126 | for (auto encoding : column_chunk->encodings()) { |
127 | stream << " " << EncodingToString(encoding); |
128 | } |
129 | stream << std::endl |
130 | << " Uncompressed Size: " << column_chunk->total_uncompressed_size() |
131 | << ", Compressed Size: " << column_chunk->total_compressed_size() |
132 | << std::endl; |
133 | } |
134 | |
135 | if (!print_values) { |
136 | continue; |
137 | } |
138 | stream << "--- Values ---\n" ; |
139 | |
140 | static constexpr int bufsize = COL_WIDTH + 1; |
141 | char buffer[bufsize]; |
142 | |
143 | // Create readers for selected columns and print contents |
144 | std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); |
145 | int j = 0; |
146 | for (auto i : selected_columns) { |
147 | std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); |
148 | // This is OK in this method as long as the RowGroupReader does not get |
149 | // deleted |
150 | auto& scanner = scanners[j++] = Scanner::Make(col_reader); |
151 | |
152 | if (format_dump) { |
153 | stream << "Column " << i << std::endl; |
154 | while (scanner->HasNext()) { |
155 | scanner->PrintNext(stream, 0, true); |
156 | stream << "\n" ; |
157 | } |
158 | continue; |
159 | } |
160 | |
161 | snprintf(buffer, bufsize, "%-*s" , COL_WIDTH, |
162 | file_metadata->schema()->Column(i)->name().c_str()); |
163 | stream << buffer << '|'; |
164 | } |
165 | if (format_dump) { |
166 | continue; |
167 | } |
168 | stream << "\n" ; |
169 | |
170 | bool hasRow; |
171 | do { |
172 | hasRow = false; |
173 | for (auto scanner : scanners) { |
174 | if (scanner->HasNext()) { |
175 | hasRow = true; |
176 | scanner->PrintNext(stream, COL_WIDTH); |
177 | stream << '|'; |
178 | } |
179 | } |
180 | stream << "\n" ; |
181 | } while (hasRow); |
182 | } |
183 | } |
184 | |
185 | void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, |
186 | const char* filename) { |
187 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
188 | stream << "{\n" ; |
189 | stream << " \"FileName\": \"" << filename << "\",\n" ; |
190 | stream << " \"Version\": \"" << file_metadata->version() << "\",\n" ; |
191 | stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n" ; |
192 | stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n" ; |
193 | stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n" ; |
194 | stream << " \"NumberOfRealColumns\": \"" |
195 | << file_metadata->schema()->group_node()->field_count() << "\",\n" ; |
196 | stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n" ; |
197 | |
198 | if (selected_columns.size() == 0) { |
199 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
200 | selected_columns.push_back(i); |
201 | } |
202 | } else { |
203 | for (auto i : selected_columns) { |
204 | if (i < 0 || i >= file_metadata->num_columns()) { |
205 | throw ParquetException("Selected column is out of range" ); |
206 | } |
207 | } |
208 | } |
209 | |
210 | stream << " \"Columns\": [\n" ; |
211 | int c = 0; |
212 | for (auto i : selected_columns) { |
213 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
214 | stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\"," |
215 | << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," |
216 | << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) |
217 | << "\"," |
218 | << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }" ; |
219 | c++; |
220 | if (c != static_cast<int>(selected_columns.size())) { |
221 | stream << ",\n" ; |
222 | } |
223 | } |
224 | |
225 | stream << "\n ],\n \"RowGroups\": [\n" ; |
226 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
227 | stream << " {\n \"Id\": \"" << r << "\", " ; |
228 | |
229 | auto group_reader = fileReader->RowGroup(r); |
230 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
231 | |
232 | stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", " ; |
233 | stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n" ; |
234 | |
235 | // Print column metadata |
236 | stream << " \"ColumnChunks\": [\n" ; |
237 | int c1 = 0; |
238 | for (auto i : selected_columns) { |
239 | auto column_chunk = group_metadata->ColumnChunk(i); |
240 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
241 | |
242 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
243 | stream << " {\"Id\": \"" << i << "\", \"Values\": \"" |
244 | << column_chunk->num_values() << "\", " |
245 | << "\"StatsSet\": " ; |
246 | if (column_chunk->is_stats_set()) { |
247 | stream << "\"True\", \"Stats\": {" ; |
248 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
249 | stream << "\"NumNulls\": \"" << stats->null_count() << "\", " |
250 | << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " |
251 | << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " |
252 | << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) |
253 | << "\" }," ; |
254 | } else { |
255 | stream << "\"False\"," ; |
256 | } |
257 | stream << "\n \"Compression\": \"" |
258 | << Codec::GetCodecAsString(column_chunk->compression()) |
259 | << "\", \"Encodings\": \"" ; |
260 | for (auto encoding : column_chunk->encodings()) { |
261 | stream << EncodingToString(encoding) << " " ; |
262 | } |
263 | stream << "\", " |
264 | << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() |
265 | << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); |
266 | |
267 | // end of a ColumnChunk |
268 | stream << "\" }" ; |
269 | c1++; |
270 | if (c1 != static_cast<int>(selected_columns.size())) { |
271 | stream << ",\n" ; |
272 | } |
273 | } |
274 | |
275 | stream << "\n ]\n }" ; |
276 | if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { |
277 | stream << ",\n" ; |
278 | } |
279 | } |
280 | stream << "\n ]\n}\n" ; |
281 | } |
282 | |
283 | } // namespace parquet |
284 | |