| 1 | // Licensed to the Apache Software Foundation (ASF) under one |
| 2 | // or more contributor license agreements. See the NOTICE file |
| 3 | // distributed with this work for additional information |
| 4 | // regarding copyright ownership. The ASF licenses this file |
| 5 | // to you under the Apache License, Version 2.0 (the |
| 6 | // "License"); you may not use this file except in compliance |
| 7 | // with the License. You may obtain a copy of the License at |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, |
| 12 | // software distributed under the License is distributed on an |
| 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | // KIND, either express or implied. See the License for the |
| 15 | // specific language governing permissions and limitations |
| 16 | // under the License. |
| 17 | |
| 18 | #include "parquet/printer.h" |
| 19 | |
| 20 | #include <cstdint> |
| 21 | #include <cstdio> |
| 22 | #include <memory> |
| 23 | #include <ostream> |
| 24 | #include <string> |
| 25 | #include <vector> |
| 26 | |
| 27 | #include "arrow/util/key_value_metadata.h" |
| 28 | |
| 29 | #include "parquet/column_scanner.h" |
| 30 | #include "parquet/exception.h" |
| 31 | #include "parquet/file_reader.h" |
| 32 | #include "parquet/metadata.h" |
| 33 | #include "parquet/schema.h" |
| 34 | #include "parquet/statistics.h" |
| 35 | #include "parquet/types.h" |
| 36 | |
| 37 | namespace parquet { |
| 38 | |
| 39 | class ColumnReader; |
| 40 | |
| 41 | // ---------------------------------------------------------------------- |
| 42 | // ParquetFilePrinter::DebugPrint |
| 43 | |
| 44 | // Fixed width, in characters, of each column when pretty-printing values in |
| 44b | // table form (an arbitrary but readable default). |
| 45 | #define COL_WIDTH 30 |
| 46 | |
| 47 | void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns, |
| 48 | bool print_values, bool format_dump, |
| 49 | bool print_key_value_metadata, const char* filename) { |
| 50 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
| 51 | |
| 52 | stream << "File Name: " << filename << "\n" ; |
| 53 | stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n" ; |
| 54 | stream << "Created By: " << file_metadata->created_by() << "\n" ; |
| 55 | stream << "Total rows: " << file_metadata->num_rows() << "\n" ; |
| 56 | |
| 57 | if (print_key_value_metadata && file_metadata->key_value_metadata()) { |
| 58 | auto key_value_metadata = file_metadata->key_value_metadata(); |
| 59 | int64_t size_of_key_value_metadata = key_value_metadata->size(); |
| 60 | stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n" ; |
| 61 | for (int64_t i = 0; i < size_of_key_value_metadata; i++) { |
| 62 | stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " |
| 63 | << key_value_metadata->value(i) << "\n" ; |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n" ; |
| 68 | stream << "Number of Real Columns: " |
| 69 | << file_metadata->schema()->group_node()->field_count() << "\n" ; |
| 70 | |
| 71 | if (selected_columns.size() == 0) { |
| 72 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
| 73 | selected_columns.push_back(i); |
| 74 | } |
| 75 | } else { |
| 76 | for (auto i : selected_columns) { |
| 77 | if (i < 0 || i >= file_metadata->num_columns()) { |
| 78 | throw ParquetException("Selected column is out of range" ); |
| 79 | } |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | stream << "Number of Columns: " << file_metadata->num_columns() << "\n" ; |
| 84 | stream << "Number of Selected Columns: " << selected_columns.size() << "\n" ; |
| 85 | for (auto i : selected_columns) { |
| 86 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| 87 | stream << "Column " << i << ": " << descr->path()->ToDotString() << " (" |
| 88 | << TypeToString(descr->physical_type()); |
| 89 | if (descr->converted_type() != ConvertedType::NONE) { |
| 90 | stream << "/" << ConvertedTypeToString(descr->converted_type()); |
| 91 | } |
| 92 | if (descr->converted_type() == ConvertedType::DECIMAL) { |
| 93 | stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")" ; |
| 94 | } |
| 95 | stream << ")" << std::endl; |
| 96 | } |
| 97 | |
| 98 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
| 99 | stream << "--- Row Group: " << r << " ---\n" ; |
| 100 | |
| 101 | auto group_reader = fileReader->RowGroup(r); |
| 102 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
| 103 | |
| 104 | stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n" ; |
| 105 | stream << "--- Rows: " << group_metadata->num_rows() << " ---\n" ; |
| 106 | |
| 107 | // Print column metadata |
| 108 | for (auto i : selected_columns) { |
| 109 | auto column_chunk = group_metadata->ColumnChunk(i); |
| 110 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
| 111 | |
| 112 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| 113 | stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); |
| 114 | if (column_chunk->is_stats_set()) { |
| 115 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
| 116 | stream << ", Null Values: " << stats->null_count() |
| 117 | << ", Distinct Values: " << stats->distinct_count() << std::endl |
| 118 | << " Max: " << FormatStatValue(descr->physical_type(), max) |
| 119 | << ", Min: " << FormatStatValue(descr->physical_type(), min); |
| 120 | } else { |
| 121 | stream << " Statistics Not Set" ; |
| 122 | } |
| 123 | stream << std::endl |
| 124 | << " Compression: " << Codec::GetCodecAsString(column_chunk->compression()) |
| 125 | << ", Encodings:" ; |
| 126 | for (auto encoding : column_chunk->encodings()) { |
| 127 | stream << " " << EncodingToString(encoding); |
| 128 | } |
| 129 | stream << std::endl |
| 130 | << " Uncompressed Size: " << column_chunk->total_uncompressed_size() |
| 131 | << ", Compressed Size: " << column_chunk->total_compressed_size() |
| 132 | << std::endl; |
| 133 | } |
| 134 | |
| 135 | if (!print_values) { |
| 136 | continue; |
| 137 | } |
| 138 | stream << "--- Values ---\n" ; |
| 139 | |
| 140 | static constexpr int bufsize = COL_WIDTH + 1; |
| 141 | char buffer[bufsize]; |
| 142 | |
| 143 | // Create readers for selected columns and print contents |
| 144 | std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr); |
| 145 | int j = 0; |
| 146 | for (auto i : selected_columns) { |
| 147 | std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i); |
| 148 | // This is OK in this method as long as the RowGroupReader does not get |
| 149 | // deleted |
| 150 | auto& scanner = scanners[j++] = Scanner::Make(col_reader); |
| 151 | |
| 152 | if (format_dump) { |
| 153 | stream << "Column " << i << std::endl; |
| 154 | while (scanner->HasNext()) { |
| 155 | scanner->PrintNext(stream, 0, true); |
| 156 | stream << "\n" ; |
| 157 | } |
| 158 | continue; |
| 159 | } |
| 160 | |
| 161 | snprintf(buffer, bufsize, "%-*s" , COL_WIDTH, |
| 162 | file_metadata->schema()->Column(i)->name().c_str()); |
| 163 | stream << buffer << '|'; |
| 164 | } |
| 165 | if (format_dump) { |
| 166 | continue; |
| 167 | } |
| 168 | stream << "\n" ; |
| 169 | |
| 170 | bool hasRow; |
| 171 | do { |
| 172 | hasRow = false; |
| 173 | for (auto scanner : scanners) { |
| 174 | if (scanner->HasNext()) { |
| 175 | hasRow = true; |
| 176 | scanner->PrintNext(stream, COL_WIDTH); |
| 177 | stream << '|'; |
| 178 | } |
| 179 | } |
| 180 | stream << "\n" ; |
| 181 | } while (hasRow); |
| 182 | } |
| 183 | } |
| 184 | |
| 185 | void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns, |
| 186 | const char* filename) { |
| 187 | const FileMetaData* file_metadata = fileReader->metadata().get(); |
| 188 | stream << "{\n" ; |
| 189 | stream << " \"FileName\": \"" << filename << "\",\n" ; |
| 190 | stream << " \"Version\": \"" << file_metadata->version() << "\",\n" ; |
| 191 | stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n" ; |
| 192 | stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n" ; |
| 193 | stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n" ; |
| 194 | stream << " \"NumberOfRealColumns\": \"" |
| 195 | << file_metadata->schema()->group_node()->field_count() << "\",\n" ; |
| 196 | stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n" ; |
| 197 | |
| 198 | if (selected_columns.size() == 0) { |
| 199 | for (int i = 0; i < file_metadata->num_columns(); i++) { |
| 200 | selected_columns.push_back(i); |
| 201 | } |
| 202 | } else { |
| 203 | for (auto i : selected_columns) { |
| 204 | if (i < 0 || i >= file_metadata->num_columns()) { |
| 205 | throw ParquetException("Selected column is out of range" ); |
| 206 | } |
| 207 | } |
| 208 | } |
| 209 | |
| 210 | stream << " \"Columns\": [\n" ; |
| 211 | int c = 0; |
| 212 | for (auto i : selected_columns) { |
| 213 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| 214 | stream << " { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\"," |
| 215 | << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\"," |
| 216 | << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type()) |
| 217 | << "\"," |
| 218 | << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }" ; |
| 219 | c++; |
| 220 | if (c != static_cast<int>(selected_columns.size())) { |
| 221 | stream << ",\n" ; |
| 222 | } |
| 223 | } |
| 224 | |
| 225 | stream << "\n ],\n \"RowGroups\": [\n" ; |
| 226 | for (int r = 0; r < file_metadata->num_row_groups(); ++r) { |
| 227 | stream << " {\n \"Id\": \"" << r << "\", " ; |
| 228 | |
| 229 | auto group_reader = fileReader->RowGroup(r); |
| 230 | std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r); |
| 231 | |
| 232 | stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", " ; |
| 233 | stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n" ; |
| 234 | |
| 235 | // Print column metadata |
| 236 | stream << " \"ColumnChunks\": [\n" ; |
| 237 | int c1 = 0; |
| 238 | for (auto i : selected_columns) { |
| 239 | auto column_chunk = group_metadata->ColumnChunk(i); |
| 240 | std::shared_ptr<Statistics> stats = column_chunk->statistics(); |
| 241 | |
| 242 | const ColumnDescriptor* descr = file_metadata->schema()->Column(i); |
| 243 | stream << " {\"Id\": \"" << i << "\", \"Values\": \"" |
| 244 | << column_chunk->num_values() << "\", " |
| 245 | << "\"StatsSet\": " ; |
| 246 | if (column_chunk->is_stats_set()) { |
| 247 | stream << "\"True\", \"Stats\": {" ; |
| 248 | std::string min = stats->EncodeMin(), max = stats->EncodeMax(); |
| 249 | stream << "\"NumNulls\": \"" << stats->null_count() << "\", " |
| 250 | << "\"DistinctValues\": \"" << stats->distinct_count() << "\", " |
| 251 | << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", " |
| 252 | << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) |
| 253 | << "\" }," ; |
| 254 | } else { |
| 255 | stream << "\"False\"," ; |
| 256 | } |
| 257 | stream << "\n \"Compression\": \"" |
| 258 | << Codec::GetCodecAsString(column_chunk->compression()) |
| 259 | << "\", \"Encodings\": \"" ; |
| 260 | for (auto encoding : column_chunk->encodings()) { |
| 261 | stream << EncodingToString(encoding) << " " ; |
| 262 | } |
| 263 | stream << "\", " |
| 264 | << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size() |
| 265 | << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size(); |
| 266 | |
| 267 | // end of a ColumnChunk |
| 268 | stream << "\" }" ; |
| 269 | c1++; |
| 270 | if (c1 != static_cast<int>(selected_columns.size())) { |
| 271 | stream << ",\n" ; |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | stream << "\n ]\n }" ; |
| 276 | if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) { |
| 277 | stream << ",\n" ; |
| 278 | } |
| 279 | } |
| 280 | stream << "\n ]\n}\n" ; |
| 281 | } |
| 282 | |
| 283 | } // namespace parquet |
| 284 | |