// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "parquet/printer.h"

#include <cstdint>
#include <cstdio>
#include <list>
#include <memory>
#include <ostream>
#include <string>
#include <vector>

#include "arrow/util/key_value_metadata.h"

#include "parquet/column_scanner.h"
#include "parquet/exception.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/schema.h"
#include "parquet/statistics.h"
#include "parquet/types.h"

namespace parquet {

class ColumnReader;

// ----------------------------------------------------------------------
// ParquetFilePrinter::DebugPrint

// the fixed initial size is just for an example
#define COL_WIDTH 30

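// Usage sketch (illustration only, not part of this file): it assumes
// ParquetFilePrinter is constructed from a ParquetFileReader*, as declared in
// parquet/printer.h, and that the file is opened with
// ParquetFileReader::OpenFile; "example.parquet" is a placeholder path.
//
//   std::unique_ptr<ParquetFileReader> reader =
//       ParquetFileReader::OpenFile("example.parquet");
//   ParquetFilePrinter printer(reader.get());
//   // An empty column list selects every column.
//   printer.DebugPrint(std::cout, {}, /*print_values=*/true,
//                      /*format_dump=*/false,
//                      /*print_key_value_metadata=*/false, "example.parquet");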
void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                                    bool print_values, bool format_dump,
                                    bool print_key_value_metadata, const char* filename) {
  const FileMetaData* file_metadata = fileReader->metadata().get();

  stream << "File Name: " << filename << "\n";
  stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
  stream << "Created By: " << file_metadata->created_by() << "\n";
  stream << "Total rows: " << file_metadata->num_rows() << "\n";

  if (print_key_value_metadata && file_metadata->key_value_metadata()) {
    auto key_value_metadata = file_metadata->key_value_metadata();
    int64_t size_of_key_value_metadata = key_value_metadata->size();
    stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
    for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
      stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
             << key_value_metadata->value(i) << "\n";
    }
  }

  stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
  stream << "Number of Real Columns: "
         << file_metadata->schema()->group_node()->field_count() << "\n";

  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
  stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
           << TypeToString(descr->physical_type());
    if (descr->converted_type() != ConvertedType::NONE) {
      stream << "/" << ConvertedTypeToString(descr->converted_type());
    }
    if (descr->converted_type() == ConvertedType::DECIMAL) {
      stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
    }
    stream << ")" << std::endl;
  }

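  // For each row group: print chunk-level metadata for the selected columns,
  // then (when print_values is set) dump the column values themselves.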
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "--- Row Group: " << r << " ---\n";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
    stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";

    // Print column metadata
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "Column " << i << std::endl
             << "  Values: " << column_chunk->num_values();
      if (column_chunk->is_stats_set()) {
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << ", Null Values: " << stats->null_count()
               << ", Distinct Values: " << stats->distinct_count() << std::endl
               << "  Max: " << FormatStatValue(descr->physical_type(), max)
               << ", Min: " << FormatStatValue(descr->physical_type(), min);
      } else {
        stream << "  Statistics Not Set";
      }
      stream << std::endl
             << "  Compression: " << Codec::GetCodecAsString(column_chunk->compression())
             << ", Encodings:";
      for (auto encoding : column_chunk->encodings()) {
        stream << " " << EncodingToString(encoding);
      }
      stream << std::endl
             << "  Uncompressed Size: " << column_chunk->total_uncompressed_size()
             << ", Compressed Size: " << column_chunk->total_compressed_size()
             << std::endl;
    }

    if (!print_values) {
      continue;
    }
    stream << "--- Values ---\n";

    static constexpr int bufsize = COL_WIDTH + 1;
    char buffer[bufsize];

    // Create readers for selected columns and print contents
    std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
    int j = 0;
    for (auto i : selected_columns) {
      std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
      // This is OK in this method as long as the RowGroupReader does not get
      // deleted
      auto& scanner = scanners[j++] = Scanner::Make(col_reader);

      if (format_dump) {
        stream << "Column " << i << std::endl;
        while (scanner->HasNext()) {
          scanner->PrintNext(stream, 0, true);
          stream << "\n";
        }
        continue;
      }

      snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
               file_metadata->schema()->Column(i)->name().c_str());
      stream << buffer << '|';
    }
    if (format_dump) {
      continue;
    }
    stream << "\n";

    bool hasRow;
    do {
      hasRow = false;
      for (auto scanner : scanners) {
        if (scanner->HasNext()) {
          hasRow = true;
          scanner->PrintNext(stream, COL_WIDTH);
          stream << '|';
        }
      }
      stream << "\n";
    } while (hasRow);
  }
}

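// Usage sketch (illustration only, not part of this file; assumes the same
// hypothetical ParquetFilePrinter instance as in the sketch above). JSONPrint
// emits the file, row-group, and column-chunk metadata as a single JSON
// document, with all values quoted as strings.
//
//   printer.JSONPrint(std::cout, {}, "example.parquet");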
void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
                                   const char* filename) {
  const FileMetaData* file_metadata = fileReader->metadata().get();
  stream << "{\n";
  stream << "  \"FileName\": \"" << filename << "\",\n";
  stream << "  \"Version\": \"" << file_metadata->version() << "\",\n";
  stream << "  \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
  stream << "  \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
  stream << "  \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
  stream << "  \"NumberOfRealColumns\": \""
         << file_metadata->schema()->group_node()->field_count() << "\",\n";
  stream << "  \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

  if (selected_columns.size() == 0) {
    for (int i = 0; i < file_metadata->num_columns(); i++) {
      selected_columns.push_back(i);
    }
  } else {
    for (auto i : selected_columns) {
      if (i < 0 || i >= file_metadata->num_columns()) {
        throw ParquetException("Selected column is out of range");
      }
    }
  }

  stream << "  \"Columns\": [\n";
  int c = 0;
  for (auto i : selected_columns) {
    const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
    stream << "     { \"Id\": \"" << i << "\", \"Name\": \"" << descr->name() << "\","
           << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
           << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
           << "\","
           << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
    c++;
    if (c != static_cast<int>(selected_columns.size())) {
      stream << ",\n";
    }
  }

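  // Close the "Columns" array, then emit one JSON object per row group, each
  // carrying its column-chunk metadata.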
  stream << "\n  ],\n  \"RowGroups\": [\n";
  for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
    stream << "     {\n       \"Id\": \"" << r << "\", ";

    auto group_reader = fileReader->RowGroup(r);
    std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);

    stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
    stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";

    // Print column metadata
    stream << "       \"ColumnChunks\": [\n";
    int c1 = 0;
    for (auto i : selected_columns) {
      auto column_chunk = group_metadata->ColumnChunk(i);
      std::shared_ptr<Statistics> stats = column_chunk->statistics();

      const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
      stream << "          {\"Id\": \"" << i << "\", \"Values\": \""
             << column_chunk->num_values() << "\", "
             << "\"StatsSet\": ";
      if (column_chunk->is_stats_set()) {
        stream << "\"True\", \"Stats\": {";
        std::string min = stats->EncodeMin(), max = stats->EncodeMax();
        stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
               << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
               << "\" },";
      } else {
        stream << "\"False\",";
      }
      stream << "\n           \"Compression\": \""
             << Codec::GetCodecAsString(column_chunk->compression())
             << "\", \"Encodings\": \"";
      for (auto encoding : column_chunk->encodings()) {
        stream << EncodingToString(encoding) << " ";
      }
      stream << "\", "
             << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
             << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();

      // end of a ColumnChunk
      stream << "\" }";
      c1++;
      if (c1 != static_cast<int>(selected_columns.size())) {
        stream << ",\n";
      }
    }

    stream << "\n        ]\n     }";
    if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
      stream << ",\n";
    }
  }
  stream << "\n  ]\n}\n";
}

}  // namespace parquet