1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstddef> |
19 | #include <cstdint> |
20 | #include <iostream> |
21 | #include <memory> |
22 | #include <sstream> // IWYU pragma: keep |
23 | #include <string> |
24 | #include <type_traits> |
25 | #include <vector> |
26 | |
27 | #include "arrow/array.h" |
28 | #include "arrow/pretty_print.h" |
29 | #include "arrow/record_batch.h" |
30 | #include "arrow/status.h" |
31 | #include "arrow/table.h" |
32 | #include "arrow/type.h" |
33 | #include "arrow/type_traits.h" |
34 | #include "arrow/util/checked_cast.h" |
35 | #include "arrow/util/string.h" |
36 | #include "arrow/visitor_inline.h" |
37 | |
38 | namespace arrow { |
39 | |
40 | using internal::checked_cast; |
41 | |
42 | class PrettyPrinter { |
43 | public: |
44 | PrettyPrinter(int indent, int indent_size, int window, bool skip_new_lines, |
45 | std::ostream* sink) |
46 | : indent_(indent), |
47 | indent_size_(indent_size), |
48 | window_(window), |
49 | skip_new_lines_(skip_new_lines), |
50 | sink_(sink) {} |
51 | |
52 | void Write(const char* data); |
53 | void Write(const std::string& data); |
54 | void WriteIndented(const char* data); |
55 | void WriteIndented(const std::string& data); |
56 | void Newline(); |
57 | void Indent(); |
58 | void OpenArray(const Array& array); |
59 | void CloseArray(const Array& array); |
60 | |
61 | void Flush() { (*sink_) << std::flush; } |
62 | |
63 | protected: |
64 | int indent_; |
65 | int indent_size_; |
66 | int window_; |
67 | bool skip_new_lines_; |
68 | std::ostream* sink_; |
69 | }; |
70 | |
71 | void PrettyPrinter::OpenArray(const Array& array) { |
72 | Indent(); |
73 | (*sink_) << "[" ; |
74 | if (array.length() > 0) { |
75 | (*sink_) << "\n" ; |
76 | indent_ += indent_size_; |
77 | } |
78 | } |
79 | |
80 | void PrettyPrinter::CloseArray(const Array& array) { |
81 | if (array.length() > 0) { |
82 | indent_ -= indent_size_; |
83 | Indent(); |
84 | } |
85 | (*sink_) << "]" ; |
86 | } |
87 | |
88 | void PrettyPrinter::Write(const char* data) { (*sink_) << data; } |
89 | void PrettyPrinter::Write(const std::string& data) { (*sink_) << data; } |
90 | |
91 | void PrettyPrinter::WriteIndented(const char* data) { |
92 | Indent(); |
93 | Write(data); |
94 | } |
95 | |
96 | void PrettyPrinter::WriteIndented(const std::string& data) { |
97 | Indent(); |
98 | Write(data); |
99 | } |
100 | |
101 | void PrettyPrinter::Newline() { |
102 | if (skip_new_lines_) { |
103 | return; |
104 | } |
105 | (*sink_) << "\n" ; |
106 | Indent(); |
107 | } |
108 | |
109 | void PrettyPrinter::Indent() { |
110 | for (int i = 0; i < indent_; ++i) { |
111 | (*sink_) << " " ; |
112 | } |
113 | } |
114 | |
115 | class ArrayPrinter : public PrettyPrinter { |
116 | public: |
117 | ArrayPrinter(const Array& array, int indent, int indent_size, int window, |
118 | const std::string& null_rep, bool skip_new_lines, std::ostream* sink) |
119 | : PrettyPrinter(indent, indent_size, window, skip_new_lines, sink), |
120 | array_(array), |
121 | null_rep_(null_rep) {} |
122 | |
123 | template <typename FormatFunction> |
124 | void WriteValues(const Array& array, FormatFunction&& func) { |
125 | bool skip_comma = true; |
126 | for (int64_t i = 0; i < array.length(); ++i) { |
127 | if (skip_comma) { |
128 | skip_comma = false; |
129 | } else { |
130 | (*sink_) << ",\n" ; |
131 | } |
132 | Indent(); |
133 | if ((i >= window_) && (i < (array.length() - window_))) { |
134 | (*sink_) << "...\n" ; |
135 | i = array.length() - window_ - 1; |
136 | skip_comma = true; |
137 | } else if (array.IsNull(i)) { |
138 | (*sink_) << null_rep_; |
139 | } else { |
140 | func(i); |
141 | } |
142 | } |
143 | (*sink_) << "\n" ; |
144 | } |
145 | |
146 | template <typename T> |
147 | inline typename std::enable_if<IsInteger<T>::value, Status>::type WriteDataValues( |
148 | const T& array) { |
149 | const auto data = array.raw_values(); |
150 | WriteValues(array, [&](int64_t i) { (*sink_) << static_cast<int64_t>(data[i]); }); |
151 | return Status::OK(); |
152 | } |
153 | |
154 | template <typename T> |
155 | inline typename std::enable_if<IsFloatingPoint<T>::value, Status>::type WriteDataValues( |
156 | const T& array) { |
157 | const auto data = array.raw_values(); |
158 | WriteValues(array, [&](int64_t i) { (*sink_) << data[i]; }); |
159 | return Status::OK(); |
160 | } |
161 | |
162 | // String (Utf8) |
163 | template <typename T> |
164 | inline typename std::enable_if<std::is_same<StringArray, T>::value, Status>::type |
165 | WriteDataValues(const T& array) { |
166 | WriteValues(array, [&](int64_t i) { (*sink_) << "\"" << array.GetView(i) << "\"" ; }); |
167 | return Status::OK(); |
168 | } |
169 | |
170 | // Binary |
171 | template <typename T> |
172 | inline typename std::enable_if<std::is_same<BinaryArray, T>::value, Status>::type |
173 | WriteDataValues(const T& array) { |
174 | WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); |
175 | return Status::OK(); |
176 | } |
177 | |
178 | template <typename T> |
179 | inline |
180 | typename std::enable_if<std::is_same<FixedSizeBinaryArray, T>::value, Status>::type |
181 | WriteDataValues(const T& array) { |
182 | WriteValues(array, [&](int64_t i) { (*sink_) << HexEncode(array.GetView(i)); }); |
183 | return Status::OK(); |
184 | } |
185 | |
186 | template <typename T> |
187 | inline typename std::enable_if<std::is_same<Decimal128Array, T>::value, Status>::type |
188 | WriteDataValues(const T& array) { |
189 | WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); }); |
190 | return Status::OK(); |
191 | } |
192 | |
193 | template <typename T> |
194 | inline typename std::enable_if<std::is_base_of<BooleanArray, T>::value, Status>::type |
195 | WriteDataValues(const T& array) { |
196 | WriteValues(array, [&](int64_t i) { Write(array.Value(i) ? "true" : "false" ); }); |
197 | return Status::OK(); |
198 | } |
199 | |
200 | template <typename T> |
201 | inline typename std::enable_if<std::is_base_of<ListArray, T>::value, Status>::type |
202 | WriteDataValues(const T& array) { |
203 | bool skip_comma = true; |
204 | for (int64_t i = 0; i < array.length(); ++i) { |
205 | if (skip_comma) { |
206 | skip_comma = false; |
207 | } else { |
208 | (*sink_) << ",\n" ; |
209 | } |
210 | if ((i >= window_) && (i < (array.length() - window_))) { |
211 | Indent(); |
212 | (*sink_) << "...\n" ; |
213 | i = array.length() - window_ - 1; |
214 | skip_comma = true; |
215 | } else if (array.IsNull(i)) { |
216 | Indent(); |
217 | (*sink_) << null_rep_; |
218 | } else { |
219 | std::shared_ptr<Array> slice = |
220 | array.values()->Slice(array.value_offset(i), array.value_length(i)); |
221 | RETURN_NOT_OK(PrettyPrint(*slice, {indent_, window_}, sink_)); |
222 | } |
223 | } |
224 | (*sink_) << "\n" ; |
225 | return Status::OK(); |
226 | } |
227 | |
228 | Status Visit(const NullArray& array) { |
229 | (*sink_) << array.length() << " nulls" ; |
230 | return Status::OK(); |
231 | } |
232 | |
233 | template <typename T> |
234 | typename std::enable_if<std::is_base_of<PrimitiveArray, T>::value || |
235 | std::is_base_of<FixedSizeBinaryArray, T>::value || |
236 | std::is_base_of<BinaryArray, T>::value || |
237 | std::is_base_of<ListArray, T>::value, |
238 | Status>::type |
239 | Visit(const T& array) { |
240 | OpenArray(array); |
241 | if (array.length() > 0) { |
242 | RETURN_NOT_OK(WriteDataValues(array)); |
243 | } |
244 | CloseArray(array); |
245 | return Status::OK(); |
246 | } |
247 | |
248 | Status Visit(const IntervalArray&) { return Status::NotImplemented("interval" ); } |
249 | |
250 | Status WriteValidityBitmap(const Array& array); |
251 | |
252 | Status PrintChildren(const std::vector<std::shared_ptr<Array>>& fields, int64_t offset, |
253 | int64_t length) { |
254 | for (size_t i = 0; i < fields.size(); ++i) { |
255 | Newline(); |
256 | std::stringstream ss; |
257 | ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << "\n" ; |
258 | Write(ss.str()); |
259 | |
260 | std::shared_ptr<Array> field = fields[i]; |
261 | if (offset != 0) { |
262 | field = field->Slice(offset, length); |
263 | } |
264 | |
265 | RETURN_NOT_OK(PrettyPrint(*field, indent_ + indent_size_, sink_)); |
266 | } |
267 | return Status::OK(); |
268 | } |
269 | |
270 | Status Visit(const StructArray& array) { |
271 | RETURN_NOT_OK(WriteValidityBitmap(array)); |
272 | std::vector<std::shared_ptr<Array>> children; |
273 | children.reserve(array.num_fields()); |
274 | for (int i = 0; i < array.num_fields(); ++i) { |
275 | children.emplace_back(array.field(i)); |
276 | } |
277 | return PrintChildren(children, 0, array.length()); |
278 | } |
279 | |
280 | Status Visit(const UnionArray& array) { |
281 | RETURN_NOT_OK(WriteValidityBitmap(array)); |
282 | |
283 | Newline(); |
284 | Write("-- type_ids: " ); |
285 | UInt8Array type_ids(array.length(), array.type_ids(), nullptr, 0, array.offset()); |
286 | RETURN_NOT_OK(PrettyPrint(type_ids, indent_ + indent_size_, sink_)); |
287 | |
288 | if (array.mode() == UnionMode::DENSE) { |
289 | Newline(); |
290 | Write("-- value_offsets: " ); |
291 | Int32Array value_offsets(array.length(), array.value_offsets(), nullptr, 0, |
292 | array.offset()); |
293 | RETURN_NOT_OK(PrettyPrint(value_offsets, indent_ + indent_size_, sink_)); |
294 | } |
295 | |
296 | // Print the children without any offset, because the type ids are absolute |
297 | std::vector<std::shared_ptr<Array>> children; |
298 | children.reserve(array.num_fields()); |
299 | for (int i = 0; i < array.num_fields(); ++i) { |
300 | children.emplace_back(array.child(i)); |
301 | } |
302 | return PrintChildren(children, 0, array.length() + array.offset()); |
303 | } |
304 | |
305 | Status Visit(const DictionaryArray& array) { |
306 | Newline(); |
307 | Write("-- dictionary:\n" ); |
308 | RETURN_NOT_OK(PrettyPrint(*array.dictionary(), indent_ + indent_size_, sink_)); |
309 | |
310 | Newline(); |
311 | Write("-- indices:\n" ); |
312 | return PrettyPrint(*array.indices(), indent_ + indent_size_, sink_); |
313 | } |
314 | |
315 | Status Print() { |
316 | RETURN_NOT_OK(VisitArrayInline(array_, this)); |
317 | Flush(); |
318 | return Status::OK(); |
319 | } |
320 | |
321 | private: |
322 | const Array& array_; |
323 | std::string null_rep_; |
324 | }; |
325 | |
326 | Status ArrayPrinter::WriteValidityBitmap(const Array& array) { |
327 | Indent(); |
328 | Write("-- is_valid:" ); |
329 | |
330 | if (array.null_count() > 0) { |
331 | Newline(); |
332 | BooleanArray is_valid(array.length(), array.null_bitmap(), nullptr, 0, |
333 | array.offset()); |
334 | return PrettyPrint(is_valid, indent_ + indent_size_, sink_); |
335 | } else { |
336 | Write(" all not null" ); |
337 | return Status::OK(); |
338 | } |
339 | } |
340 | |
341 | Status PrettyPrint(const Array& arr, int indent, std::ostream* sink) { |
342 | ArrayPrinter printer(arr, indent, 2, 10, "null" , false, sink); |
343 | return printer.Print(); |
344 | } |
345 | |
346 | Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, |
347 | std::ostream* sink) { |
348 | ArrayPrinter printer(arr, options.indent, options.indent_size, options.window, |
349 | options.null_rep, options.skip_new_lines, sink); |
350 | return printer.Print(); |
351 | } |
352 | |
353 | Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options, |
354 | std::string* result) { |
355 | std::ostringstream sink; |
356 | RETURN_NOT_OK(PrettyPrint(arr, options, &sink)); |
357 | *result = sink.str(); |
358 | return Status::OK(); |
359 | } |
360 | |
361 | Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, |
362 | std::ostream* sink) { |
363 | int num_chunks = chunked_arr.num_chunks(); |
364 | int indent = options.indent; |
365 | int window = options.window; |
366 | |
367 | for (int i = 0; i < indent; ++i) { |
368 | (*sink) << " " ; |
369 | } |
370 | (*sink) << "[\n" ; |
371 | bool skip_comma = true; |
372 | for (int i = 0; i < num_chunks; ++i) { |
373 | if (skip_comma) { |
374 | skip_comma = false; |
375 | } else { |
376 | (*sink) << ",\n" ; |
377 | } |
378 | if ((i >= window) && (i < (num_chunks - window))) { |
379 | for (int i = 0; i < indent; ++i) { |
380 | (*sink) << " " ; |
381 | } |
382 | (*sink) << "...\n" ; |
383 | i = num_chunks - window - 1; |
384 | skip_comma = true; |
385 | } else { |
386 | ArrayPrinter printer(*chunked_arr.chunk(i), indent + options.indent_size, |
387 | options.indent_size, window, options.null_rep, |
388 | options.skip_new_lines, sink); |
389 | RETURN_NOT_OK(printer.Print()); |
390 | } |
391 | } |
392 | (*sink) << "\n" ; |
393 | |
394 | for (int i = 0; i < indent; ++i) { |
395 | (*sink) << " " ; |
396 | } |
397 | (*sink) << "]" ; |
398 | |
399 | return Status::OK(); |
400 | } |
401 | |
402 | Status PrettyPrint(const Column& column, const PrettyPrintOptions& options, |
403 | std::ostream* sink) { |
404 | for (int i = 0; i < options.indent; ++i) { |
405 | (*sink) << " " ; |
406 | } |
407 | (*sink) << column.field()->ToString() << "\n" ; |
408 | |
409 | return PrettyPrint(*column.data(), options, sink); |
410 | } |
411 | |
412 | Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, |
413 | std::string* result) { |
414 | std::ostringstream sink; |
415 | RETURN_NOT_OK(PrettyPrint(chunked_arr, options, &sink)); |
416 | *result = sink.str(); |
417 | return Status::OK(); |
418 | } |
419 | |
420 | Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) { |
421 | for (int i = 0; i < batch.num_columns(); ++i) { |
422 | const std::string& name = batch.column_name(i); |
423 | (*sink) << name << ": " ; |
424 | RETURN_NOT_OK(PrettyPrint(*batch.column(i), indent + 2, sink)); |
425 | (*sink) << "\n" ; |
426 | } |
427 | (*sink) << std::flush; |
428 | return Status::OK(); |
429 | } |
430 | |
431 | Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, |
432 | std::ostream* sink) { |
433 | RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink)); |
434 | (*sink) << "\n" ; |
435 | (*sink) << "----\n" ; |
436 | |
437 | PrettyPrintOptions column_options = options; |
438 | column_options.indent += 2; |
439 | for (int i = 0; i < table.num_columns(); ++i) { |
440 | for (int j = 0; j < options.indent; ++j) { |
441 | (*sink) << " " ; |
442 | } |
443 | (*sink) << table.schema()->field(i)->name() << ":\n" ; |
444 | RETURN_NOT_OK(PrettyPrint(*table.column(i)->data(), column_options, sink)); |
445 | (*sink) << "\n" ; |
446 | } |
447 | (*sink) << std::flush; |
448 | return Status::OK(); |
449 | } |
450 | |
451 | Status DebugPrint(const Array& arr, int indent) { |
452 | return PrettyPrint(arr, indent, &std::cout); |
453 | } |
454 | |
455 | class SchemaPrinter : public PrettyPrinter { |
456 | public: |
457 | SchemaPrinter(const Schema& schema, int indent, int indent_size, int window, |
458 | bool skip_new_lines, std::ostream* sink) |
459 | : PrettyPrinter(indent, indent_size, window, skip_new_lines, sink), |
460 | schema_(schema) {} |
461 | |
462 | Status PrintType(const DataType& type); |
463 | Status PrintField(const Field& field); |
464 | |
465 | Status Print() { |
466 | for (int i = 0; i < schema_.num_fields(); ++i) { |
467 | if (i > 0) { |
468 | Newline(); |
469 | } |
470 | RETURN_NOT_OK(PrintField(*schema_.field(i))); |
471 | } |
472 | Flush(); |
473 | return Status::OK(); |
474 | } |
475 | |
476 | private: |
477 | const Schema& schema_; |
478 | }; |
479 | |
480 | Status SchemaPrinter::PrintType(const DataType& type) { |
481 | Write(type.ToString()); |
482 | if (type.id() == Type::DICTIONARY) { |
483 | indent_ += indent_size_; |
484 | Newline(); |
485 | Write("dictionary:\n" ); |
486 | const auto& dict_type = checked_cast<const DictionaryType&>(type); |
487 | RETURN_NOT_OK(PrettyPrint(*dict_type.dictionary(), indent_ + indent_size_, sink_)); |
488 | indent_ -= indent_size_; |
489 | } else { |
490 | for (int i = 0; i < type.num_children(); ++i) { |
491 | Newline(); |
492 | |
493 | std::stringstream ss; |
494 | ss << "child " << i << ", " ; |
495 | |
496 | indent_ += indent_size_; |
497 | WriteIndented(ss.str()); |
498 | RETURN_NOT_OK(PrintField(*type.child(i))); |
499 | indent_ -= indent_size_; |
500 | } |
501 | } |
502 | return Status::OK(); |
503 | } |
504 | |
505 | Status SchemaPrinter::PrintField(const Field& field) { |
506 | Write(field.name()); |
507 | Write(": " ); |
508 | return PrintType(*field.type()); |
509 | } |
510 | |
511 | Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, |
512 | std::ostream* sink) { |
513 | SchemaPrinter printer(schema, options.indent, options.indent_size, options.window, |
514 | options.skip_new_lines, sink); |
515 | return printer.Print(); |
516 | } |
517 | |
518 | Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, |
519 | std::string* result) { |
520 | std::ostringstream sink; |
521 | RETURN_NOT_OK(PrettyPrint(schema, options, &sink)); |
522 | *result = sink.str(); |
523 | return Status::OK(); |
524 | } |
525 | |
526 | } // namespace arrow |
527 | |