| 1 | #pragma once |
| 2 | |
| 3 | #include <Core/Block.h> |
| 4 | #include <Processors/Formats/IRowInputFormat.h> |
| 5 | #include <Formats/FormatSettings.h> |
| 6 | #include <Common/HashTable/HashMap.h> |
| 7 | |
| 8 | |
| 9 | namespace DB |
| 10 | { |
| 11 | |
| 12 | class ReadBuffer; |
| 13 | |
| 14 | |
/** A stream for reading data in JSON format, where each row is represented by a separate JSON object.
  * Objects can be separated by a line feed, by any number of other whitespace characters, and optionally by a comma.
  * Fields can be listed in any order (the order may even differ from one row to the next),
  * and some fields may be missing.
  */
| 20 | class JSONEachRowRowInputFormat : public IRowInputFormat |
| 21 | { |
| 22 | public: |
| 23 | JSONEachRowRowInputFormat(ReadBuffer & in_, const Block & , Params params_, const FormatSettings & format_settings_); |
| 24 | |
| 25 | String getName() const override { return "JSONEachRowRowInputFormat" ; } |
| 26 | |
| 27 | bool readRow(MutableColumns & columns, RowReadExtension & ext) override; |
| 28 | bool allowSyncAfterError() const override { return true; } |
| 29 | void syncAfterError() override; |
| 30 | void resetParser() override; |
| 31 | |
| 32 | private: |
| 33 | const String & columnName(size_t i) const; |
| 34 | size_t columnIndex(const StringRef & name, size_t key_index); |
| 35 | bool advanceToNextKey(size_t key_index); |
| 36 | void skipUnknownField(const StringRef & name_ref); |
| 37 | StringRef readColumnName(ReadBuffer & buf); |
| 38 | void readField(size_t index, MutableColumns & columns); |
| 39 | void readJSONObject(MutableColumns & columns); |
| 40 | void readNestedData(const String & name, MutableColumns & columns); |
| 41 | |
| 42 | private: |
| 43 | |
| 44 | const FormatSettings format_settings; |
| 45 | |
| 46 | /// Buffer for the read from the stream field name. Used when you have to copy it. |
| 47 | /// Also, if processing of Nested data is in progress, it holds the common prefix |
| 48 | /// of the nested column names (so that appending the field name to it produces |
| 49 | /// the full column name) |
| 50 | String current_column_name; |
| 51 | |
| 52 | /// If processing Nested data, holds the length of the common prefix |
| 53 | /// of the names of related nested columns. For example, for a table |
| 54 | /// created as follows |
| 55 | /// CREATE TABLE t (n Nested (i Int32, s String)) |
| 56 | /// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.' |
| 57 | size_t nested_prefix_length = 0; |
| 58 | |
| 59 | /// Set of columns for which the values were read. The rest will be filled with default values. |
| 60 | std::vector<UInt8> read_columns; |
| 61 | /// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name. |
| 62 | std::vector<UInt8> seen_columns; |
| 63 | /// These sets may be different, because if null_as_default=1 read_columns[i] will be false and seen_columns[i] will be true |
| 64 | /// for row like {..., "non-nullable column name" : null, ...} |
| 65 | |
| 66 | /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. |
| 67 | using NameMap = HashMap<StringRef, size_t, StringRefHash>; |
| 68 | NameMap name_map; |
| 69 | |
| 70 | /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. |
| 71 | std::vector<NameMap::LookupResult> prev_positions; |
| 72 | }; |
| 73 | |
| 74 | } |
| 75 | |