1 | #pragma once |
2 | |
3 | #include <Core/Block.h> |
4 | #include <Processors/Formats/IRowInputFormat.h> |
5 | #include <Formats/FormatSettings.h> |
6 | #include <Common/HashTable/HashMap.h> |
7 | |
8 | |
9 | namespace DB |
10 | { |
11 | |
12 | class ReadBuffer; |
13 | |
14 | |
15 | /** A stream for reading data in JSON format, where each row is represented by a separate JSON object. |
16 | * Objects can be separated by line feed, other whitespace characters in any number and possibly a comma. |
17 | * Fields can be listed in any order (including, in different lines there may be different order), |
18 | * and some fields may be missing. |
19 | */ |
20 | class JSONEachRowRowInputFormat : public IRowInputFormat |
21 | { |
22 | public: |
23 | JSONEachRowRowInputFormat(ReadBuffer & in_, const Block & , Params params_, const FormatSettings & format_settings_); |
24 | |
25 | String getName() const override { return "JSONEachRowRowInputFormat" ; } |
26 | |
27 | bool readRow(MutableColumns & columns, RowReadExtension & ext) override; |
28 | bool allowSyncAfterError() const override { return true; } |
29 | void syncAfterError() override; |
30 | void resetParser() override; |
31 | |
32 | private: |
33 | const String & columnName(size_t i) const; |
34 | size_t columnIndex(const StringRef & name, size_t key_index); |
35 | bool advanceToNextKey(size_t key_index); |
36 | void skipUnknownField(const StringRef & name_ref); |
37 | StringRef readColumnName(ReadBuffer & buf); |
38 | void readField(size_t index, MutableColumns & columns); |
39 | void readJSONObject(MutableColumns & columns); |
40 | void readNestedData(const String & name, MutableColumns & columns); |
41 | |
42 | private: |
43 | |
44 | const FormatSettings format_settings; |
45 | |
46 | /// Buffer for the read from the stream field name. Used when you have to copy it. |
47 | /// Also, if processing of Nested data is in progress, it holds the common prefix |
48 | /// of the nested column names (so that appending the field name to it produces |
49 | /// the full column name) |
50 | String current_column_name; |
51 | |
52 | /// If processing Nested data, holds the length of the common prefix |
53 | /// of the names of related nested columns. For example, for a table |
54 | /// created as follows |
55 | /// CREATE TABLE t (n Nested (i Int32, s String)) |
56 | /// the nested column names are 'n.i' and 'n.s' and the nested prefix is 'n.' |
57 | size_t nested_prefix_length = 0; |
58 | |
59 | /// Set of columns for which the values were read. The rest will be filled with default values. |
60 | std::vector<UInt8> read_columns; |
61 | /// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name. |
62 | std::vector<UInt8> seen_columns; |
63 | /// These sets may be different, because if null_as_default=1 read_columns[i] will be false and seen_columns[i] will be true |
64 | /// for row like {..., "non-nullable column name" : null, ...} |
65 | |
66 | /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. |
67 | using NameMap = HashMap<StringRef, size_t, StringRefHash>; |
68 | NameMap name_map; |
69 | |
70 | /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. |
71 | std::vector<NameMap::LookupResult> prev_positions; |
72 | }; |
73 | |
74 | } |
75 | |