1 | #pragma once |
2 | |
3 | #include <Core/Block.h> |
4 | #include <Processors/Formats/IRowInputFormat.h> |
5 | #include <Formats/FormatSettings.h> |
6 | #include <Common/HashTable/HashMap.h> |
7 | |
8 | |
9 | namespace DB |
10 | { |
11 | |
12 | class ReadBuffer; |
13 | |
14 | |
15 | /** Row input format for reading data in TSKV format. |
16 | * TSKV is a very inefficient data format. |
17 | * It is similar to TSV, but each field is written as key=value. |
18 | * Fields may be listed in any order (the order may even differ from line to line), |
19 | * and some fields may be missing. |
20 | * An equal sign can be escaped in the field name. |
21 | * A line may additionally contain a useless `tskv` fragment, which must be ignored. |
22 | */ |
23 | class TSKVRowInputFormat : public IRowInputFormat |
24 | { |
25 | public: |
26 | TSKVRowInputFormat(ReadBuffer & in_, Block , Params params_, const FormatSettings & format_settings_); |
27 | |
28 | String getName() const override { return "TSKVRowInputFormat" ; } |
29 | |
30 | bool readRow(MutableColumns & columns, RowReadExtension &) override; |
31 | bool allowSyncAfterError() const override { return true; } |
32 | void syncAfterError() override; |
33 | void resetParser() override; |
34 | |
35 | |
36 | private: |
37 | const FormatSettings format_settings; |
38 | |
39 | /// Buffer for the field name read from the stream. Used when the name has to be copied. |
40 | String name_buf; |
41 | |
42 | /// Hash table mapping `field name -> position in the block`. NOTE: a perfect hash map could be used here instead. |
43 | using NameMap = HashMap<StringRef, size_t, StringRefHash>; |
44 | NameMap name_map; |
45 | |
46 | /// Set of columns whose values were read. The remaining columns will be filled with default values. |
47 | std::vector<UInt8> read_columns; |
48 | /// Set of columns already encountered in the row. An exception is thrown if the same column name occurs more than once. |
49 | std::vector<UInt8> seen_columns; |
50 | /// These two sets may differ: if null_as_default=1, read_columns[i] will be false while seen_columns[i] will be true |
51 | /// for a row like ..., non-nullable column name=\N, ... |
52 | }; |
53 | |
54 | } |
55 | |