1#pragma once
2
3#include <Core/Block.h>
4#include <Processors/Formats/IRowInputFormat.h>
5#include <Formats/FormatSettings.h>
6#include <Common/HashTable/HashMap.h>
7
8
9namespace DB
10{
11
12class ReadBuffer;
13
14
/** Stream for reading data in TSKV format.
  * TSKV is a very inefficient data format.
  * It is similar to TSV, but each field is written as key=value.
  * Fields can be listed in any order (the order may even differ from line to line),
  *  and some fields may be missing.
  * An equal sign inside a field name can be escaped.
  * A line may also contain an extra, meaningless `tskv` fragment, which must be ignored.
  */
23class TSKVRowInputFormat : public IRowInputFormat
24{
25public:
26 TSKVRowInputFormat(ReadBuffer & in_, Block header_, Params params_, const FormatSettings & format_settings_);
27
28 String getName() const override { return "TSKVRowInputFormat"; }
29
30 bool readRow(MutableColumns & columns, RowReadExtension &) override;
31 bool allowSyncAfterError() const override { return true; }
32 void syncAfterError() override;
33 void resetParser() override;
34
35
36private:
37 const FormatSettings format_settings;
38
39 /// Buffer for the read from the stream the field name. Used when you have to copy it.
40 String name_buf;
41
42 /// Hash table matching `field name -> position in the block`. NOTE You can use perfect hash map.
43 using NameMap = HashMap<StringRef, size_t, StringRefHash>;
44 NameMap name_map;
45
46 /// Set of columns for which the values were read. The rest will be filled with default values.
47 std::vector<UInt8> read_columns;
48 /// Set of columns which already met in row. Exception is thrown if there are more than one column with the same name.
49 std::vector<UInt8> seen_columns;
50 /// These sets may be different, because if null_as_default=1 read_columns[i] will be false and seen_columns[i] will be true
51 /// for row like ..., non-nullable column name=\N, ...
52};
53
54}
55