1 | #pragma once |
2 | |
3 | #include <optional> |
4 | #include <unordered_map> |
5 | |
6 | #include <Core/Block.h> |
7 | #include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h> |
8 | #include <Formats/FormatSettings.h> |
9 | |
10 | |
11 | namespace DB |
12 | { |
13 | |
14 | /** A stream for inputting data in csv format. |
15 | * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. |
16 | */ |
17 | class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo |
18 | { |
19 | public: |
20 | /** with_names - in the first line the header with column names |
21 | */ |
22 | CSVRowInputFormat(const Block & , ReadBuffer & in_, const Params & params_, |
23 | bool with_names_, const FormatSettings & format_settings_); |
24 | |
25 | String getName() const override { return "CSVRowInputFormat" ; } |
26 | |
27 | bool readRow(MutableColumns & columns, RowReadExtension & ext) override; |
28 | void readPrefix() override; |
29 | bool allowSyncAfterError() const override { return true; } |
30 | void syncAfterError() override; |
31 | void resetParser() override; |
32 | |
33 | private: |
34 | bool with_names; |
35 | const FormatSettings format_settings; |
36 | DataTypes data_types; |
37 | |
38 | using IndexesMap = std::unordered_map<String, size_t>; |
39 | IndexesMap column_indexes_by_names; |
40 | |
41 | /// Maps indexes of columns in the input file to indexes of table columns |
42 | using OptionalIndexes = std::vector<std::optional<size_t>>; |
43 | OptionalIndexes column_indexes_for_input_fields; |
44 | |
45 | /// Tracks which columns we have read in a single read() call. |
46 | /// For columns that are never read, it is initialized to false when we |
47 | /// read the file header, and never changed afterwards. |
48 | /// For other columns, it is updated on each read() call. |
49 | std::vector<UInt8> read_columns; |
50 | |
51 | /// Whether we have any columns that are not read from file at all, |
52 | /// and must be always initialized with defaults. |
53 | bool have_always_default_columns = false; |
54 | |
55 | void addInputColumn(const String & column_name); |
56 | |
57 | bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; |
58 | void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, |
59 | ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; |
60 | bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override |
61 | { |
62 | return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; |
63 | } |
64 | |
65 | bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); |
66 | }; |
67 | |
68 | } |
69 | |