| 1 | #pragma once |
| 2 | |
| 3 | #include <optional> |
| 4 | #include <unordered_map> |
| 5 | |
| 6 | #include <Core/Block.h> |
| 7 | #include <Processors/Formats/RowInputFormatWithDiagnosticInfo.h> |
| 8 | #include <Formats/FormatSettings.h> |
| 9 | |
| 10 | |
| 11 | namespace DB |
| 12 | { |
| 13 | |
| 14 | /** A stream for inputting data in csv format. |
| 15 | * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. |
| 16 | */ |
| 17 | class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo |
| 18 | { |
| 19 | public: |
| 20 | /** with_names - in the first line the header with column names |
| 21 | */ |
| 22 | CSVRowInputFormat(const Block & , ReadBuffer & in_, const Params & params_, |
| 23 | bool with_names_, const FormatSettings & format_settings_); |
| 24 | |
| 25 | String getName() const override { return "CSVRowInputFormat" ; } |
| 26 | |
| 27 | bool readRow(MutableColumns & columns, RowReadExtension & ext) override; |
| 28 | void readPrefix() override; |
| 29 | bool allowSyncAfterError() const override { return true; } |
| 30 | void syncAfterError() override; |
| 31 | void resetParser() override; |
| 32 | |
| 33 | private: |
| 34 | bool with_names; |
| 35 | const FormatSettings format_settings; |
| 36 | DataTypes data_types; |
| 37 | |
| 38 | using IndexesMap = std::unordered_map<String, size_t>; |
| 39 | IndexesMap column_indexes_by_names; |
| 40 | |
| 41 | /// Maps indexes of columns in the input file to indexes of table columns |
| 42 | using OptionalIndexes = std::vector<std::optional<size_t>>; |
| 43 | OptionalIndexes column_indexes_for_input_fields; |
| 44 | |
| 45 | /// Tracks which columns we have read in a single read() call. |
| 46 | /// For columns that are never read, it is initialized to false when we |
| 47 | /// read the file header, and never changed afterwards. |
| 48 | /// For other columns, it is updated on each read() call. |
| 49 | std::vector<UInt8> read_columns; |
| 50 | |
| 51 | /// Whether we have any columns that are not read from file at all, |
| 52 | /// and must be always initialized with defaults. |
| 53 | bool have_always_default_columns = false; |
| 54 | |
| 55 | void addInputColumn(const String & column_name); |
| 56 | |
| 57 | bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; |
| 58 | void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, |
| 59 | ReadBuffer::Position & prev_pos, ReadBuffer::Position & curr_pos) override; |
| 60 | bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override |
| 61 | { |
| 62 | return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter; |
| 63 | } |
| 64 | |
| 65 | bool readField(IColumn & column, const DataTypePtr & type, bool is_last_file_column); |
| 66 | }; |
| 67 | |
| 68 | } |
| 69 | |