1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_CSV_CHUNKER_H |
19 | #define ARROW_CSV_CHUNKER_H |
20 | |
21 | #include <cstdint> |
22 | |
23 | #include "arrow/csv/options.h" |
24 | #include "arrow/status.h" |
25 | #include "arrow/util/macros.h" |
26 | #include "arrow/util/visibility.h" |
27 | |
28 | namespace arrow { |
29 | namespace csv { |
30 | |
31 | /// \class Chunker |
32 | /// \brief A reusable block-based chunker for CSV data |
33 | /// |
34 | /// The chunker takes a block of CSV data and finds a suitable place |
35 | /// to cut it up without splitting a row. |
36 | /// If the block is truncated (i.e. not all data can be chunked), it is up |
37 | /// to the caller to arrange the next block to start with the trailing data. |
38 | /// |
39 | /// Note: if the previous block ends with CR (0x0d) and a new block starts |
40 | /// with LF (0x0a), the chunker will consider the leading newline as an empty line. |
41 | class ARROW_EXPORT Chunker { |
42 | public: |
43 | explicit Chunker(ParseOptions options); |
44 | |
45 | /// \brief Carve up a chunk in a block of data |
46 | /// |
47 | /// Process a block of CSV data, reading up to size bytes. |
48 | /// The number of bytes in the chunk is returned in out_size. |
49 | Status Process(const char* data, uint32_t size, uint32_t* out_size); |
50 | |
51 | protected: |
52 | ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker); |
53 | |
54 | // Like Process(), but specialized for some parsing options |
55 | template <bool quoting, bool escaping> |
56 | Status ProcessSpecialized(const char* data, uint32_t size, uint32_t* out_size); |
57 | |
58 | // Detect a single line from the data pointer. Return the line end, |
59 | // or nullptr if the remaining line is truncated. |
60 | template <bool quoting, bool escaping> |
61 | inline const char* ReadLine(const char* data, const char* data_end); |
62 | |
63 | ParseOptions options_; |
64 | }; |
65 | |
66 | } // namespace csv |
67 | } // namespace arrow |
68 | |
69 | #endif // ARROW_CSV_CHUNKER_H |
70 | |