// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
17
18#ifndef ARROW_CSV_CHUNKER_H
19#define ARROW_CSV_CHUNKER_H
20
21#include <cstdint>
22
23#include "arrow/csv/options.h"
24#include "arrow/status.h"
25#include "arrow/util/macros.h"
26#include "arrow/util/visibility.h"
27
28namespace arrow {
29namespace csv {
30
31/// \class Chunker
32/// \brief A reusable block-based chunker for CSV data
33///
34/// The chunker takes a block of CSV data and finds a suitable place
35/// to cut it up without splitting a row.
36/// If the block is truncated (i.e. not all data can be chunked), it is up
37/// to the caller to arrange the next block to start with the trailing data.
38///
39/// Note: if the previous block ends with CR (0x0d) and a new block starts
40/// with LF (0x0a), the chunker will consider the leading newline as an empty line.
41class ARROW_EXPORT Chunker {
42 public:
43 explicit Chunker(ParseOptions options);
44
45 /// \brief Carve up a chunk in a block of data
46 ///
47 /// Process a block of CSV data, reading up to size bytes.
48 /// The number of bytes in the chunk is returned in out_size.
49 Status Process(const char* data, uint32_t size, uint32_t* out_size);
50
51 protected:
52 ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
53
54 // Like Process(), but specialized for some parsing options
55 template <bool quoting, bool escaping>
56 Status ProcessSpecialized(const char* data, uint32_t size, uint32_t* out_size);
57
58 // Detect a single line from the data pointer. Return the line end,
59 // or nullptr if the remaining line is truncated.
60 template <bool quoting, bool escaping>
61 inline const char* ReadLine(const char* data, const char* data_end);
62
63 ParseOptions options_;
64};
65
66} // namespace csv
67} // namespace arrow
68
69#endif // ARROW_CSV_CHUNKER_H
70