1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_RECORD_BATCH_H |
19 | #define ARROW_RECORD_BATCH_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <string> |
24 | #include <vector> |
25 | |
26 | #include "arrow/type.h" |
27 | #include "arrow/util/macros.h" |
28 | #include "arrow/util/visibility.h" |
29 | |
30 | namespace arrow { |
31 | |
32 | class Array; |
33 | struct ArrayData; |
34 | class Status; |
35 | class Table; |
36 | |
37 | /// \class RecordBatch |
38 | /// \brief Collection of equal-length arrays matching a particular Schema |
39 | /// |
40 | /// A record batch is a table-like data structure that is semantically a sequence |
41 | /// of fields, each a contiguous Arrow array |
42 | class ARROW_EXPORT RecordBatch { |
43 | public: |
44 | virtual ~RecordBatch() = default; |
45 | |
46 | /// \param[in] schema The record batch schema |
47 | /// \param[in] num_rows length of fields in the record batch. Each array |
48 | /// should have the same length as num_rows |
49 | /// \param[in] columns the record batch fields as vector of arrays |
50 | static std::shared_ptr<RecordBatch> Make( |
51 | const std::shared_ptr<Schema>& schema, int64_t num_rows, |
52 | const std::vector<std::shared_ptr<Array>>& columns); |
53 | |
54 | /// \brief Move-based factory for a vector of Array instances |
55 | static std::shared_ptr<RecordBatch> Make(const std::shared_ptr<Schema>& schema, |
56 | int64_t num_rows, |
57 | std::vector<std::shared_ptr<Array>>&& columns); |
58 | |
59 | /// \brief Construct record batch from vector of internal data structures |
60 | /// \since 0.5.0 |
61 | /// |
62 | /// This overload only accepts the input data by rvalue-reference, |
63 | /// and is intended for internal use, or advanced users. |
64 | /// |
65 | /// \param schema the record batch schema |
66 | /// \param num_rows the number of semantic rows in the record batch. This |
67 | /// should be equal to the length of each field |
68 | /// \param columns the data for the batch's columns |
69 | static std::shared_ptr<RecordBatch> Make( |
70 | const std::shared_ptr<Schema>& schema, int64_t num_rows, |
71 | std::vector<std::shared_ptr<ArrayData>>&& columns); |
72 | |
73 | /// \brief Construct record batch by copying vector of array data |
74 | /// \since 0.5.0 |
75 | static std::shared_ptr<RecordBatch> Make( |
76 | const std::shared_ptr<Schema>& schema, int64_t num_rows, |
77 | const std::vector<std::shared_ptr<ArrayData>>& columns); |
78 | |
79 | /// \brief Determine if two record batches are exactly equal |
80 | /// \return true if batches are equal |
81 | bool Equals(const RecordBatch& other) const; |
82 | |
83 | /// \brief Determine if two record batches are approximately equal |
84 | bool ApproxEquals(const RecordBatch& other) const; |
85 | |
86 | /// \brief Access the record batch's schema |
87 | /// \return the record batch's schema (the schema_ member) |
88 | std::shared_ptr<Schema> schema() const { return schema_; } |
89 | |
90 | /// \brief Retrieve an array from the record batch |
91 | /// \param[in] i field index, does not boundscheck |
92 | /// \return an Array object |
93 | virtual std::shared_ptr<Array> column(int i) const = 0; |
94 | |
95 | /// \brief Retrieve an array's internal data from the record batch |
96 | /// \param[in] i field index, does not boundscheck |
97 | /// \return an internal ArrayData object |
98 | virtual std::shared_ptr<ArrayData> column_data(int i) const = 0; |
99 | |
100 | /// \brief Add column to the record batch, producing a new RecordBatch |
101 | /// |
102 | /// \param[in] i field index, which will be boundschecked |
103 | /// \param[in] field field to be added |
104 | /// \param[in] column column to be added |
105 | /// \param[out] out record batch with column added |
106 | virtual Status AddColumn(int i, const std::shared_ptr<Field>& field, |
107 | const std::shared_ptr<Array>& column, |
108 | std::shared_ptr<RecordBatch>* out) const = 0; |
109 | |
110 | /// \brief Add new nullable column to the record batch, producing a new |
111 | /// RecordBatch. |
112 | /// |
113 | /// For non-nullable columns, use the Field-based version of this method. |
114 | /// |
115 | /// \param[in] i field index, which will be boundschecked |
116 | /// \param[in] field_name name of field to be added |
117 | /// \param[in] column column to be added |
118 | /// \param[out] out record batch with column added |
119 | virtual Status AddColumn(int i, const std::string& field_name, |
120 | const std::shared_ptr<Array>& column, |
121 | std::shared_ptr<RecordBatch>* out) const; |
122 | |
123 | /// \brief Remove column from the record batch, producing a new RecordBatch |
124 | /// |
125 | /// \param[in] i field index, does boundscheck |
126 | /// \param[out] out record batch with column removed |
127 | virtual Status RemoveColumn(int i, std::shared_ptr<RecordBatch>* out) const = 0; |
128 | |
129 | virtual std::shared_ptr<RecordBatch> ReplaceSchemaMetadata( |
130 | const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0; |
131 | |
132 | /// \brief Name of the i-th column |
133 | const std::string& column_name(int i) const; |
134 | |
135 | /// \return the number of columns, i.e. the number of fields in the schema |
136 | int num_columns() const { return schema_->num_fields(); } |
137 | |
138 | /// \return the number of rows (the corresponding length of each column) |
139 | int64_t num_rows() const { return num_rows_; } |
140 | |
141 | /// \brief Slice each of the arrays in the record batch |
142 | /// \param[in] offset the starting offset to slice, through end of batch |
143 | /// \return new record batch |
144 | virtual std::shared_ptr<RecordBatch> Slice(int64_t offset) const; |
145 | |
146 | /// \brief Slice each of the arrays in the record batch |
147 | /// \param[in] offset the starting offset to slice |
148 | /// \param[in] length the number of elements to slice from offset |
149 | /// \return new record batch |
150 | virtual std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const = 0; |
151 | |
152 | /// \brief Check for schema or length inconsistencies |
153 | /// \return Status |
154 | virtual Status Validate() const; |
155 | |
156 | protected: |
157 | RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows); |
158 | |
159 | std::shared_ptr<Schema> schema_; |
160 | int64_t num_rows_; |
161 | |
162 | private: |
163 | ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch); |
164 | }; |
165 | |
166 | /// \brief Abstract interface for reading a stream of record batches |
167 | class ARROW_EXPORT RecordBatchReader { |
168 | public: |
169 | virtual ~RecordBatchReader(); |
170 | |
171 | /// \return the shared schema of the record batches in the stream |
172 | virtual std::shared_ptr<Schema> schema() const = 0; |
173 | |
174 | /// \brief Read the next record batch in the stream. Return null for batch |
175 | /// when reaching end of stream |
176 | /// |
177 | /// \param[out] batch the next loaded batch, null at end of stream |
178 | /// \return Status |
179 | virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0; |
180 | |
181 | /// \brief Consume the entire stream as a vector of record batches |
182 | Status ReadAll(std::vector<std::shared_ptr<RecordBatch>>* batches); |
183 | |
184 | /// \brief Read all batches and concatenate them as an arrow::Table |
185 | Status ReadAll(std::shared_ptr<Table>* table); |
186 | }; |
187 | |
188 | } // namespace arrow |
189 | |
190 | #endif // ARROW_RECORD_BATCH_H |
191 | |