1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_TABLE_H |
19 | #define ARROW_TABLE_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | #include <string> |
24 | #include <vector> |
25 | |
26 | #include "arrow/array.h" |
27 | #include "arrow/record_batch.h" |
28 | #include "arrow/type.h" |
29 | #include "arrow/util/macros.h" |
30 | #include "arrow/util/visibility.h" |
31 | |
32 | namespace arrow { |
33 | |
34 | class MemoryPool; |
35 | class Status; |
36 | |
37 | /// \class ChunkedArray |
38 | /// \brief A data structure managing a list of primitive Arrow arrays logically |
39 | /// as one large array |
40 | class ARROW_EXPORT ChunkedArray { |
41 | public: |
42 | /// \brief Construct a chunked array from a vector of arrays |
43 | /// |
44 | /// The vector should be non-empty and all its elements should have the same |
45 | /// data type. |
46 | explicit ChunkedArray(const ArrayVector& chunks); |
47 | |
48 | /// \brief Construct a chunked array from a single Array |
49 | explicit ChunkedArray(const std::shared_ptr<Array>& chunk) |
50 | : ChunkedArray(ArrayVector({chunk})) {} |
51 | |
52 | /// \brief Construct a chunked array from a vector of arrays and a data type |
53 | /// |
54 | /// As the data type is passed explicitly, the vector may be empty. |
55 | ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type); |
56 | |
57 | /// \return the total length of the chunked array; computed on construction |
58 | int64_t length() const { return length_; } |
59 | |
60 | /// \return the total number of nulls among all chunks |
61 | int64_t null_count() const { return null_count_; } |
62 | |
63 | int num_chunks() const { return static_cast<int>(chunks_.size()); } |
64 | |
65 | /// \return chunk a particular chunk from the chunked array |
66 | std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; } |
67 | |
68 | const ArrayVector& chunks() const { return chunks_; } |
69 | |
70 | /// \brief Construct a zero-copy slice of the chunked array with the |
71 | /// indicated offset and length |
72 | /// |
73 | /// \param[in] offset the position of the first element in the constructed |
74 | /// slice |
75 | /// \param[in] length the length of the slice. If there are not enough |
76 | /// elements in the chunked array, the length will be adjusted accordingly |
77 | /// |
78 | /// \return a new object wrapped in std::shared_ptr<ChunkedArray> |
79 | std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const; |
80 | |
81 | /// \brief Slice from offset until end of the chunked array |
82 | std::shared_ptr<ChunkedArray> Slice(int64_t offset) const; |
83 | |
84 | /// \brief Flatten this chunked array as a vector of chunked arrays, one |
85 | /// for each struct field |
86 | /// |
87 | /// \param[in] pool The pool for buffer allocations, if any |
88 | /// \param[out] out The resulting vector of arrays |
89 | Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<ChunkedArray>>* out) const; |
90 | |
91 | std::shared_ptr<DataType> type() const { return type_; } |
92 | |
93 | /// \brief Determine if two chunked arrays are equal. |
94 | /// |
95 | /// Two chunked arrays can be equal only if they have equal datatypes. |
96 | /// However, they may be equal even if they have different chunkings. |
97 | bool Equals(const ChunkedArray& other) const; |
98 | /// \brief Determine if two chunked arrays are equal. |
99 | bool Equals(const std::shared_ptr<ChunkedArray>& other) const; |
100 | |
101 | protected: |
102 | ArrayVector chunks_; |
103 | int64_t length_; |
104 | int64_t null_count_; |
105 | std::shared_ptr<DataType> type_; |
106 | |
107 | private: |
108 | ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); |
109 | }; |
110 | |
111 | /// \class Column |
112 | /// \brief An immutable column data structure consisting of a field (type |
113 | /// metadata) and a chunked data array |
114 | class ARROW_EXPORT Column { |
115 | public: |
116 | /// \brief Construct a column from a vector of arrays |
117 | /// |
118 | /// The array chunks' datatype must match the field's datatype. |
119 | Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks); |
120 | /// \brief Construct a column from a chunked array |
121 | /// |
122 | /// The chunked array's datatype must match the field's datatype. |
123 | Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data); |
124 | /// \brief Construct a column from a single array |
125 | /// |
126 | /// The array's datatype must match the field's datatype. |
127 | Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data); |
128 | |
129 | /// \brief Construct a column from a name and an array |
130 | /// |
131 | /// A field with the given name and the array's datatype is automatically created. |
132 | Column(const std::string& name, const std::shared_ptr<Array>& data); |
133 | /// \brief Construct a column from a name and a chunked array |
134 | /// |
135 | /// A field with the given name and the array's datatype is automatically created. |
136 | Column(const std::string& name, const std::shared_ptr<ChunkedArray>& data); |
137 | |
138 | int64_t length() const { return data_->length(); } |
139 | |
140 | int64_t null_count() const { return data_->null_count(); } |
141 | |
142 | std::shared_ptr<Field> field() const { return field_; } |
143 | |
144 | /// \brief The column name |
145 | /// \return the column's name in the passed metadata |
146 | const std::string& name() const { return field_->name(); } |
147 | |
148 | /// \brief The column type |
149 | /// \return the column's type according to the metadata |
150 | std::shared_ptr<DataType> type() const { return field_->type(); } |
151 | |
152 | /// \brief The column data as a chunked array |
153 | /// \return the column's data as a chunked logical array |
154 | std::shared_ptr<ChunkedArray> data() const { return data_; } |
155 | |
156 | /// \brief Construct a zero-copy slice of the column with the indicated |
157 | /// offset and length |
158 | /// |
159 | /// \param[in] offset the position of the first element in the constructed |
160 | /// slice |
161 | /// \param[in] length the length of the slice. If there are not enough |
162 | /// elements in the column, the length will be adjusted accordingly |
163 | /// |
164 | /// \return a new object wrapped in std::shared_ptr<Column> |
165 | std::shared_ptr<Column> Slice(int64_t offset, int64_t length) const { |
166 | return std::make_shared<Column>(field_, data_->Slice(offset, length)); |
167 | } |
168 | |
169 | /// \brief Slice from offset until end of the column |
170 | std::shared_ptr<Column> Slice(int64_t offset) const { |
171 | return std::make_shared<Column>(field_, data_->Slice(offset)); |
172 | } |
173 | |
174 | /// \brief Flatten this column as a vector of columns |
175 | /// |
176 | /// \param[in] pool The pool for buffer allocations, if any |
177 | /// \param[out] out The resulting vector of arrays |
178 | Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<Column>>* out) const; |
179 | |
180 | /// \brief Determine if two columns are equal. |
181 | /// |
182 | /// Two columns can be equal only if they have equal datatypes. |
183 | /// However, they may be equal even if they have different chunkings. |
184 | bool Equals(const Column& other) const; |
185 | /// \brief Determine if the two columns are equal. |
186 | bool Equals(const std::shared_ptr<Column>& other) const; |
187 | |
188 | /// \brief Verify that the column's array data is consistent with the passed |
189 | /// field's metadata |
190 | Status ValidateData(); |
191 | |
192 | protected: |
193 | std::shared_ptr<Field> field_; |
194 | std::shared_ptr<ChunkedArray> data_; |
195 | |
196 | private: |
197 | ARROW_DISALLOW_COPY_AND_ASSIGN(Column); |
198 | }; |
199 | |
200 | /// \class Table |
201 | /// \brief Logical table as sequence of chunked arrays |
202 | class ARROW_EXPORT Table { |
203 | public: |
204 | virtual ~Table() = default; |
205 | |
206 | /// \brief Construct Table from schema and columns |
207 | /// If columns is zero-length, the table's number of rows is zero |
208 | /// \param schema The table schema (column types) |
209 | /// \param columns The table's columns |
210 | /// \param num_rows number of rows in table, -1 (default) to infer from columns |
211 | static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema, |
212 | const std::vector<std::shared_ptr<Column>>& columns, |
213 | int64_t num_rows = -1); |
214 | |
215 | /// \brief Construct Table from schema and arrays |
216 | /// \param schema The table schema (column types) |
217 | /// \param arrays The table's columns as arrays |
218 | /// \param num_rows number of rows in table, -1 (default) to infer from columns |
219 | static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema, |
220 | const std::vector<std::shared_ptr<Array>>& arrays, |
221 | int64_t num_rows = -1); |
222 | |
223 | /// \brief Construct table from RecordBatches, using schema supplied by the first |
224 | /// RecordBatch. |
225 | /// |
226 | /// \param[in] batches a std::vector of record batches |
227 | /// \param[out] table the returned table |
228 | /// \return Status Returns Status::Invalid if there is some problem |
229 | static Status FromRecordBatches( |
230 | const std::vector<std::shared_ptr<RecordBatch>>& batches, |
231 | std::shared_ptr<Table>* table); |
232 | |
233 | /// Construct table from RecordBatches, using supplied schema. There may be |
234 | /// zero record batches |
235 | /// |
236 | /// \param[in] schema the arrow::Schema for each batch |
237 | /// \param[in] batches a std::vector of record batches |
238 | /// \param[out] table the returned table |
239 | /// \return Status |
240 | static Status FromRecordBatches( |
241 | const std::shared_ptr<Schema>& schema, |
242 | const std::vector<std::shared_ptr<RecordBatch>>& batches, |
243 | std::shared_ptr<Table>* table); |
244 | |
245 | /// Return the table schema |
246 | std::shared_ptr<Schema> schema() const { return schema_; } |
247 | |
248 | /// Return a column by index |
249 | virtual std::shared_ptr<Column> column(int i) const = 0; |
250 | |
251 | /// \brief Remove column from the table, producing a new Table |
252 | virtual Status RemoveColumn(int i, std::shared_ptr<Table>* out) const = 0; |
253 | |
254 | /// \brief Add column to the table, producing a new Table |
255 | virtual Status AddColumn(int i, const std::shared_ptr<Column>& column, |
256 | std::shared_ptr<Table>* out) const = 0; |
257 | |
258 | /// \brief Replace a column in the table, producing a new Table |
259 | virtual Status SetColumn(int i, const std::shared_ptr<Column>& column, |
260 | std::shared_ptr<Table>* out) const = 0; |
261 | |
262 | /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL) |
263 | /// \since 0.5.0 |
264 | /// |
265 | /// \param[in] metadata new KeyValueMetadata |
266 | /// \return new Table |
267 | virtual std::shared_ptr<Table> ReplaceSchemaMetadata( |
268 | const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0; |
269 | |
270 | /// \brief Flatten the table, producing a new Table. Any column with a |
271 | /// struct type will be flattened into multiple columns |
272 | /// |
273 | /// \param[in] pool The pool for buffer allocations, if any |
274 | /// \param[out] out The returned table |
275 | virtual Status Flatten(MemoryPool* pool, std::shared_ptr<Table>* out) const = 0; |
276 | |
277 | /// \brief Perform any checks to validate the input arguments |
278 | virtual Status Validate() const = 0; |
279 | |
280 | /// \brief Return the number of columns in the table |
281 | int num_columns() const { return schema_->num_fields(); } |
282 | |
283 | /// \brief Return the number of rows (equal to each column's logical length) |
284 | int64_t num_rows() const { return num_rows_; } |
285 | |
286 | /// \brief Determine if tables are equal |
287 | /// |
288 | /// Two tables can be equal only if they have equal schemas. |
289 | /// However, they may be equal even if they have different chunkings. |
290 | bool Equals(const Table& other) const; |
291 | |
292 | protected: |
293 | Table(); |
294 | |
295 | std::shared_ptr<Schema> schema_; |
296 | int64_t num_rows_; |
297 | |
298 | private: |
299 | ARROW_DISALLOW_COPY_AND_ASSIGN(Table); |
300 | }; |
301 | |
302 | /// \brief Compute a stream of record batches from a (possibly chunked) Table |
303 | /// |
304 | /// The conversion is zero-copy: each record batch is a view over a slice |
305 | /// of the table's columns. |
306 | class ARROW_EXPORT TableBatchReader : public RecordBatchReader { |
307 | public: |
308 | ~TableBatchReader() override; |
309 | |
310 | /// \brief Construct a TableBatchReader for the given table |
311 | explicit TableBatchReader(const Table& table); |
312 | |
313 | std::shared_ptr<Schema> schema() const override; |
314 | |
315 | Status ReadNext(std::shared_ptr<RecordBatch>* out) override; |
316 | |
317 | /// \brief Set the desired maximum chunk size of record batches |
318 | /// |
319 | /// The actual chunk size of each record batch may be smaller, depending |
320 | /// on actual chunking characteristics of each table column. |
321 | void set_chunksize(int64_t chunksize); |
322 | |
323 | private: |
324 | class TableBatchReaderImpl; |
325 | std::unique_ptr<TableBatchReaderImpl> impl_; |
326 | }; |
327 | |
/// \brief Construct table from multiple input tables.
///
/// The tables are concatenated vertically. Therefore, all tables should
/// have the same schema. Each column in the output table is the result
/// of concatenating the corresponding columns in all input tables.
///
/// \param[in] tables the input tables to concatenate
/// \param[out] table the resulting concatenated table
/// \return Status
ARROW_EXPORT
Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
                         std::shared_ptr<Table>* table);
336 | |
337 | } // namespace arrow |
338 | |
339 | #endif // ARROW_TABLE_H |
340 | |