1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef ARROW_TABLE_H
19#define ARROW_TABLE_H
20
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <vector>
25
26#include "arrow/array.h"
27#include "arrow/record_batch.h"
28#include "arrow/type.h"
29#include "arrow/util/macros.h"
30#include "arrow/util/visibility.h"
31
32namespace arrow {
33
// Forward declarations: within this header MemoryPool is only used through
// pointers and Status only as the return type of function declarations, so
// their complete definitions are not required here.
34class MemoryPool;
35class Status;
36
37/// \class ChunkedArray
38/// \brief A data structure managing a list of Arrow arrays (chunks) logically
39/// as one large array
///
/// The total length, null count and data type are stored as members next to
/// the chunks; the inline accessors below simply return those members.
/// Going by its name, the ARROW_DISALLOW_COPY_AND_ASSIGN macro at the end of
/// the class disables copy construction and copy assignment.
40class ARROW_EXPORT ChunkedArray {
41 public:
42 /// \brief Construct a chunked array from a vector of arrays
43 ///
44 /// The vector should be non-empty and all its elements should have the same
45 /// data type.
46 explicit ChunkedArray(const ArrayVector& chunks);
47
48 /// \brief Construct a chunked array from a single Array
 ///
 /// Delegates to the vector constructor with a one-element vector.
49 explicit ChunkedArray(const std::shared_ptr<Array>& chunk)
50 : ChunkedArray(ArrayVector({chunk})) {}
51
52 /// \brief Construct a chunked array from a vector of arrays and a data type
53 ///
54 /// As the data type is passed explicitly, the vector may be empty.
55 ChunkedArray(const ArrayVector& chunks, const std::shared_ptr<DataType>& type);
56
57 /// \return the total length of the chunked array; computed on construction
58 int64_t length() const { return length_; }
59
60 /// \return the total number of nulls among all chunks
61 int64_t null_count() const { return null_count_; }
62
 /// \return the number of chunks (size of the chunk vector, cast to int)
63 int num_chunks() const { return static_cast<int>(chunks_.size()); }
64
65 /// \brief Access a single chunk by position
 ///
 /// \param[in] i index of the chunk. It is used to index the chunk vector
 /// directly and this accessor performs no range check of its own, so callers
 /// are expected to pass 0 <= i < num_chunks().
 /// \return the chunk at position i
66 std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
67
 /// \return a const reference to the internal vector of chunks
68 const ArrayVector& chunks() const { return chunks_; }
69
70 /// \brief Construct a zero-copy slice of the chunked array with the
71 /// indicated offset and length
72 ///
73 /// \param[in] offset the position of the first element in the constructed
74 /// slice
75 /// \param[in] length the length of the slice. If there are not enough
76 /// elements in the chunked array, the length will be adjusted accordingly
77 ///
78 /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
79 std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
80
81 /// \brief Slice from offset until end of the chunked array
 ///
 /// \param[in] offset the position of the first element in the constructed
 /// slice
 /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
82 std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
83
84 /// \brief Flatten this chunked array as a vector of chunked arrays, one
85 /// for each struct field
86 ///
87 /// \param[in] pool The pool for buffer allocations, if any
88 /// \param[out] out The resulting vector of chunked arrays
 /// \return Status of the operation
89 Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<ChunkedArray>>* out) const;
90
 /// \return the data type stored for this chunked array
91 std::shared_ptr<DataType> type() const { return type_; }
92
93 /// \brief Determine if two chunked arrays are equal.
94 ///
95 /// Two chunked arrays can be equal only if they have equal datatypes.
96 /// However, they may be equal even if they have different chunkings.
97 bool Equals(const ChunkedArray& other) const;
98 /// \brief Determine if two chunked arrays are equal (shared_ptr overload).
99 bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
100
101 protected:
 // The arrays making up this chunked array; exposed through chunks()/chunk()
102 ArrayVector chunks_;
 // Value returned by length()
103 int64_t length_;
 // Value returned by null_count()
104 int64_t null_count_;
 // Value returned by type()
105 std::shared_ptr<DataType> type_;
106
107 private:
108 ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
109};
110
111/// \class Column
112/// \brief An immutable column data structure consisting of a field (type
113/// metadata) and a chunked data array
///
/// Most inline accessors forward to either the field (name, type) or the
/// chunked array (length, null count).
114class ARROW_EXPORT Column {
115 public:
116 /// \brief Construct a column from a vector of arrays
117 ///
118 /// The array chunks' datatype must match the field's datatype.
119 Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks);
120 /// \brief Construct a column from a chunked array
121 ///
122 /// The chunked array's datatype must match the field's datatype.
123 Column(const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data);
124 /// \brief Construct a column from a single array
125 ///
126 /// The array's datatype must match the field's datatype.
127 Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data);
128
129 /// \brief Construct a column from a name and an array
130 ///
131 /// A field with the given name and the array's datatype is automatically created.
132 Column(const std::string& name, const std::shared_ptr<Array>& data);
133 /// \brief Construct a column from a name and a chunked array
134 ///
135 /// A field with the given name and the chunked array's datatype is automatically created.
136 Column(const std::string& name, const std::shared_ptr<ChunkedArray>& data);
137
 /// \return the length of the underlying chunked array
138 int64_t length() const { return data_->length(); }
139
 /// \return the null count of the underlying chunked array
140 int64_t null_count() const { return data_->null_count(); }
141
 /// \return the field holding this column's name and type metadata
142 std::shared_ptr<Field> field() const { return field_; }
143
144 /// \brief The column name
145 /// \return the column's name in the passed metadata
146 const std::string& name() const { return field_->name(); }
147
148 /// \brief The column type
149 /// \return the column's type according to the metadata
150 std::shared_ptr<DataType> type() const { return field_->type(); }
151
152 /// \brief The column data as a chunked array
153 /// \return the column's data as a chunked logical array
154 std::shared_ptr<ChunkedArray> data() const { return data_; }
155
156 /// \brief Construct a zero-copy slice of the column with the indicated
157 /// offset and length
158 ///
 /// The new column reuses this column's field; only the data is sliced.
 ///
159 /// \param[in] offset the position of the first element in the constructed
160 /// slice
161 /// \param[in] length the length of the slice. If there are not enough
162 /// elements in the column, the length will be adjusted accordingly
163 ///
164 /// \return a new object wrapped in std::shared_ptr<Column>
165 std::shared_ptr<Column> Slice(int64_t offset, int64_t length) const {
166 return std::make_shared<Column>(field_, data_->Slice(offset, length));
167 }
168
169 /// \brief Slice from offset until end of the column
 ///
 /// The new column reuses this column's field; only the data is sliced.
 ///
 /// \param[in] offset the position of the first element in the constructed
 /// slice
 /// \return a new object wrapped in std::shared_ptr<Column>
170 std::shared_ptr<Column> Slice(int64_t offset) const {
171 return std::make_shared<Column>(field_, data_->Slice(offset));
172 }
173
174 /// \brief Flatten this column as a vector of columns
175 ///
176 /// \param[in] pool The pool for buffer allocations, if any
177 /// \param[out] out The resulting vector of columns
 /// \return Status of the operation
178 Status Flatten(MemoryPool* pool, std::vector<std::shared_ptr<Column>>* out) const;
179
180 /// \brief Determine if two columns are equal.
181 ///
182 /// Two columns can be equal only if they have equal datatypes.
183 /// However, they may be equal even if they have different chunkings.
184 bool Equals(const Column& other) const;
185 /// \brief Determine if the two columns are equal (shared_ptr overload).
186 bool Equals(const std::shared_ptr<Column>& other) const;
187
188 /// \brief Verify that the column's array data is consistent with the passed
189 /// field's metadata
 ///
 /// NOTE(review): declared non-const although the description reads as a pure
 /// check; whether it modifies the column is not visible here -- confirm
 /// against the implementation.
 /// \return Status describing the outcome of the check
190 Status ValidateData();
191
192 protected:
 // Name and type metadata; source for name() and type()
193 std::shared_ptr<Field> field_;
 // Column values; source for length(), null_count() and data()
194 std::shared_ptr<ChunkedArray> data_;
195
196 private:
197 ARROW_DISALLOW_COPY_AND_ASSIGN(Column);
198};
199
200/// \class Table
201/// \brief Logical table as sequence of chunked arrays
///
/// This is an abstract interface: column access and the column/schema
/// manipulation methods are pure virtual, and the default constructor is
/// protected. Instances are obtained through the static Make() and
/// FromRecordBatches() factories declared below.
202class ARROW_EXPORT Table {
203 public:
204 virtual ~Table() = default;
205
206 /// \brief Construct Table from schema and columns
207 /// If columns is zero-length, the table's number of rows is zero
208 /// \param schema The table schema (column types)
209 /// \param columns The table's columns
210 /// \param num_rows number of rows in table, -1 (default) to infer from columns
 /// \return the new table
211 static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema,
212 const std::vector<std::shared_ptr<Column>>& columns,
213 int64_t num_rows = -1);
214
215 /// \brief Construct Table from schema and arrays
216 /// \param schema The table schema (column types)
217 /// \param arrays The table's columns as arrays
218 /// \param num_rows number of rows in table, -1 (default) to infer from columns
 /// \return the new table
219 static std::shared_ptr<Table> Make(const std::shared_ptr<Schema>& schema,
220 const std::vector<std::shared_ptr<Array>>& arrays,
221 int64_t num_rows = -1);
222
223 /// \brief Construct table from RecordBatches, using schema supplied by the first
224 /// RecordBatch.
225 ///
226 /// \param[in] batches a std::vector of record batches
227 /// \param[out] table the returned table
228 /// \return Status Returns Status::Invalid if there is some problem
229 static Status FromRecordBatches(
230 const std::vector<std::shared_ptr<RecordBatch>>& batches,
231 std::shared_ptr<Table>* table);
232
233 /// \brief Construct table from RecordBatches, using supplied schema. There may be
234 /// zero record batches
235 ///
236 /// \param[in] schema the arrow::Schema for each batch
237 /// \param[in] batches a std::vector of record batches
238 /// \param[out] table the returned table
239 /// \return Status
240 static Status FromRecordBatches(
241 const std::shared_ptr<Schema>& schema,
242 const std::vector<std::shared_ptr<RecordBatch>>& batches,
243 std::shared_ptr<Table>* table);
244
245 /// \brief Return the table schema
246 std::shared_ptr<Schema> schema() const { return schema_; }
247
248 /// \brief Return a column by index
 /// \param[in] i index of the requested column
249 virtual std::shared_ptr<Column> column(int i) const = 0;
250
251 /// \brief Remove column from the table, producing a new Table
 /// \param[in] i index of the column to remove
 /// \param[out] out the resulting table
252 virtual Status RemoveColumn(int i, std::shared_ptr<Table>* out) const = 0;
253
254 /// \brief Add column to the table, producing a new Table
 /// \param[in] i position for the new column (presumably the index at which
 /// it is inserted -- confirm against the implementations)
 /// \param[in] column the column to add
 /// \param[out] out the resulting table
255 virtual Status AddColumn(int i, const std::shared_ptr<Column>& column,
256 std::shared_ptr<Table>* out) const = 0;
257
258 /// \brief Replace a column in the table, producing a new Table
 /// \param[in] i index of the column to replace
 /// \param[in] column the replacement column
 /// \param[out] out the resulting table
259 virtual Status SetColumn(int i, const std::shared_ptr<Column>& column,
260 std::shared_ptr<Table>* out) const = 0;
261
262 /// \brief Replace schema key-value metadata with new metadata (EXPERIMENTAL)
263 /// \since 0.5.0
264 ///
265 /// \param[in] metadata new KeyValueMetadata
266 /// \return new Table
267 virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
268 const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
269
270 /// \brief Flatten the table, producing a new Table. Any column with a
271 /// struct type will be flattened into multiple columns
272 ///
273 /// \param[in] pool The pool for buffer allocations, if any
274 /// \param[out] out The returned table
275 virtual Status Flatten(MemoryPool* pool, std::shared_ptr<Table>* out) const = 0;
276
277 /// \brief Perform any checks to validate the input arguments
 /// \return Status describing the outcome of the checks
278 virtual Status Validate() const = 0;
279
280 /// \brief Return the number of columns in the table
 ///
 /// The value is taken from the schema's field count, not from stored columns.
281 int num_columns() const { return schema_->num_fields(); }
282
283 /// \brief Return the number of rows (equal to each column's logical length)
284 int64_t num_rows() const { return num_rows_; }
285
286 /// \brief Determine if tables are equal
287 ///
288 /// Two tables can be equal only if they have equal schemas.
289 /// However, they may be equal even if they have different chunkings.
290 bool Equals(const Table& other) const;
291
292 protected:
 // Protected: only subclasses can construct a Table directly.
293 Table();
294
 // Schema returned by schema(); also the source of num_columns()
295 std::shared_ptr<Schema> schema_;
 // Value returned by num_rows()
296 int64_t num_rows_;
297
298 private:
299 ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
300};
301
302/// \brief Compute a stream of record batches from a (possibly chunked) Table
303///
304/// The conversion is zero-copy: each record batch is a view over a slice
305/// of the table's columns.
///
/// The reader's state lives in a nested implementation class that is only
/// forward-declared here and held through std::unique_ptr.
306class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
307 public:
 // Declared here without a body: TableBatchReaderImpl is an incomplete type
 // in this header, so the destructor has to be defined where it is complete.
308 ~TableBatchReader() override;
309
310 /// \brief Construct a TableBatchReader for the given table
 ///
 /// NOTE(review): the table is taken by const reference; whether the reader
 /// keeps referring to it afterwards (which would require the table to
 /// outlive the reader) is not visible in this header -- confirm.
311 explicit TableBatchReader(const Table& table);
312
 /// \return the schema of the record batches produced by this reader
313 std::shared_ptr<Schema> schema() const override;
314
 /// \brief Read the next record batch
 ///
 /// \param[out] out the next record batch; behaviour once the table is
 /// exhausted follows the RecordBatchReader contract, which is declared
 /// outside this file
 /// \return Status of the read
315 Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
316
317 /// \brief Set the desired maximum chunk size of record batches
318 ///
319 /// The actual chunk size of each record batch may be smaller, depending
320 /// on actual chunking characteristics of each table column.
 ///
 /// \param[in] chunksize the desired maximum chunk size
321 void set_chunksize(int64_t chunksize);
322
323 private:
 // Implementation class, defined outside this header
324 class TableBatchReaderImpl;
325 std::unique_ptr<TableBatchReaderImpl> impl_;
326};
327
328/// \brief Construct table from multiple input tables.
329///
330/// The tables are concatenated vertically. Therefore, all tables should
331/// have the same schema. Each column in the output table is the result
332/// of concatenating the corresponding columns in all input tables.
///
/// \param[in] tables the input tables to concatenate
/// \param[out] table the resulting concatenated table
/// \return Status of the operation
333ARROW_EXPORT
334Status ConcatenateTables(const std::vector<std::shared_ptr<Table>>& tables,
335 std::shared_ptr<Table>* table);
336
337} // namespace arrow
338
339#endif // ARROW_TABLE_H
340