// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

18#ifndef ARROW_RECORD_BATCH_H
19#define ARROW_RECORD_BATCH_H
20
21#include <cstdint>
22#include <memory>
23#include <string>
24#include <vector>
25
26#include "arrow/type.h"
27#include "arrow/util/macros.h"
28#include "arrow/util/visibility.h"
29
30namespace arrow {
31
// Forward declarations: these types appear below only inside smart pointers
// or as return types of declarations, so their full definitions are not
// needed in this header.
class Array;
struct ArrayData;
class Status;
class Table;

37/// \class RecordBatch
38/// \brief Collection of equal-length arrays matching a particular Schema
39///
40/// A record batch is table-like data structure that is semantically a sequence
41/// of fields, each a contiguous Arrow array
42class ARROW_EXPORT RecordBatch {
43 public:
44 virtual ~RecordBatch() = default;
45
46 /// \param[in] schema The record batch schema
47 /// \param[in] num_rows length of fields in the record batch. Each array
48 /// should have the same length as num_rows
49 /// \param[in] columns the record batch fields as vector of arrays
50 static std::shared_ptr<RecordBatch> Make(
51 const std::shared_ptr<Schema>& schema, int64_t num_rows,
52 const std::vector<std::shared_ptr<Array>>& columns);
53
54 /// \brief Move-based constructor for a vector of Array instances
55 static std::shared_ptr<RecordBatch> Make(const std::shared_ptr<Schema>& schema,
56 int64_t num_rows,
57 std::vector<std::shared_ptr<Array>>&& columns);
58
59 /// \brief Construct record batch from vector of internal data structures
60 /// \since 0.5.0
61 ///
62 /// This class is only provided with an rvalue-reference for the input data,
63 /// and is intended for internal use, or advanced users.
64 ///
65 /// \param schema the record batch schema
66 /// \param num_rows the number of semantic rows in the record batch. This
67 /// should be equal to the length of each field
68 /// \param columns the data for the batch's columns
69 static std::shared_ptr<RecordBatch> Make(
70 const std::shared_ptr<Schema>& schema, int64_t num_rows,
71 std::vector<std::shared_ptr<ArrayData>>&& columns);
72
73 /// \brief Construct record batch by copying vector of array data
74 /// \since 0.5.0
75 static std::shared_ptr<RecordBatch> Make(
76 const std::shared_ptr<Schema>& schema, int64_t num_rows,
77 const std::vector<std::shared_ptr<ArrayData>>& columns);
78
79 /// \brief Determine if two record batches are exactly equal
80 /// \return true if batches are equal
81 bool Equals(const RecordBatch& other) const;
82
83 /// \brief Determine if two record batches are approximately equal
84 bool ApproxEquals(const RecordBatch& other) const;
85
86 // \return the table's schema
87 /// \return true if batches are equal
88 std::shared_ptr<Schema> schema() const { return schema_; }
89
90 /// \brief Retrieve an array from the record batch
91 /// \param[in] i field index, does not boundscheck
92 /// \return an Array object
93 virtual std::shared_ptr<Array> column(int i) const = 0;
94
95 /// \brief Retrieve an array's internaldata from the record batch
96 /// \param[in] i field index, does not boundscheck
97 /// \return an internal ArrayData object
98 virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
99
100 /// \brief Add column to the record batch, producing a new RecordBatch
101 ///
102 /// \param[in] i field index, which will be boundschecked
103 /// \param[in] field field to be added
104 /// \param[in] column column to be added
105 /// \param[out] out record batch with column added
106 virtual Status AddColumn(int i, const std::shared_ptr<Field>& field,
107 const std::shared_ptr<Array>& column,
108 std::shared_ptr<RecordBatch>* out) const = 0;
109
110 /// \brief Add new nullable column to the record batch, producing a new
111 /// RecordBatch.
112 ///
113 /// For non-nullable columns, use the Field-based version of this method.
114 ///
115 /// \param[in] i field index, which will be boundschecked
116 /// \param[in] field_name name of field to be added
117 /// \param[in] column column to be added
118 /// \param[out] out record batch with column added
119 virtual Status AddColumn(int i, const std::string& field_name,
120 const std::shared_ptr<Array>& column,
121 std::shared_ptr<RecordBatch>* out) const;
122
123 /// \brief Remove column from the record batch, producing a new RecordBatch
124 ///
125 /// \param[in] i field index, does boundscheck
126 /// \param[out] out record batch with column removed
127 virtual Status RemoveColumn(int i, std::shared_ptr<RecordBatch>* out) const = 0;
128
129 virtual std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
130 const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
131
132 /// \brief Name in i-th column
133 const std::string& column_name(int i) const;
134
135 /// \return the number of columns in the table
136 int num_columns() const { return schema_->num_fields(); }
137
138 /// \return the number of rows (the corresponding length of each column)
139 int64_t num_rows() const { return num_rows_; }
140
141 /// \brief Slice each of the arrays in the record batch
142 /// \param[in] offset the starting offset to slice, through end of batch
143 /// \return new record batch
144 virtual std::shared_ptr<RecordBatch> Slice(int64_t offset) const;
145
146 /// \brief Slice each of the arrays in the record batch
147 /// \param[in] offset the starting offset to slice
148 /// \param[in] length the number of elements to slice from offset
149 /// \return new record batch
150 virtual std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const = 0;
151
152 /// \brief Check for schema or length inconsistencies
153 /// \return Status
154 virtual Status Validate() const;
155
156 protected:
157 RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows);
158
159 std::shared_ptr<Schema> schema_;
160 int64_t num_rows_;
161
162 private:
163 ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch);
164};
165
166/// \brief Abstract interface for reading stream of record batches
167class ARROW_EXPORT RecordBatchReader {
168 public:
169 virtual ~RecordBatchReader();
170
171 /// \return the shared schema of the record batches in the stream
172 virtual std::shared_ptr<Schema> schema() const = 0;
173
174 /// \brief Read the next record batch in the stream. Return null for batch
175 /// when reaching end of stream
176 ///
177 /// \param[out] batch the next loaded batch, null at end of stream
178 /// \return Status
179 virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;
180
181 /// \brief Consume entire stream as a vector of record batches
182 Status ReadAll(std::vector<std::shared_ptr<RecordBatch>>* batches);
183
184 /// \brief Read all batches and concatenate as arrow::Table
185 Status ReadAll(std::shared_ptr<Table>* table);
186};
187
188} // namespace arrow
189
190#endif // ARROW_RECORD_BATCH_H
191