1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
// Internal metadata helpers (Flatbuffers builders and readers) for the
// "Feather" file format, originally created at http://github.com/wesm/feather
20 | |
21 | #ifndef ARROW_IPC_FEATHER_INTERNAL_H |
22 | #define ARROW_IPC_FEATHER_INTERNAL_H |
23 | |
24 | #include <cstdint> |
25 | #include <iostream> |
26 | #include <memory> |
27 | #include <string> |
28 | #include <vector> |
29 | |
30 | #include "flatbuffers/flatbuffers.h" |
31 | |
32 | #include "arrow/buffer.h" |
33 | #include "arrow/ipc/feather.h" |
34 | #include "arrow/ipc/feather_generated.h" |
35 | #include "arrow/type.h" |
36 | |
37 | namespace arrow { |
38 | namespace ipc { |
39 | namespace feather { |
40 | |
41 | typedef std::vector<flatbuffers::Offset<fbs::Column>> ColumnVector; |
42 | typedef flatbuffers::FlatBufferBuilder FBB; |
43 | typedef flatbuffers::Offset<flatbuffers::String> FBString; |
44 | |
45 | struct ARROW_EXPORT ColumnType { |
46 | enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME }; |
47 | }; |
48 | |
49 | struct ARROW_EXPORT ArrayMetadata { |
50 | ArrayMetadata() {} |
51 | |
52 | ArrayMetadata(fbs::Type type, int64_t offset, int64_t length, int64_t null_count, |
53 | int64_t total_bytes) |
54 | : type(type), |
55 | offset(offset), |
56 | length(length), |
57 | null_count(null_count), |
58 | total_bytes(total_bytes) {} |
59 | |
60 | bool Equals(const ArrayMetadata& other) const { |
61 | return this->type == other.type && this->offset == other.offset && |
62 | this->length == other.length && this->null_count == other.null_count && |
63 | this->total_bytes == other.total_bytes; |
64 | } |
65 | |
66 | fbs::Type type; |
67 | int64_t offset; |
68 | int64_t length; |
69 | int64_t null_count; |
70 | int64_t total_bytes; |
71 | }; |
72 | |
73 | struct ARROW_EXPORT CategoryMetadata { |
74 | ArrayMetadata levels; |
75 | bool ordered; |
76 | }; |
77 | |
78 | struct ARROW_EXPORT TimestampMetadata { |
79 | TimeUnit::type unit; |
80 | |
81 | // A timezone name known to the Olson timezone database. For display purposes |
82 | // because the actual data is all UTC |
83 | std::string timezone; |
84 | }; |
85 | |
86 | struct ARROW_EXPORT TimeMetadata { |
87 | TimeUnit::type unit; |
88 | }; |
89 | |
// Four-character marker string for the Feather format.
static constexpr char const* kFeatherMagicBytes = "FEA1";

// Default alignment used by the Feather implementation.
// NOTE(review): the unit is presumably bytes -- confirm against the writer.
static constexpr int kFeatherDefaultAlignment = 8;

// Declared ahead of its definition because TableBuilder refers to it (return
// type of AddColumn() and a friend declaration).
class ColumnBuilder;
94 | |
95 | class ARROW_EXPORT TableBuilder { |
96 | public: |
97 | explicit TableBuilder(int64_t num_rows); |
98 | ~TableBuilder() = default; |
99 | |
100 | FBB& fbb(); |
101 | Status Finish(); |
102 | std::shared_ptr<Buffer> GetBuffer() const; |
103 | |
104 | std::unique_ptr<ColumnBuilder> AddColumn(const std::string& name); |
105 | void SetDescription(const std::string& description); |
106 | void SetNumRows(int64_t num_rows); |
107 | void add_column(const flatbuffers::Offset<fbs::Column>& col); |
108 | |
109 | private: |
110 | flatbuffers::FlatBufferBuilder fbb_; |
111 | ColumnVector columns_; |
112 | |
113 | friend class ColumnBuilder; |
114 | |
115 | bool finished_; |
116 | std::string description_; |
117 | int64_t num_rows_; |
118 | }; |
119 | |
120 | class ARROW_EXPORT TableMetadata { |
121 | public: |
122 | TableMetadata() : table_(NULLPTR) {} |
123 | ~TableMetadata() = default; |
124 | |
125 | Status Open(const std::shared_ptr<Buffer>& buffer) { |
126 | metadata_buffer_ = buffer; |
127 | table_ = fbs::GetCTable(buffer->data()); |
128 | |
129 | if (table_->version() < kFeatherVersion) { |
130 | std::cout << "This Feather file is old" |
131 | << " and will not be readable beyond the 0.3.0 release" << std::endl; |
132 | } |
133 | return Status::OK(); |
134 | } |
135 | |
136 | bool HasDescription() const { return table_->description() != 0; } |
137 | |
138 | std::string GetDescription() const { |
139 | if (!HasDescription()) { |
140 | return std::string("" ); |
141 | } |
142 | return table_->description()->str(); |
143 | } |
144 | |
145 | int version() const { return table_->version(); } |
146 | int64_t num_rows() const { return table_->num_rows(); } |
147 | int64_t num_columns() const { return table_->columns()->size(); } |
148 | |
149 | const fbs::Column* column(int i) { return table_->columns()->Get(i); } |
150 | |
151 | private: |
152 | std::shared_ptr<Buffer> metadata_buffer_; |
153 | const fbs::CTable* table_; |
154 | }; |
155 | |
156 | static inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray( |
157 | FBB& fbb, const ArrayMetadata& array) { |
158 | return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding_PLAIN, array.offset, |
159 | array.length, array.null_count, array.total_bytes); |
160 | } |
161 | |
162 | static inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) { |
163 | return static_cast<fbs::TimeUnit>(static_cast<int>(unit)); |
164 | } |
165 | |
166 | static inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) { |
167 | return static_cast<TimeUnit::type>(static_cast<int>(unit)); |
168 | } |
169 | |
170 | // Convert Feather enums to Flatbuffer enums |
171 | |
172 | const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = { |
173 | fbs::TypeMetadata_NONE, // PRIMITIVE |
174 | fbs::TypeMetadata_CategoryMetadata, // CATEGORY |
175 | fbs::TypeMetadata_TimestampMetadata, // TIMESTAMP |
176 | fbs::TypeMetadata_DateMetadata, // DATE |
177 | fbs::TypeMetadata_TimeMetadata // TIME |
178 | }; |
179 | |
180 | static inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) { |
181 | return COLUMN_TYPE_ENUM_MAPPING[column_type]; |
182 | } |
183 | |
184 | static inline void FromFlatbuffer(const fbs::PrimitiveArray* values, ArrayMetadata* out) { |
185 | out->type = values->type(); |
186 | out->offset = values->offset(); |
187 | out->length = values->length(); |
188 | out->null_count = values->null_count(); |
189 | out->total_bytes = values->total_bytes(); |
190 | } |
191 | |
192 | class ARROW_EXPORT ColumnBuilder { |
193 | public: |
194 | ColumnBuilder(TableBuilder* parent, const std::string& name); |
195 | ~ColumnBuilder() = default; |
196 | |
197 | flatbuffers::Offset<void> CreateColumnMetadata(); |
198 | |
199 | Status Finish(); |
200 | void SetValues(const ArrayMetadata& values); |
201 | void SetUserMetadata(const std::string& data); |
202 | void SetCategory(const ArrayMetadata& levels, bool ordered = false); |
203 | void SetTimestamp(TimeUnit::type unit); |
204 | void SetTimestamp(TimeUnit::type unit, const std::string& timezone); |
205 | void SetDate(); |
206 | void SetTime(TimeUnit::type unit); |
207 | FBB& fbb(); |
208 | |
209 | private: |
210 | TableBuilder* parent_; |
211 | |
212 | std::string name_; |
213 | ArrayMetadata values_; |
214 | std::string user_metadata_; |
215 | |
216 | // Column metadata |
217 | |
218 | // Is this a primitive type, or one of the types having metadata? Default is |
219 | // primitive |
220 | ColumnType::type type_; |
221 | |
222 | // Type-specific metadata union |
223 | CategoryMetadata meta_category_; |
224 | TimeMetadata meta_time_; |
225 | |
226 | TimestampMetadata meta_timestamp_; |
227 | |
228 | FBB* fbb_; |
229 | }; |
230 | |
231 | } // namespace feather |
232 | } // namespace ipc |
233 | } // namespace arrow |
234 | |
235 | #endif // ARROW_IPC_FEATHER_INTERNAL_H |
236 | |