1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/type.h"
19
20#include <climits>
21#include <cstddef>
22#include <sstream> // IWYU pragma: keep
23#include <string>
24#include <utility>
25#include <vector>
26
27#include "arrow/array.h"
28#include "arrow/compare.h"
29#include "arrow/status.h"
30#include "arrow/util/checked_cast.h"
31#include "arrow/util/key_value_metadata.h"
32#include "arrow/util/logging.h"
33#include "arrow/util/stl.h"
34#include "arrow/visitor.h"
35
36namespace arrow {
37
38using internal::checked_cast;
39
40bool Field::HasMetadata() const {
41 return (metadata_ != nullptr) && (metadata_->size() > 0);
42}
43
44std::shared_ptr<Field> Field::AddMetadata(
45 const std::shared_ptr<const KeyValueMetadata>& metadata) const {
46 return std::make_shared<Field>(name_, type_, nullable_, metadata);
47}
48
49std::shared_ptr<Field> Field::RemoveMetadata() const {
50 return std::make_shared<Field>(name_, type_, nullable_);
51}
52
53std::vector<std::shared_ptr<Field>> Field::Flatten() const {
54 std::vector<std::shared_ptr<Field>> flattened;
55 if (type_->id() == Type::STRUCT) {
56 for (const auto& child : type_->children()) {
57 auto flattened_child = std::make_shared<Field>(*child);
58 flattened.push_back(flattened_child);
59 flattened_child->name_.insert(0, name() + ".");
60 flattened_child->nullable_ |= nullable_;
61 }
62 } else {
63 flattened.push_back(std::make_shared<Field>(*this));
64 }
65 return flattened;
66}
67
68bool Field::Equals(const Field& other, bool check_metadata) const {
69 if (this == &other) {
70 return true;
71 }
72 if (this->name_ == other.name_ && this->nullable_ == other.nullable_ &&
73 this->type_->Equals(*other.type_.get())) {
74 if (!check_metadata) {
75 return true;
76 } else if (this->HasMetadata() && other.HasMetadata()) {
77 return metadata_->Equals(*other.metadata_);
78 } else if (!this->HasMetadata() && !other.HasMetadata()) {
79 return true;
80 } else {
81 return false;
82 }
83 }
84 return false;
85}
86
87bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const {
88 return Equals(*other.get(), check_metadata);
89}
90
91std::string Field::ToString() const {
92 std::stringstream ss;
93 ss << this->name_ << ": " << this->type_->ToString();
94 if (!this->nullable_) {
95 ss << " not null";
96 }
97 return ss.str();
98}
99
100DataType::~DataType() {}
101
102bool DataType::Equals(const DataType& other) const { return TypeEquals(*this, other); }
103
104bool DataType::Equals(const std::shared_ptr<DataType>& other) const {
105 if (!other) {
106 return false;
107 }
108 return Equals(*other.get());
109}
110
111std::string BooleanType::ToString() const { return name(); }
112
113FloatingPoint::Precision HalfFloatType::precision() const { return FloatingPoint::HALF; }
114
115FloatingPoint::Precision FloatType::precision() const { return FloatingPoint::SINGLE; }
116
117FloatingPoint::Precision DoubleType::precision() const { return FloatingPoint::DOUBLE; }
118
119std::string StringType::ToString() const { return std::string("string"); }
120
121std::string ListType::ToString() const {
122 std::stringstream s;
123 s << "list<" << value_field()->ToString() << ">";
124 return s.str();
125}
126
127std::string BinaryType::ToString() const { return std::string("binary"); }
128
129int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); }
130
131std::string FixedSizeBinaryType::ToString() const {
132 std::stringstream ss;
133 ss << "fixed_size_binary[" << byte_width_ << "]";
134 return ss.str();
135}
136
137// ----------------------------------------------------------------------
138// Date types
139
// Constructs the date base type by forwarding `type_id` to FixedWidthType.
DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {}
141
// Constructs a Date32Type, tagging it with Type::DATE32.
Date32Type::Date32Type() : DateType(Type::DATE32) {}
143
// Constructs a Date64Type, tagging it with Type::DATE64.
Date64Type::Date64Type() : DateType(Type::DATE64) {}
145
146std::string Date64Type::ToString() const { return std::string("date64[ms]"); }
147
148std::string Date32Type::ToString() const { return std::string("date32[day]"); }
149
150// ----------------------------------------------------------------------
151// Time types
152
// Constructs the time base type: forwards `type_id` to FixedWidthType and
// stores `unit` in unit_.
TimeType::TimeType(Type::type type_id, TimeUnit::type unit)
    : FixedWidthType(type_id), unit_(unit) {}
155
// Constructs a Time32Type with the given unit. A debug check asserts that the
// unit is either TimeUnit::SECOND or TimeUnit::MILLI.
Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) {
  DCHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI)
      << "Must be seconds or milliseconds";
}
160
161std::string Time32Type::ToString() const {
162 std::stringstream ss;
163 ss << "time32[" << this->unit_ << "]";
164 return ss.str();
165}
166
// Constructs a Time64Type with the given unit. A debug check asserts that the
// unit is either TimeUnit::MICRO or TimeUnit::NANO.
Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) {
  DCHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO)
      << "Must be microseconds or nanoseconds";
}
171
172std::string Time64Type::ToString() const {
173 std::stringstream ss;
174 ss << "time64[" << this->unit_ << "]";
175 return ss.str();
176}
177
178// ----------------------------------------------------------------------
179// Timestamp types
180
181std::string TimestampType::ToString() const {
182 std::stringstream ss;
183 ss << "timestamp[" << this->unit_;
184 if (this->timezone_.size() > 0) {
185 ss << ", tz=" << this->timezone_;
186 }
187 ss << "]";
188 return ss.str();
189}
190
191// ----------------------------------------------------------------------
192// Union type
193
// Constructs a union type from its child fields, the type code associated
// with each child, and the union mode.
UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields,
                     const std::vector<uint8_t>& type_codes, UnionMode::type mode)
    : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) {
  // children_ is assigned in the body rather than in the initializer list.
  children_ = fields;
}
199
200std::string UnionType::ToString() const {
201 std::stringstream s;
202
203 if (mode_ == UnionMode::SPARSE) {
204 s << "union[sparse]<";
205 } else {
206 s << "union[dense]<";
207 }
208
209 for (size_t i = 0; i < children_.size(); ++i) {
210 if (i) {
211 s << ", ";
212 }
213 s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]);
214 }
215 s << ">";
216 return s.str();
217}
218
219// ----------------------------------------------------------------------
220// Struct type
221
222namespace {
223
224std::unordered_map<std::string, int> CreateNameToIndexMap(
225 const std::vector<std::shared_ptr<Field>>& fields) {
226 std::unordered_map<std::string, int> name_to_index;
227 for (size_t i = 0; i < fields.size(); ++i) {
228 name_to_index[fields[i]->name()] = static_cast<int>(i);
229 }
230 return name_to_index;
231}
232
233} // namespace
234
// Constructs a struct type from its child fields and builds the name-to-index
// lookup table from the same list.
StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields)
    : NestedType(Type::STRUCT), name_to_index_(CreateNameToIndexMap(fields)) {
  // children_ is assigned in the body rather than in the initializer list.
  children_ = fields;
}
239
240std::string StructType::ToString() const {
241 std::stringstream s;
242 s << "struct<";
243 for (int i = 0; i < this->num_children(); ++i) {
244 if (i > 0) {
245 s << ", ";
246 }
247 std::shared_ptr<Field> field = this->child(i);
248 s << field->name() << ": " << field->type()->ToString();
249 }
250 s << ">";
251 return s.str();
252}
253
254std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const {
255 int i = GetFieldIndex(name);
256 return i == -1 ? nullptr : children_[i];
257}
258
259int StructType::GetFieldIndex(const std::string& name) const {
260 if (name_to_index_.size() < children_.size()) {
261 // There are duplicate field names. Refuse to guess
262 int counts = 0;
263 int last_observed_index = -1;
264 for (size_t i = 0; i < children_.size(); ++i) {
265 if (children_[i]->name() == name) {
266 ++counts;
267 last_observed_index = static_cast<int>(i);
268 }
269 }
270
271 if (counts == 1) {
272 return last_observed_index;
273 } else {
274 // Duplicate or not found
275 return -1;
276 }
277 }
278
279 auto it = name_to_index_.find(name);
280 if (it == name_to_index_.end()) {
281 return -1;
282 } else {
283 return it->second;
284 }
285}
286
287std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const {
288 return GetFieldByName(name);
289}
290
291int StructType::GetChildIndex(const std::string& name) const {
292 return GetFieldIndex(name);
293}
294
295// ----------------------------------------------------------------------
296// DictionaryType
297
// Constructs a dictionary type from the type of its indices, the array of
// dictionary values, and a flag recording whether the dictionary is ordered.
DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
                               const std::shared_ptr<Array>& dictionary, bool ordered)
    : FixedWidthType(Type::DICTIONARY),
      index_type_(index_type),
      dictionary_(dictionary),
      ordered_(ordered) {
#ifndef NDEBUG
  // Debug-only check: the index type is cast to Integer and must report
  // itself as signed.
  const auto& int_type = checked_cast<const Integer&>(*index_type);
  DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed";
#endif
}
309
310int DictionaryType::bit_width() const {
311 return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
312}
313
314std::shared_ptr<Array> DictionaryType::dictionary() const { return dictionary_; }
315
316std::string DictionaryType::ToString() const {
317 std::stringstream ss;
318 ss << "dictionary<values=" << dictionary_->type()->ToString()
319 << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">";
320 return ss.str();
321}
322
323// ----------------------------------------------------------------------
324// Null type
325
326std::string NullType::ToString() const { return name(); }
327
328// ----------------------------------------------------------------------
329// Schema implementation
330
// Constructs a schema by copying `fields` and keeping a reference to
// `metadata`. The name-to-index table is built from the stored fields_ member.
Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields,
               const std::shared_ptr<const KeyValueMetadata>& metadata)
    : fields_(fields),
      name_to_index_(CreateNameToIndexMap(fields_)),
      metadata_(metadata) {}
336
// Constructs a schema by moving `fields` into the schema and keeping a
// reference to `metadata`. The name-to-index table is built from the stored
// fields_ member, not from the moved-from argument.
Schema::Schema(std::vector<std::shared_ptr<Field>>&& fields,
               const std::shared_ptr<const KeyValueMetadata>& metadata)
    : fields_(std::move(fields)),
      name_to_index_(CreateNameToIndexMap(fields_)),
      metadata_(metadata) {}
342
343bool Schema::Equals(const Schema& other, bool check_metadata) const {
344 if (this == &other) {
345 return true;
346 }
347
348 // checks field equality
349 if (num_fields() != other.num_fields()) {
350 return false;
351 }
352 for (int i = 0; i < num_fields(); ++i) {
353 if (!field(i)->Equals(*other.field(i).get(), check_metadata)) {
354 return false;
355 }
356 }
357
358 // check metadata equality
359 if (!check_metadata) {
360 return true;
361 } else if (this->HasMetadata() && other.HasMetadata()) {
362 return metadata_->Equals(*other.metadata_);
363 } else if (!this->HasMetadata() && !other.HasMetadata()) {
364 return true;
365 } else {
366 return false;
367 }
368}
369
370std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const {
371 int64_t i = GetFieldIndex(name);
372 return i == -1 ? nullptr : fields_[i];
373}
374
375int64_t Schema::GetFieldIndex(const std::string& name) const {
376 auto it = name_to_index_.find(name);
377 if (it == name_to_index_.end()) {
378 return -1;
379 } else {
380 return it->second;
381 }
382}
383
384Status Schema::AddField(int i, const std::shared_ptr<Field>& field,
385 std::shared_ptr<Schema>* out) const {
386 if (i < 0 || i > this->num_fields()) {
387 return Status::Invalid("Invalid column index to add field.");
388 }
389
390 *out =
391 std::make_shared<Schema>(internal::AddVectorElement(fields_, i, field), metadata_);
392 return Status::OK();
393}
394
395Status Schema::SetField(int i, const std::shared_ptr<Field>& field,
396 std::shared_ptr<Schema>* out) const {
397 if (i < 0 || i > this->num_fields()) {
398 return Status::Invalid("Invalid column index to add field.");
399 }
400
401 *out = std::make_shared<Schema>(internal::ReplaceVectorElement(fields_, i, field),
402 metadata_);
403 return Status::OK();
404}
405
406bool Schema::HasMetadata() const {
407 return (metadata_ != nullptr) && (metadata_->size() > 0);
408}
409
410std::shared_ptr<Schema> Schema::AddMetadata(
411 const std::shared_ptr<const KeyValueMetadata>& metadata) const {
412 return std::make_shared<Schema>(fields_, metadata);
413}
414
415std::shared_ptr<const KeyValueMetadata> Schema::metadata() const { return metadata_; }
416
417std::shared_ptr<Schema> Schema::RemoveMetadata() const {
418 return std::make_shared<Schema>(fields_);
419}
420
421Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const {
422 if (i < 0 || i >= this->num_fields()) {
423 return Status::Invalid("Invalid column index to remove field.");
424 }
425
426 *out = std::make_shared<Schema>(internal::DeleteVectorElement(fields_, i), metadata_);
427 return Status::OK();
428}
429
430std::string Schema::ToString() const {
431 std::stringstream buffer;
432
433 int i = 0;
434 for (auto field : fields_) {
435 if (i > 0) {
436 buffer << std::endl;
437 }
438 buffer << field->ToString();
439 ++i;
440 }
441
442 if (metadata_) {
443 buffer << metadata_->ToString();
444 }
445
446 return buffer.str();
447}
448
449std::shared_ptr<Schema> schema(const std::vector<std::shared_ptr<Field>>& fields,
450 const std::shared_ptr<const KeyValueMetadata>& metadata) {
451 return std::make_shared<Schema>(fields, metadata);
452}
453
454std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>>&& fields,
455 const std::shared_ptr<const KeyValueMetadata>& metadata) {
456 return std::make_shared<Schema>(std::move(fields), metadata);
457}
458
459// ----------------------------------------------------------------------
460// Visitors and factory functions
461
462#define ACCEPT_VISITOR(TYPE) \
463 Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); }
464
465ACCEPT_VISITOR(NullType)
466ACCEPT_VISITOR(BooleanType)
467ACCEPT_VISITOR(BinaryType)
468ACCEPT_VISITOR(FixedSizeBinaryType)
469ACCEPT_VISITOR(StringType)
470ACCEPT_VISITOR(ListType)
471ACCEPT_VISITOR(StructType)
472ACCEPT_VISITOR(Decimal128Type)
473ACCEPT_VISITOR(UnionType)
474ACCEPT_VISITOR(Date32Type)
475ACCEPT_VISITOR(Date64Type)
476ACCEPT_VISITOR(Time32Type)
477ACCEPT_VISITOR(Time64Type)
478ACCEPT_VISITOR(TimestampType)
479ACCEPT_VISITOR(IntervalType)
480ACCEPT_VISITOR(DictionaryType)
481
482#define TYPE_FACTORY(NAME, KLASS) \
483 std::shared_ptr<DataType> NAME() { \
484 static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
485 return result; \
486 }
487
488TYPE_FACTORY(null, NullType)
489TYPE_FACTORY(boolean, BooleanType)
490TYPE_FACTORY(int8, Int8Type)
491TYPE_FACTORY(uint8, UInt8Type)
492TYPE_FACTORY(int16, Int16Type)
493TYPE_FACTORY(uint16, UInt16Type)
494TYPE_FACTORY(int32, Int32Type)
495TYPE_FACTORY(uint32, UInt32Type)
496TYPE_FACTORY(int64, Int64Type)
497TYPE_FACTORY(uint64, UInt64Type)
498TYPE_FACTORY(float16, HalfFloatType)
499TYPE_FACTORY(float32, FloatType)
500TYPE_FACTORY(float64, DoubleType)
501TYPE_FACTORY(utf8, StringType)
502TYPE_FACTORY(binary, BinaryType)
503TYPE_FACTORY(date64, Date64Type)
504TYPE_FACTORY(date32, Date32Type)
505
506std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) {
507 return std::make_shared<FixedSizeBinaryType>(byte_width);
508}
509
510std::shared_ptr<DataType> timestamp(TimeUnit::type unit) {
511 return std::make_shared<TimestampType>(unit);
512}
513
514std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) {
515 return std::make_shared<TimestampType>(unit, timezone);
516}
517
518std::shared_ptr<DataType> time32(TimeUnit::type unit) {
519 return std::make_shared<Time32Type>(unit);
520}
521
522std::shared_ptr<DataType> time64(TimeUnit::type unit) {
523 return std::make_shared<Time64Type>(unit);
524}
525
526std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
527 return std::make_shared<ListType>(value_type);
528}
529
530std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
531 return std::make_shared<ListType>(value_field);
532}
533
534std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
535 return std::make_shared<StructType>(fields);
536}
537
538std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
539 const std::vector<uint8_t>& type_codes,
540 UnionMode::type mode) {
541 return std::make_shared<UnionType>(child_fields, type_codes, mode);
542}
543
544std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children,
545 UnionMode::type mode) {
546 std::vector<std::shared_ptr<Field>> types;
547 std::vector<uint8_t> type_codes;
548 uint8_t counter = 0;
549 for (const auto& child : children) {
550 types.push_back(field(std::to_string(counter), child->type()));
551 type_codes.push_back(counter);
552 counter++;
553 }
554 return union_(types, type_codes, mode);
555}
556
557std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
558 const std::shared_ptr<Array>& dict_values,
559 bool ordered) {
560 return std::make_shared<DictionaryType>(index_type, dict_values, ordered);
561}
562
563std::shared_ptr<Field> field(const std::string& name,
564 const std::shared_ptr<DataType>& type, bool nullable,
565 const std::shared_ptr<const KeyValueMetadata>& metadata) {
566 return std::make_shared<Field>(name, type, nullable, metadata);
567}
568
569std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
570 return std::make_shared<Decimal128Type>(precision, scale);
571}
572
573std::string Decimal128Type::ToString() const {
574 std::stringstream s;
575 s << "decimal(" << precision_ << ", " << scale_ << ")";
576 return s.str();
577}
578
579} // namespace arrow
580