1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/type.h" |
19 | |
20 | #include <climits> |
21 | #include <cstddef> |
22 | #include <sstream> // IWYU pragma: keep |
23 | #include <string> |
24 | #include <utility> |
25 | #include <vector> |
26 | |
27 | #include "arrow/array.h" |
28 | #include "arrow/compare.h" |
29 | #include "arrow/status.h" |
30 | #include "arrow/util/checked_cast.h" |
31 | #include "arrow/util/key_value_metadata.h" |
32 | #include "arrow/util/logging.h" |
33 | #include "arrow/util/stl.h" |
34 | #include "arrow/visitor.h" |
35 | |
36 | namespace arrow { |
37 | |
38 | using internal::checked_cast; |
39 | |
40 | bool Field::HasMetadata() const { |
41 | return (metadata_ != nullptr) && (metadata_->size() > 0); |
42 | } |
43 | |
44 | std::shared_ptr<Field> Field::AddMetadata( |
45 | const std::shared_ptr<const KeyValueMetadata>& metadata) const { |
46 | return std::make_shared<Field>(name_, type_, nullable_, metadata); |
47 | } |
48 | |
49 | std::shared_ptr<Field> Field::RemoveMetadata() const { |
50 | return std::make_shared<Field>(name_, type_, nullable_); |
51 | } |
52 | |
53 | std::vector<std::shared_ptr<Field>> Field::Flatten() const { |
54 | std::vector<std::shared_ptr<Field>> flattened; |
55 | if (type_->id() == Type::STRUCT) { |
56 | for (const auto& child : type_->children()) { |
57 | auto flattened_child = std::make_shared<Field>(*child); |
58 | flattened.push_back(flattened_child); |
59 | flattened_child->name_.insert(0, name() + "." ); |
60 | flattened_child->nullable_ |= nullable_; |
61 | } |
62 | } else { |
63 | flattened.push_back(std::make_shared<Field>(*this)); |
64 | } |
65 | return flattened; |
66 | } |
67 | |
68 | bool Field::Equals(const Field& other, bool check_metadata) const { |
69 | if (this == &other) { |
70 | return true; |
71 | } |
72 | if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && |
73 | this->type_->Equals(*other.type_.get())) { |
74 | if (!check_metadata) { |
75 | return true; |
76 | } else if (this->HasMetadata() && other.HasMetadata()) { |
77 | return metadata_->Equals(*other.metadata_); |
78 | } else if (!this->HasMetadata() && !other.HasMetadata()) { |
79 | return true; |
80 | } else { |
81 | return false; |
82 | } |
83 | } |
84 | return false; |
85 | } |
86 | |
87 | bool Field::Equals(const std::shared_ptr<Field>& other, bool check_metadata) const { |
88 | return Equals(*other.get(), check_metadata); |
89 | } |
90 | |
91 | std::string Field::ToString() const { |
92 | std::stringstream ss; |
93 | ss << this->name_ << ": " << this->type_->ToString(); |
94 | if (!this->nullable_) { |
95 | ss << " not null" ; |
96 | } |
97 | return ss.str(); |
98 | } |
99 | |
100 | DataType::~DataType() {} |
101 | |
102 | bool DataType::Equals(const DataType& other) const { return TypeEquals(*this, other); } |
103 | |
104 | bool DataType::Equals(const std::shared_ptr<DataType>& other) const { |
105 | if (!other) { |
106 | return false; |
107 | } |
108 | return Equals(*other.get()); |
109 | } |
110 | |
111 | std::string BooleanType::ToString() const { return name(); } |
112 | |
113 | FloatingPoint::Precision HalfFloatType::precision() const { return FloatingPoint::HALF; } |
114 | |
115 | FloatingPoint::Precision FloatType::precision() const { return FloatingPoint::SINGLE; } |
116 | |
117 | FloatingPoint::Precision DoubleType::precision() const { return FloatingPoint::DOUBLE; } |
118 | |
119 | std::string StringType::ToString() const { return std::string("string" ); } |
120 | |
121 | std::string ListType::ToString() const { |
122 | std::stringstream s; |
123 | s << "list<" << value_field()->ToString() << ">" ; |
124 | return s.str(); |
125 | } |
126 | |
127 | std::string BinaryType::ToString() const { return std::string("binary" ); } |
128 | |
129 | int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } |
130 | |
131 | std::string FixedSizeBinaryType::ToString() const { |
132 | std::stringstream ss; |
133 | ss << "fixed_size_binary[" << byte_width_ << "]" ; |
134 | return ss.str(); |
135 | } |
136 | |
137 | // ---------------------------------------------------------------------- |
138 | // Date types |
139 | |
140 | DateType::DateType(Type::type type_id) : FixedWidthType(type_id) {} |
141 | |
142 | Date32Type::Date32Type() : DateType(Type::DATE32) {} |
143 | |
144 | Date64Type::Date64Type() : DateType(Type::DATE64) {} |
145 | |
146 | std::string Date64Type::ToString() const { return std::string("date64[ms]" ); } |
147 | |
148 | std::string Date32Type::ToString() const { return std::string("date32[day]" ); } |
149 | |
150 | // ---------------------------------------------------------------------- |
151 | // Time types |
152 | |
153 | TimeType::TimeType(Type::type type_id, TimeUnit::type unit) |
154 | : FixedWidthType(type_id), unit_(unit) {} |
155 | |
// Constructs a TIME32 type with the given unit.
//
// The unit is checked with DCHECK to be SECOND or MILLI.
// NOTE(review): DCHECK is presumably a debug-only assertion from
// arrow/util/logging.h — confirm what happens for other units in release
// builds.
Time32Type::Time32Type(TimeUnit::type unit) : TimeType(Type::TIME32, unit) {
  DCHECK(unit == TimeUnit::SECOND || unit == TimeUnit::MILLI)
      << "Must be seconds or milliseconds";
}
160 | |
161 | std::string Time32Type::ToString() const { |
162 | std::stringstream ss; |
163 | ss << "time32[" << this->unit_ << "]" ; |
164 | return ss.str(); |
165 | } |
166 | |
// Constructs a TIME64 type with the given unit.
//
// The unit is checked with DCHECK to be MICRO or NANO.
// NOTE(review): DCHECK is presumably a debug-only assertion from
// arrow/util/logging.h — confirm what happens for other units in release
// builds.
Time64Type::Time64Type(TimeUnit::type unit) : TimeType(Type::TIME64, unit) {
  DCHECK(unit == TimeUnit::MICRO || unit == TimeUnit::NANO)
      << "Must be microseconds or nanoseconds";
}
171 | |
172 | std::string Time64Type::ToString() const { |
173 | std::stringstream ss; |
174 | ss << "time64[" << this->unit_ << "]" ; |
175 | return ss.str(); |
176 | } |
177 | |
178 | // ---------------------------------------------------------------------- |
179 | // Timestamp types |
180 | |
181 | std::string TimestampType::ToString() const { |
182 | std::stringstream ss; |
183 | ss << "timestamp[" << this->unit_; |
184 | if (this->timezone_.size() > 0) { |
185 | ss << ", tz=" << this->timezone_; |
186 | } |
187 | ss << "]" ; |
188 | return ss.str(); |
189 | } |
190 | |
191 | // ---------------------------------------------------------------------- |
192 | // Union type |
193 | |
194 | UnionType::UnionType(const std::vector<std::shared_ptr<Field>>& fields, |
195 | const std::vector<uint8_t>& type_codes, UnionMode::type mode) |
196 | : NestedType(Type::UNION), mode_(mode), type_codes_(type_codes) { |
197 | children_ = fields; |
198 | } |
199 | |
200 | std::string UnionType::ToString() const { |
201 | std::stringstream s; |
202 | |
203 | if (mode_ == UnionMode::SPARSE) { |
204 | s << "union[sparse]<" ; |
205 | } else { |
206 | s << "union[dense]<" ; |
207 | } |
208 | |
209 | for (size_t i = 0; i < children_.size(); ++i) { |
210 | if (i) { |
211 | s << ", " ; |
212 | } |
213 | s << children_[i]->ToString() << "=" << static_cast<int>(type_codes_[i]); |
214 | } |
215 | s << ">" ; |
216 | return s.str(); |
217 | } |
218 | |
219 | // ---------------------------------------------------------------------- |
220 | // Struct type |
221 | |
222 | namespace { |
223 | |
224 | std::unordered_map<std::string, int> CreateNameToIndexMap( |
225 | const std::vector<std::shared_ptr<Field>>& fields) { |
226 | std::unordered_map<std::string, int> name_to_index; |
227 | for (size_t i = 0; i < fields.size(); ++i) { |
228 | name_to_index[fields[i]->name()] = static_cast<int>(i); |
229 | } |
230 | return name_to_index; |
231 | } |
232 | |
233 | } // namespace |
234 | |
235 | StructType::StructType(const std::vector<std::shared_ptr<Field>>& fields) |
236 | : NestedType(Type::STRUCT), name_to_index_(CreateNameToIndexMap(fields)) { |
237 | children_ = fields; |
238 | } |
239 | |
240 | std::string StructType::ToString() const { |
241 | std::stringstream s; |
242 | s << "struct<" ; |
243 | for (int i = 0; i < this->num_children(); ++i) { |
244 | if (i > 0) { |
245 | s << ", " ; |
246 | } |
247 | std::shared_ptr<Field> field = this->child(i); |
248 | s << field->name() << ": " << field->type()->ToString(); |
249 | } |
250 | s << ">" ; |
251 | return s.str(); |
252 | } |
253 | |
254 | std::shared_ptr<Field> StructType::GetFieldByName(const std::string& name) const { |
255 | int i = GetFieldIndex(name); |
256 | return i == -1 ? nullptr : children_[i]; |
257 | } |
258 | |
259 | int StructType::GetFieldIndex(const std::string& name) const { |
260 | if (name_to_index_.size() < children_.size()) { |
261 | // There are duplicate field names. Refuse to guess |
262 | int counts = 0; |
263 | int last_observed_index = -1; |
264 | for (size_t i = 0; i < children_.size(); ++i) { |
265 | if (children_[i]->name() == name) { |
266 | ++counts; |
267 | last_observed_index = static_cast<int>(i); |
268 | } |
269 | } |
270 | |
271 | if (counts == 1) { |
272 | return last_observed_index; |
273 | } else { |
274 | // Duplicate or not found |
275 | return -1; |
276 | } |
277 | } |
278 | |
279 | auto it = name_to_index_.find(name); |
280 | if (it == name_to_index_.end()) { |
281 | return -1; |
282 | } else { |
283 | return it->second; |
284 | } |
285 | } |
286 | |
287 | std::shared_ptr<Field> StructType::GetChildByName(const std::string& name) const { |
288 | return GetFieldByName(name); |
289 | } |
290 | |
291 | int StructType::GetChildIndex(const std::string& name) const { |
292 | return GetFieldIndex(name); |
293 | } |
294 | |
295 | // ---------------------------------------------------------------------- |
296 | // DictionaryType |
297 | |
// Constructs a DICTIONARY type from an index type, a dictionary array and an
// "ordered" flag; each argument is stored in the matching member.
DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
                               const std::shared_ptr<Array>& dictionary, bool ordered)
    : FixedWidthType(Type::DICTIONARY),
      index_type_(index_type),
      dictionary_(dictionary),
      ordered_(ordered) {
#ifndef NDEBUG
  // Compiled only when NDEBUG is not defined: view the index type as an
  // Integer and check that it reports itself as signed.
  const auto& int_type = checked_cast<const Integer&>(*index_type);
  DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed";
#endif
}
309 | |
310 | int DictionaryType::bit_width() const { |
311 | return checked_cast<const FixedWidthType&>(*index_type_).bit_width(); |
312 | } |
313 | |
314 | std::shared_ptr<Array> DictionaryType::dictionary() const { return dictionary_; } |
315 | |
316 | std::string DictionaryType::ToString() const { |
317 | std::stringstream ss; |
318 | ss << "dictionary<values=" << dictionary_->type()->ToString() |
319 | << ", indices=" << index_type_->ToString() << ", ordered=" << ordered_ << ">" ; |
320 | return ss.str(); |
321 | } |
322 | |
323 | // ---------------------------------------------------------------------- |
324 | // Null type |
325 | |
326 | std::string NullType::ToString() const { return name(); } |
327 | |
328 | // ---------------------------------------------------------------------- |
329 | // Schema implementation |
330 | |
331 | Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields, |
332 | const std::shared_ptr<const KeyValueMetadata>& metadata) |
333 | : fields_(fields), |
334 | name_to_index_(CreateNameToIndexMap(fields_)), |
335 | metadata_(metadata) {} |
336 | |
337 | Schema::Schema(std::vector<std::shared_ptr<Field>>&& fields, |
338 | const std::shared_ptr<const KeyValueMetadata>& metadata) |
339 | : fields_(std::move(fields)), |
340 | name_to_index_(CreateNameToIndexMap(fields_)), |
341 | metadata_(metadata) {} |
342 | |
343 | bool Schema::Equals(const Schema& other, bool check_metadata) const { |
344 | if (this == &other) { |
345 | return true; |
346 | } |
347 | |
348 | // checks field equality |
349 | if (num_fields() != other.num_fields()) { |
350 | return false; |
351 | } |
352 | for (int i = 0; i < num_fields(); ++i) { |
353 | if (!field(i)->Equals(*other.field(i).get(), check_metadata)) { |
354 | return false; |
355 | } |
356 | } |
357 | |
358 | // check metadata equality |
359 | if (!check_metadata) { |
360 | return true; |
361 | } else if (this->HasMetadata() && other.HasMetadata()) { |
362 | return metadata_->Equals(*other.metadata_); |
363 | } else if (!this->HasMetadata() && !other.HasMetadata()) { |
364 | return true; |
365 | } else { |
366 | return false; |
367 | } |
368 | } |
369 | |
370 | std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) const { |
371 | int64_t i = GetFieldIndex(name); |
372 | return i == -1 ? nullptr : fields_[i]; |
373 | } |
374 | |
375 | int64_t Schema::GetFieldIndex(const std::string& name) const { |
376 | auto it = name_to_index_.find(name); |
377 | if (it == name_to_index_.end()) { |
378 | return -1; |
379 | } else { |
380 | return it->second; |
381 | } |
382 | } |
383 | |
384 | Status Schema::AddField(int i, const std::shared_ptr<Field>& field, |
385 | std::shared_ptr<Schema>* out) const { |
386 | if (i < 0 || i > this->num_fields()) { |
387 | return Status::Invalid("Invalid column index to add field." ); |
388 | } |
389 | |
390 | *out = |
391 | std::make_shared<Schema>(internal::AddVectorElement(fields_, i, field), metadata_); |
392 | return Status::OK(); |
393 | } |
394 | |
395 | Status Schema::SetField(int i, const std::shared_ptr<Field>& field, |
396 | std::shared_ptr<Schema>* out) const { |
397 | if (i < 0 || i > this->num_fields()) { |
398 | return Status::Invalid("Invalid column index to add field." ); |
399 | } |
400 | |
401 | *out = std::make_shared<Schema>(internal::ReplaceVectorElement(fields_, i, field), |
402 | metadata_); |
403 | return Status::OK(); |
404 | } |
405 | |
406 | bool Schema::HasMetadata() const { |
407 | return (metadata_ != nullptr) && (metadata_->size() > 0); |
408 | } |
409 | |
410 | std::shared_ptr<Schema> Schema::AddMetadata( |
411 | const std::shared_ptr<const KeyValueMetadata>& metadata) const { |
412 | return std::make_shared<Schema>(fields_, metadata); |
413 | } |
414 | |
415 | std::shared_ptr<const KeyValueMetadata> Schema::metadata() const { return metadata_; } |
416 | |
417 | std::shared_ptr<Schema> Schema::RemoveMetadata() const { |
418 | return std::make_shared<Schema>(fields_); |
419 | } |
420 | |
421 | Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const { |
422 | if (i < 0 || i >= this->num_fields()) { |
423 | return Status::Invalid("Invalid column index to remove field." ); |
424 | } |
425 | |
426 | *out = std::make_shared<Schema>(internal::DeleteVectorElement(fields_, i), metadata_); |
427 | return Status::OK(); |
428 | } |
429 | |
430 | std::string Schema::ToString() const { |
431 | std::stringstream buffer; |
432 | |
433 | int i = 0; |
434 | for (auto field : fields_) { |
435 | if (i > 0) { |
436 | buffer << std::endl; |
437 | } |
438 | buffer << field->ToString(); |
439 | ++i; |
440 | } |
441 | |
442 | if (metadata_) { |
443 | buffer << metadata_->ToString(); |
444 | } |
445 | |
446 | return buffer.str(); |
447 | } |
448 | |
449 | std::shared_ptr<Schema> schema(const std::vector<std::shared_ptr<Field>>& fields, |
450 | const std::shared_ptr<const KeyValueMetadata>& metadata) { |
451 | return std::make_shared<Schema>(fields, metadata); |
452 | } |
453 | |
454 | std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>>&& fields, |
455 | const std::shared_ptr<const KeyValueMetadata>& metadata) { |
456 | return std::make_shared<Schema>(std::move(fields), metadata); |
457 | } |
458 | |
459 | // ---------------------------------------------------------------------- |
460 | // Visitors and factory functions |
461 | |
462 | #define ACCEPT_VISITOR(TYPE) \ |
463 | Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); } |
464 | |
465 | ACCEPT_VISITOR(NullType) |
466 | ACCEPT_VISITOR(BooleanType) |
467 | ACCEPT_VISITOR(BinaryType) |
468 | ACCEPT_VISITOR(FixedSizeBinaryType) |
469 | ACCEPT_VISITOR(StringType) |
470 | ACCEPT_VISITOR(ListType) |
471 | ACCEPT_VISITOR(StructType) |
472 | ACCEPT_VISITOR(Decimal128Type) |
473 | ACCEPT_VISITOR(UnionType) |
474 | ACCEPT_VISITOR(Date32Type) |
475 | ACCEPT_VISITOR(Date64Type) |
476 | ACCEPT_VISITOR(Time32Type) |
477 | ACCEPT_VISITOR(Time64Type) |
478 | ACCEPT_VISITOR(TimestampType) |
479 | ACCEPT_VISITOR(IntervalType) |
480 | ACCEPT_VISITOR(DictionaryType) |
481 | |
482 | #define TYPE_FACTORY(NAME, KLASS) \ |
483 | std::shared_ptr<DataType> NAME() { \ |
484 | static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \ |
485 | return result; \ |
486 | } |
487 | |
488 | TYPE_FACTORY(null, NullType) |
489 | TYPE_FACTORY(boolean, BooleanType) |
490 | TYPE_FACTORY(int8, Int8Type) |
491 | TYPE_FACTORY(uint8, UInt8Type) |
492 | TYPE_FACTORY(int16, Int16Type) |
493 | TYPE_FACTORY(uint16, UInt16Type) |
494 | TYPE_FACTORY(int32, Int32Type) |
495 | TYPE_FACTORY(uint32, UInt32Type) |
496 | TYPE_FACTORY(int64, Int64Type) |
497 | TYPE_FACTORY(uint64, UInt64Type) |
498 | TYPE_FACTORY(float16, HalfFloatType) |
499 | TYPE_FACTORY(float32, FloatType) |
500 | TYPE_FACTORY(float64, DoubleType) |
501 | TYPE_FACTORY(utf8, StringType) |
502 | TYPE_FACTORY(binary, BinaryType) |
503 | TYPE_FACTORY(date64, Date64Type) |
504 | TYPE_FACTORY(date32, Date32Type) |
505 | |
506 | std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width) { |
507 | return std::make_shared<FixedSizeBinaryType>(byte_width); |
508 | } |
509 | |
510 | std::shared_ptr<DataType> timestamp(TimeUnit::type unit) { |
511 | return std::make_shared<TimestampType>(unit); |
512 | } |
513 | |
514 | std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone) { |
515 | return std::make_shared<TimestampType>(unit, timezone); |
516 | } |
517 | |
518 | std::shared_ptr<DataType> time32(TimeUnit::type unit) { |
519 | return std::make_shared<Time32Type>(unit); |
520 | } |
521 | |
522 | std::shared_ptr<DataType> time64(TimeUnit::type unit) { |
523 | return std::make_shared<Time64Type>(unit); |
524 | } |
525 | |
526 | std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) { |
527 | return std::make_shared<ListType>(value_type); |
528 | } |
529 | |
530 | std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) { |
531 | return std::make_shared<ListType>(value_field); |
532 | } |
533 | |
534 | std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) { |
535 | return std::make_shared<StructType>(fields); |
536 | } |
537 | |
538 | std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields, |
539 | const std::vector<uint8_t>& type_codes, |
540 | UnionMode::type mode) { |
541 | return std::make_shared<UnionType>(child_fields, type_codes, mode); |
542 | } |
543 | |
544 | std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Array>>& children, |
545 | UnionMode::type mode) { |
546 | std::vector<std::shared_ptr<Field>> types; |
547 | std::vector<uint8_t> type_codes; |
548 | uint8_t counter = 0; |
549 | for (const auto& child : children) { |
550 | types.push_back(field(std::to_string(counter), child->type())); |
551 | type_codes.push_back(counter); |
552 | counter++; |
553 | } |
554 | return union_(types, type_codes, mode); |
555 | } |
556 | |
557 | std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type, |
558 | const std::shared_ptr<Array>& dict_values, |
559 | bool ordered) { |
560 | return std::make_shared<DictionaryType>(index_type, dict_values, ordered); |
561 | } |
562 | |
563 | std::shared_ptr<Field> field(const std::string& name, |
564 | const std::shared_ptr<DataType>& type, bool nullable, |
565 | const std::shared_ptr<const KeyValueMetadata>& metadata) { |
566 | return std::make_shared<Field>(name, type, nullable, metadata); |
567 | } |
568 | |
569 | std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) { |
570 | return std::make_shared<Decimal128Type>(precision, scale); |
571 | } |
572 | |
573 | std::string Decimal128Type::ToString() const { |
574 | std::stringstream s; |
575 | s << "decimal(" << precision_ << ", " << scale_ << ")" ; |
576 | return s.str(); |
577 | } |
578 | |
579 | } // namespace arrow |
580 | |