1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_TYPE_H |
19 | #define ARROW_TYPE_H |
20 | |
21 | #include <climits> |
22 | #include <cstdint> |
23 | #include <memory> |
24 | #include <ostream> |
25 | #include <string> |
26 | #include <type_traits> |
27 | #include <unordered_map> |
28 | #include <vector> |
29 | |
30 | #include "arrow/status.h" |
31 | #include "arrow/type_fwd.h" // IWYU pragma: export |
32 | #include "arrow/util/checked_cast.h" |
33 | #include "arrow/util/key_value_metadata.h" // IWYU pragma: export |
34 | #include "arrow/util/macros.h" |
35 | #include "arrow/util/visibility.h" |
36 | #include "arrow/visitor.h" // IWYU pragma: keep |
37 | |
38 | namespace arrow { |
39 | |
// Forward declarations of classes that the declarations in this header refer
// to through pointers or std::shared_ptr before (or without) seeing their
// full definition.
class Array;
class Field;
class MemoryPool;
43 | |
struct Type {
  /// \brief Enumeration of the main data type categories
  ///
  /// The enumerator attached to a DataType instance is a cheap way to find
  /// out which category that instance belongs to. Enumerators are numbered
  /// consecutively starting from zero, in declaration order.
  enum type {
    /// NULL type; has no physical storage
    NA = 0,

    /// Boolean stored as a single bit (LSB bit-packed ordering)
    BOOL,

    /// 8-bit little-endian integer, unsigned
    UINT8,

    /// 8-bit little-endian integer, signed
    INT8,

    /// 16-bit little-endian integer, unsigned
    UINT16,

    /// 16-bit little-endian integer, signed
    INT16,

    /// 32-bit little-endian integer, unsigned
    UINT32,

    /// 32-bit little-endian integer, signed
    INT32,

    /// 64-bit little-endian integer, unsigned
    UINT64,

    /// 64-bit little-endian integer, signed
    INT64,

    /// Floating point value occupying 2 bytes
    HALF_FLOAT,

    /// Floating point value occupying 4 bytes
    FLOAT,

    /// Floating point value occupying 8 bytes
    DOUBLE,

    /// Variable-length UTF8 string, as List<Char>
    STRING,

    /// Variable-length bytes, with no guarantee that they are valid UTF8
    BINARY,

    /// Fixed-size binary: every value occupies the same number of bytes
    FIXED_SIZE_BINARY,

    /// Days since the UNIX epoch, as int32_t
    DATE32,

    /// Milliseconds since the UNIX epoch, as int64_t
    DATE64,

    /// Exact timestamp, encoded as an int64 counted from the UNIX epoch.
    /// The default unit is the millisecond
    TIMESTAMP,

    /// Time since midnight as a signed 32-bit integer, counted in either
    /// seconds or milliseconds
    TIME32,

    /// Time since midnight as a signed 64-bit integer, counted in either
    /// microseconds or nanoseconds
    TIME64,

    /// SQL-style interval, either YEAR_MONTH or DAY_TIME
    INTERVAL,

    /// Decimal type defined by a precision and a scale. The storage type
    /// depends on those parameters.
    DECIMAL,

    /// List of values of some logical data type
    LIST,

    /// Struct made of logical types
    STRUCT,

    /// Union of logical types
    UNION,

    /// Dictionary (also known as Category) type
    DICTIONARY,

    /// Map: a repeated struct logical type
    MAP
  };
};
139 | |
140 | /// \brief Base class for all data types |
141 | /// |
142 | /// Data types in this library are all *logical*. They can be expressed as |
143 | /// either a primitive physical type (bytes or bits of some fixed size), a |
144 | /// nested type consisting of other data types, or another data type (e.g. a |
145 | /// timestamp encoded as an int64). |
146 | /// |
147 | /// Simple datatypes may be entirely described by their Type::type id, but |
148 | /// complex datatypes are usually parametric. |
149 | class ARROW_EXPORT DataType { |
150 | public: |
151 | explicit DataType(Type::type id) : id_(id) {} |
152 | virtual ~DataType(); |
153 | |
154 | /// \brief Return whether the types are equal |
155 | /// |
156 | /// Types that are logically convertible from one to another (e.g. List<UInt8> |
157 | /// and Binary) are NOT equal. |
158 | virtual bool Equals(const DataType& other) const; |
159 | /// \brief Return whether the types are equal |
160 | bool Equals(const std::shared_ptr<DataType>& other) const; |
161 | |
162 | std::shared_ptr<Field> child(int i) const { return children_[i]; } |
163 | |
164 | const std::vector<std::shared_ptr<Field>>& children() const { return children_; } |
165 | |
166 | int num_children() const { return static_cast<int>(children_.size()); } |
167 | |
168 | virtual Status Accept(TypeVisitor* visitor) const = 0; |
169 | |
170 | /// \brief A string representation of the type, including any children |
171 | virtual std::string ToString() const = 0; |
172 | |
173 | /// \brief A string name of the type, omitting any child fields |
174 | /// |
175 | /// \note Experimental API |
176 | /// \since 0.7.0 |
177 | virtual std::string name() const = 0; |
178 | |
179 | /// \brief Return the type category |
180 | Type::type id() const { return id_; } |
181 | |
182 | protected: |
183 | Type::type id_; |
184 | std::vector<std::shared_ptr<Field>> children_; |
185 | |
186 | private: |
187 | ARROW_DISALLOW_COPY_AND_ASSIGN(DataType); |
188 | }; |
189 | |
190 | inline std::ostream& operator<<(std::ostream& os, const DataType& type) { |
191 | os << type.ToString(); |
192 | return os; |
193 | } |
194 | |
195 | /// \brief Base class for all fixed-width data types |
196 | class ARROW_EXPORT FixedWidthType : public DataType { |
197 | public: |
198 | using DataType::DataType; |
199 | |
200 | virtual int bit_width() const = 0; |
201 | }; |
202 | |
203 | /// \brief Base class for all data types representing primitive values |
204 | class ARROW_EXPORT PrimitiveCType : public FixedWidthType { |
205 | public: |
206 | using FixedWidthType::FixedWidthType; |
207 | }; |
208 | |
209 | /// \brief Base class for all numeric data types |
210 | class ARROW_EXPORT Number : public PrimitiveCType { |
211 | public: |
212 | using PrimitiveCType::PrimitiveCType; |
213 | }; |
214 | |
215 | /// \brief Base class for all integral data types |
216 | class ARROW_EXPORT Integer : public Number { |
217 | public: |
218 | using Number::Number; |
219 | virtual bool is_signed() const = 0; |
220 | }; |
221 | |
222 | /// \brief Base class for all floating-point data types |
223 | class ARROW_EXPORT FloatingPoint : public Number { |
224 | public: |
225 | using Number::Number; |
226 | enum Precision { HALF, SINGLE, DOUBLE }; |
227 | virtual Precision precision() const = 0; |
228 | }; |
229 | |
/// \brief Common base of every parametric data type
///
/// The class is empty; it carries no members of its own.
class ParametricType {};
232 | |
233 | class ARROW_EXPORT NestedType : public DataType, public ParametricType { |
234 | public: |
235 | using DataType::DataType; |
236 | }; |
237 | |
/// \brief Empty marker base class
///
/// NullType, BooleanType and BinaryType in this header list it as a base
/// class. It has no members. The class must be named: an unnamed `class {};`
/// declares nothing and leaves those base-class references unresolved.
class NoExtraMeta {};
239 | |
240 | /// \brief The combination of a field name and data type, with optional metadata |
241 | /// |
242 | /// Fields are used to describe the individual constituents of a |
243 | /// nested DataType or a Schema. |
244 | /// |
245 | /// A field's metadata is represented by a KeyValueMetadata instance, |
246 | /// which holds arbitrary key-value pairs. |
247 | class ARROW_EXPORT Field { |
248 | public: |
249 | Field(const std::string& name, const std::shared_ptr<DataType>& type, |
250 | bool nullable = true, |
251 | const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR) |
252 | : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} |
253 | |
254 | /// \brief Return the field's attached metadata |
255 | std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; } |
256 | |
257 | /// \brief Return whether the field has non-empty metadata |
258 | bool HasMetadata() const; |
259 | |
260 | /// \brief Return a copy of this field with the given metadata attached to it |
261 | std::shared_ptr<Field> AddMetadata( |
262 | const std::shared_ptr<const KeyValueMetadata>& metadata) const; |
263 | /// \brief Return a copy of this field without any metadata attached to it |
264 | std::shared_ptr<Field> RemoveMetadata() const; |
265 | |
266 | std::vector<std::shared_ptr<Field>> Flatten() const; |
267 | |
268 | bool Equals(const Field& other, bool check_metadata = true) const; |
269 | bool Equals(const std::shared_ptr<Field>& other, bool check_metadata = true) const; |
270 | |
271 | /// \brief Return a string representation ot the field |
272 | std::string ToString() const; |
273 | |
274 | /// \brief Return the field name |
275 | const std::string& name() const { return name_; } |
276 | /// \brief Return the field data type |
277 | std::shared_ptr<DataType> type() const { return type_; } |
278 | /// \brief Return whether the field is nullable |
279 | bool nullable() const { return nullable_; } |
280 | |
281 | private: |
282 | // Field name |
283 | std::string name_; |
284 | |
285 | // The field's data type |
286 | std::shared_ptr<DataType> type_; |
287 | |
288 | // Fields can be nullable |
289 | bool nullable_; |
290 | |
291 | // The field's metadata, if any |
292 | std::shared_ptr<const KeyValueMetadata> metadata_; |
293 | }; |
294 | |
295 | namespace detail { |
296 | |
297 | template <typename DERIVED, typename BASE, Type::type TYPE_ID, typename C_TYPE> |
298 | class ARROW_EXPORT CTypeImpl : public BASE { |
299 | public: |
300 | using c_type = C_TYPE; |
301 | static constexpr Type::type type_id = TYPE_ID; |
302 | |
303 | CTypeImpl() : BASE(TYPE_ID) {} |
304 | |
305 | int bit_width() const override { return static_cast<int>(sizeof(C_TYPE) * CHAR_BIT); } |
306 | |
307 | Status Accept(TypeVisitor* visitor) const override { |
308 | return visitor->Visit(internal::checked_cast<const DERIVED&>(*this)); |
309 | } |
310 | |
311 | std::string ToString() const override { return this->name(); } |
312 | }; |
313 | |
314 | template <typename DERIVED, Type::type TYPE_ID, typename C_TYPE> |
315 | class IntegerTypeImpl : public detail::CTypeImpl<DERIVED, Integer, TYPE_ID, C_TYPE> { |
316 | bool is_signed() const override { return std::is_signed<C_TYPE>::value; } |
317 | }; |
318 | |
319 | } // namespace detail |
320 | |
321 | /// Concrete type class for always-null data |
322 | class ARROW_EXPORT NullType : public DataType, public NoExtraMeta { |
323 | public: |
324 | static constexpr Type::type type_id = Type::NA; |
325 | |
326 | NullType() : DataType(Type::NA) {} |
327 | |
328 | Status Accept(TypeVisitor* visitor) const override; |
329 | std::string ToString() const override; |
330 | |
331 | std::string name() const override { return "null" ; } |
332 | }; |
333 | |
334 | /// Concrete type class for boolean data |
335 | class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta { |
336 | public: |
337 | static constexpr Type::type type_id = Type::BOOL; |
338 | |
339 | BooleanType() : FixedWidthType(Type::BOOL) {} |
340 | |
341 | Status Accept(TypeVisitor* visitor) const override; |
342 | std::string ToString() const override; |
343 | |
344 | int bit_width() const override { return 1; } |
345 | std::string name() const override { return "bool" ; } |
346 | }; |
347 | |
348 | /// Concrete type class for unsigned 8-bit integer data |
349 | class ARROW_EXPORT UInt8Type |
350 | : public detail::IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> { |
351 | public: |
352 | std::string name() const override { return "uint8" ; } |
353 | }; |
354 | |
355 | /// Concrete type class for signed 8-bit integer data |
356 | class ARROW_EXPORT Int8Type |
357 | : public detail::IntegerTypeImpl<Int8Type, Type::INT8, int8_t> { |
358 | public: |
359 | std::string name() const override { return "int8" ; } |
360 | }; |
361 | |
362 | /// Concrete type class for unsigned 16-bit integer data |
363 | class ARROW_EXPORT UInt16Type |
364 | : public detail::IntegerTypeImpl<UInt16Type, Type::UINT16, uint16_t> { |
365 | public: |
366 | std::string name() const override { return "uint16" ; } |
367 | }; |
368 | |
369 | /// Concrete type class for signed 16-bit integer data |
370 | class ARROW_EXPORT Int16Type |
371 | : public detail::IntegerTypeImpl<Int16Type, Type::INT16, int16_t> { |
372 | public: |
373 | std::string name() const override { return "int16" ; } |
374 | }; |
375 | |
376 | /// Concrete type class for unsigned 32-bit integer data |
377 | class ARROW_EXPORT UInt32Type |
378 | : public detail::IntegerTypeImpl<UInt32Type, Type::UINT32, uint32_t> { |
379 | public: |
380 | std::string name() const override { return "uint32" ; } |
381 | }; |
382 | |
383 | /// Concrete type class for signed 32-bit integer data |
384 | class ARROW_EXPORT Int32Type |
385 | : public detail::IntegerTypeImpl<Int32Type, Type::INT32, int32_t> { |
386 | public: |
387 | std::string name() const override { return "int32" ; } |
388 | }; |
389 | |
390 | /// Concrete type class for unsigned 64-bit integer data |
391 | class ARROW_EXPORT UInt64Type |
392 | : public detail::IntegerTypeImpl<UInt64Type, Type::UINT64, uint64_t> { |
393 | public: |
394 | std::string name() const override { return "uint64" ; } |
395 | }; |
396 | |
397 | /// Concrete type class for signed 64-bit integer data |
398 | class ARROW_EXPORT Int64Type |
399 | : public detail::IntegerTypeImpl<Int64Type, Type::INT64, int64_t> { |
400 | public: |
401 | std::string name() const override { return "int64" ; } |
402 | }; |
403 | |
404 | /// Concrete type class for 16-bit floating-point data |
405 | class ARROW_EXPORT HalfFloatType |
406 | : public detail::CTypeImpl<HalfFloatType, FloatingPoint, Type::HALF_FLOAT, uint16_t> { |
407 | public: |
408 | Precision precision() const override; |
409 | std::string name() const override { return "halffloat" ; } |
410 | }; |
411 | |
412 | /// Concrete type class for 32-bit floating-point data (C "float") |
413 | class ARROW_EXPORT FloatType |
414 | : public detail::CTypeImpl<FloatType, FloatingPoint, Type::FLOAT, float> { |
415 | public: |
416 | Precision precision() const override; |
417 | std::string name() const override { return "float" ; } |
418 | }; |
419 | |
420 | /// Concrete type class for 64-bit floating-point data (C "double") |
421 | class ARROW_EXPORT DoubleType |
422 | : public detail::CTypeImpl<DoubleType, FloatingPoint, Type::DOUBLE, double> { |
423 | public: |
424 | Precision precision() const override; |
425 | std::string name() const override { return "double" ; } |
426 | }; |
427 | |
428 | /// \brief Concrete type class for list data |
429 | /// |
430 | /// List data is nested data where each value is a variable number of |
431 | /// child items. Lists can be recursively nested, for example |
432 | /// list(list(int32)). |
433 | class ARROW_EXPORT ListType : public NestedType { |
434 | public: |
435 | static constexpr Type::type type_id = Type::LIST; |
436 | |
437 | // List can contain any other logical value type |
438 | explicit ListType(const std::shared_ptr<DataType>& value_type) |
439 | : ListType(std::make_shared<Field>("item" , value_type)) {} |
440 | |
441 | explicit ListType(const std::shared_ptr<Field>& value_field) : NestedType(Type::LIST) { |
442 | children_ = {value_field}; |
443 | } |
444 | |
445 | std::shared_ptr<Field> value_field() const { return children_[0]; } |
446 | |
447 | std::shared_ptr<DataType> value_type() const { return children_[0]->type(); } |
448 | |
449 | Status Accept(TypeVisitor* visitor) const override; |
450 | std::string ToString() const override; |
451 | |
452 | std::string name() const override { return "list" ; } |
453 | }; |
454 | |
455 | /// \brief Concrete type class for variable-size binary data |
456 | class ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { |
457 | public: |
458 | static constexpr Type::type type_id = Type::BINARY; |
459 | |
460 | BinaryType() : BinaryType(Type::BINARY) {} |
461 | |
462 | Status Accept(TypeVisitor* visitor) const override; |
463 | std::string ToString() const override; |
464 | std::string name() const override { return "binary" ; } |
465 | |
466 | protected: |
467 | // Allow subclasses to change the logical type. |
468 | explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} |
469 | }; |
470 | |
471 | /// \brief Concrete type class for fixed-size binary data |
472 | class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public ParametricType { |
473 | public: |
474 | static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY; |
475 | |
476 | explicit FixedSizeBinaryType(int32_t byte_width) |
477 | : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} |
478 | explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id) |
479 | : FixedWidthType(override_type_id), byte_width_(byte_width) {} |
480 | |
481 | Status Accept(TypeVisitor* visitor) const override; |
482 | std::string ToString() const override; |
483 | std::string name() const override { return "fixed_size_binary" ; } |
484 | |
485 | int32_t byte_width() const { return byte_width_; } |
486 | int bit_width() const override; |
487 | |
488 | protected: |
489 | int32_t byte_width_; |
490 | }; |
491 | |
492 | /// \brief Concrete type class for variable-size string data, utf8-encoded |
493 | class ARROW_EXPORT StringType : public BinaryType { |
494 | public: |
495 | static constexpr Type::type type_id = Type::STRING; |
496 | |
497 | StringType() : BinaryType(Type::STRING) {} |
498 | |
499 | Status Accept(TypeVisitor* visitor) const override; |
500 | std::string ToString() const override; |
501 | std::string name() const override { return "utf8" ; } |
502 | }; |
503 | |
504 | /// \brief Concrete type class for struct data |
505 | class ARROW_EXPORT StructType : public NestedType { |
506 | public: |
507 | static constexpr Type::type type_id = Type::STRUCT; |
508 | |
509 | explicit StructType(const std::vector<std::shared_ptr<Field>>& fields); |
510 | |
511 | Status Accept(TypeVisitor* visitor) const override; |
512 | std::string ToString() const override; |
513 | std::string name() const override { return "struct" ; } |
514 | |
515 | /// Returns null if name not found |
516 | std::shared_ptr<Field> GetFieldByName(const std::string& name) const; |
517 | |
518 | /// Returns -1 if name not found or if there are multiple fields having the |
519 | /// same name |
520 | int GetFieldIndex(const std::string& name) const; |
521 | |
522 | ARROW_DEPRECATED("Use GetFieldByName" ) |
523 | std::shared_ptr<Field> GetChildByName(const std::string& name) const; |
524 | |
525 | ARROW_DEPRECATED("Use GetFieldIndex" ) |
526 | int GetChildIndex(const std::string& name) const; |
527 | |
528 | private: |
529 | std::unordered_map<std::string, int> name_to_index_; |
530 | }; |
531 | |
532 | /// \brief Base type class for (fixed-size) decimal data |
533 | class ARROW_EXPORT DecimalType : public FixedSizeBinaryType { |
534 | public: |
535 | explicit DecimalType(int32_t byte_width, int32_t precision, int32_t scale) |
536 | : FixedSizeBinaryType(byte_width, Type::DECIMAL), |
537 | precision_(precision), |
538 | scale_(scale) {} |
539 | |
540 | int32_t precision() const { return precision_; } |
541 | int32_t scale() const { return scale_; } |
542 | |
543 | protected: |
544 | int32_t precision_; |
545 | int32_t scale_; |
546 | }; |
547 | |
548 | /// \brief Concrete type class for 128-bit decimal data |
549 | class ARROW_EXPORT Decimal128Type : public DecimalType { |
550 | public: |
551 | static constexpr Type::type type_id = Type::DECIMAL; |
552 | |
553 | explicit Decimal128Type(int32_t precision, int32_t scale) |
554 | : DecimalType(16, precision, scale) {} |
555 | |
556 | Status Accept(TypeVisitor* visitor) const override; |
557 | std::string ToString() const override; |
558 | std::string name() const override { return "decimal" ; } |
559 | }; |
560 | |
/// \brief Holder for the enumeration of union modes
struct UnionMode {
  /// The two union modes; SPARSE is 0 and DENSE is 1
  enum type {
    SPARSE,
    DENSE
  };
};
564 | |
565 | /// \brief Concrete type class for union data |
566 | class ARROW_EXPORT UnionType : public NestedType { |
567 | public: |
568 | static constexpr Type::type type_id = Type::UNION; |
569 | |
570 | UnionType(const std::vector<std::shared_ptr<Field>>& fields, |
571 | const std::vector<uint8_t>& type_codes, |
572 | UnionMode::type mode = UnionMode::SPARSE); |
573 | |
574 | std::string ToString() const override; |
575 | std::string name() const override { return "union" ; } |
576 | Status Accept(TypeVisitor* visitor) const override; |
577 | |
578 | const std::vector<uint8_t>& type_codes() const { return type_codes_; } |
579 | |
580 | UnionMode::type mode() const { return mode_; } |
581 | |
582 | private: |
583 | UnionMode::type mode_; |
584 | |
585 | // The type id used in the data to indicate each data type in the union. For |
586 | // example, the first type in the union might be denoted by the id 5 (instead |
587 | // of 0). |
588 | std::vector<uint8_t> type_codes_; |
589 | }; |
590 | |
591 | // ---------------------------------------------------------------------- |
592 | // Date and time types |
593 | |
/// \brief Unit of a date type; stored in a char
enum class DateUnit : char {
  DAY = 0,
  MILLI = 1
};
595 | |
596 | /// \brief Base type class for date data |
597 | class ARROW_EXPORT DateType : public FixedWidthType { |
598 | public: |
599 | virtual DateUnit unit() const = 0; |
600 | |
601 | protected: |
602 | explicit DateType(Type::type type_id); |
603 | }; |
604 | |
605 | /// Concrete type class for 32-bit date data (as number of days since UNIX epoch) |
606 | class ARROW_EXPORT Date32Type : public DateType { |
607 | public: |
608 | static constexpr Type::type type_id = Type::DATE32; |
609 | static constexpr DateUnit UNIT = DateUnit::DAY; |
610 | |
611 | using c_type = int32_t; |
612 | |
613 | Date32Type(); |
614 | |
615 | int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); } |
616 | |
617 | Status Accept(TypeVisitor* visitor) const override; |
618 | std::string ToString() const override; |
619 | |
620 | std::string name() const override { return "date32" ; } |
621 | DateUnit unit() const override { return UNIT; } |
622 | }; |
623 | |
624 | /// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch) |
625 | class ARROW_EXPORT Date64Type : public DateType { |
626 | public: |
627 | static constexpr Type::type type_id = Type::DATE64; |
628 | static constexpr DateUnit UNIT = DateUnit::MILLI; |
629 | |
630 | using c_type = int64_t; |
631 | |
632 | Date64Type(); |
633 | |
634 | int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); } |
635 | |
636 | Status Accept(TypeVisitor* visitor) const override; |
637 | std::string ToString() const override; |
638 | |
639 | std::string name() const override { return "date64" ; } |
640 | DateUnit unit() const override { return UNIT; } |
641 | }; |
642 | |
/// \brief Holder for the enumeration of time units
struct TimeUnit {
  /// The unit for a time or timestamp DataType
  enum type {
    SECOND = 0,
    MILLI = 1,
    MICRO = 2,
    NANO = 3
  };
};

/// \brief Write the abbreviation of `unit` to `os`
///
/// SECOND, MILLI, MICRO and NANO are written as "s", "ms", "us" and "ns"
/// respectively. Any other value leaves the stream untouched.
static inline std::ostream& operator<<(std::ostream& os, TimeUnit::type unit) {
  if (unit == TimeUnit::SECOND) {
    os << "s";
  } else if (unit == TimeUnit::MILLI) {
    os << "ms";
  } else if (unit == TimeUnit::MICRO) {
    os << "us";
  } else if (unit == TimeUnit::NANO) {
    os << "ns";
  }
  return os;
}
665 | |
666 | /// Base type class for time data |
667 | class ARROW_EXPORT TimeType : public FixedWidthType, public ParametricType { |
668 | public: |
669 | TimeUnit::type unit() const { return unit_; } |
670 | |
671 | protected: |
672 | TimeType(Type::type type_id, TimeUnit::type unit); |
673 | TimeUnit::type unit_; |
674 | }; |
675 | |
676 | class ARROW_EXPORT Time32Type : public TimeType { |
677 | public: |
678 | static constexpr Type::type type_id = Type::TIME32; |
679 | using c_type = int32_t; |
680 | |
681 | int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); } |
682 | |
683 | explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI); |
684 | |
685 | Status Accept(TypeVisitor* visitor) const override; |
686 | std::string ToString() const override; |
687 | |
688 | std::string name() const override { return "time32" ; } |
689 | }; |
690 | |
691 | class ARROW_EXPORT Time64Type : public TimeType { |
692 | public: |
693 | static constexpr Type::type type_id = Type::TIME64; |
694 | using c_type = int64_t; |
695 | |
696 | int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); } |
697 | |
698 | explicit Time64Type(TimeUnit::type unit = TimeUnit::MILLI); |
699 | |
700 | Status Accept(TypeVisitor* visitor) const override; |
701 | std::string ToString() const override; |
702 | |
703 | std::string name() const override { return "time64" ; } |
704 | }; |
705 | |
706 | class ARROW_EXPORT TimestampType : public FixedWidthType, public ParametricType { |
707 | public: |
708 | using Unit = TimeUnit; |
709 | |
710 | typedef int64_t c_type; |
711 | static constexpr Type::type type_id = Type::TIMESTAMP; |
712 | |
713 | int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); } |
714 | |
715 | explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI) |
716 | : FixedWidthType(Type::TIMESTAMP), unit_(unit) {} |
717 | |
718 | explicit TimestampType(TimeUnit::type unit, const std::string& timezone) |
719 | : FixedWidthType(Type::TIMESTAMP), unit_(unit), timezone_(timezone) {} |
720 | |
721 | Status Accept(TypeVisitor* visitor) const override; |
722 | std::string ToString() const override; |
723 | std::string name() const override { return "timestamp" ; } |
724 | |
725 | TimeUnit::type unit() const { return unit_; } |
726 | const std::string& timezone() const { return timezone_; } |
727 | |
728 | private: |
729 | TimeUnit::type unit_; |
730 | std::string timezone_; |
731 | }; |
732 | |
733 | class ARROW_EXPORT IntervalType : public FixedWidthType { |
734 | public: |
735 | enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 }; |
736 | |
737 | using c_type = int64_t; |
738 | static constexpr Type::type type_id = Type::INTERVAL; |
739 | |
740 | int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); } |
741 | |
742 | explicit IntervalType(Unit unit = Unit::YEAR_MONTH) |
743 | : FixedWidthType(Type::INTERVAL), unit_(unit) {} |
744 | |
745 | Status Accept(TypeVisitor* visitor) const override; |
746 | std::string ToString() const override { return name(); } |
747 | std::string name() const override { return "date" ; } |
748 | |
749 | Unit unit() const { return unit_; } |
750 | |
751 | private: |
752 | Unit unit_; |
753 | }; |
754 | |
755 | // ---------------------------------------------------------------------- |
756 | // DictionaryType (for categorical or dictionary-encoded data) |
757 | |
758 | /// Concrete type class for dictionary data |
759 | class ARROW_EXPORT DictionaryType : public FixedWidthType { |
760 | public: |
761 | static constexpr Type::type type_id = Type::DICTIONARY; |
762 | |
763 | DictionaryType(const std::shared_ptr<DataType>& index_type, |
764 | const std::shared_ptr<Array>& dictionary, bool ordered = false); |
765 | |
766 | int bit_width() const override; |
767 | |
768 | std::shared_ptr<DataType> index_type() const { return index_type_; } |
769 | |
770 | std::shared_ptr<Array> dictionary() const; |
771 | |
772 | Status Accept(TypeVisitor* visitor) const override; |
773 | std::string ToString() const override; |
774 | std::string name() const override { return "dictionary" ; } |
775 | |
776 | bool ordered() const { return ordered_; } |
777 | |
778 | /// \brief Unify several dictionary types |
779 | /// |
780 | /// Compute a resulting dictionary that will allow the union of values |
781 | /// of all input dictionary types. The input types must all have the |
782 | /// same value type. |
783 | /// \param[in] pool Memory pool to allocate dictionary values from |
784 | /// \param[in] types A sequence of input dictionary types |
785 | /// \param[out] out_type The unified dictionary type |
786 | /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors, |
787 | /// one per input type. Each integer vector represents the transposition |
788 | /// of input type indices into unified type indices. |
789 | // XXX Should we return something special (an empty transpose map?) when |
790 | // the transposition is the identity function? |
791 | static Status Unify(MemoryPool* pool, const std::vector<const DataType*>& types, |
792 | std::shared_ptr<DataType>* out_type, |
793 | std::vector<std::vector<int32_t>>* out_transpose_maps = NULLPTR); |
794 | |
795 | private: |
796 | // Must be an integer type (not currently checked) |
797 | std::shared_ptr<DataType> index_type_; |
798 | std::shared_ptr<Array> dictionary_; |
799 | bool ordered_; |
800 | }; |
801 | |
802 | // ---------------------------------------------------------------------- |
803 | // Schema |
804 | |
805 | /// \class Schema |
806 | /// \brief Sequence of arrow::Field objects describing the columns of a record |
807 | /// batch or table data structure |
808 | class ARROW_EXPORT Schema { |
809 | public: |
810 | explicit Schema(const std::vector<std::shared_ptr<Field>>& fields, |
811 | const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR); |
812 | |
813 | explicit Schema(std::vector<std::shared_ptr<Field>>&& fields, |
814 | const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR); |
815 | |
816 | virtual ~Schema() = default; |
817 | |
818 | /// Returns true if all of the schema fields are equal |
819 | bool Equals(const Schema& other, bool check_metadata = true) const; |
820 | |
821 | /// Return the ith schema element. Does not boundscheck |
822 | std::shared_ptr<Field> field(int i) const { return fields_[i]; } |
823 | |
824 | /// Returns null if name not found |
825 | std::shared_ptr<Field> GetFieldByName(const std::string& name) const; |
826 | |
827 | /// Returns -1 if name not found |
828 | int64_t GetFieldIndex(const std::string& name) const; |
829 | |
830 | const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; } |
831 | |
832 | /// \brief The custom key-value metadata, if any |
833 | /// |
834 | /// \return metadata may be null |
835 | std::shared_ptr<const KeyValueMetadata> metadata() const; |
836 | |
837 | /// \brief Render a string representation of the schema suitable for debugging |
838 | std::string ToString() const; |
839 | |
840 | Status AddField(int i, const std::shared_ptr<Field>& field, |
841 | std::shared_ptr<Schema>* out) const; |
842 | Status RemoveField(int i, std::shared_ptr<Schema>* out) const; |
843 | Status SetField(int i, const std::shared_ptr<Field>& field, |
844 | std::shared_ptr<Schema>* out) const; |
845 | |
846 | /// \brief Replace key-value metadata with new metadata |
847 | /// |
848 | /// \param[in] metadata new KeyValueMetadata |
849 | /// \return new Schema |
850 | std::shared_ptr<Schema> AddMetadata( |
851 | const std::shared_ptr<const KeyValueMetadata>& metadata) const; |
852 | |
853 | /// \brief Return copy of Schema without the KeyValueMetadata |
854 | std::shared_ptr<Schema> RemoveMetadata() const; |
855 | |
  /// \brief Indicates that Schema has non-empty KeyValueMetadata
  bool HasMetadata() const;
858 | |
859 | /// \brief Return the number of fields (columns) in the schema |
860 | int num_fields() const { return static_cast<int>(fields_.size()); } |
861 | |
 private:
  // The schema's fields; indexed by field(i) and exposed through fields()
  std::vector<std::shared_ptr<Field>> fields_;

  // Maps a string to an int index. Presumably field name -> position in
  // fields_, backing the by-name lookups -- confirm against the implementation.
  std::unordered_map<std::string, int> name_to_index_;

  // Custom key-value metadata (presumably what metadata() returns; may be null)
  std::shared_ptr<const KeyValueMetadata> metadata_;
868 | }; |
869 | |
870 | // ---------------------------------------------------------------------- |
871 | // Parametric factory functions |
872 | // Other factory functions are in type_fwd.h |
873 | |
874 | /// \addtogroup type-factories |
875 | /// @{ |
876 | |
/// \brief Create a FixedSizeBinaryType instance
///
/// \param[in] byte_width presumably the fixed width, in bytes, of each value
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
880 | |
/// \brief Create a Decimal128Type instance
///
/// \param[in] precision the decimal precision
/// \param[in] scale the decimal scale
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale);
884 | |
/// \brief Create a ListType instance from its child Field type
///
/// \param[in] value_type the child Field
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_type);
888 | |
/// \brief Create a ListType instance from its child DataType
///
/// \param[in] value_type the child DataType
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type);
892 | |
/// \brief Create a TimestampType instance from its unit
///
/// \param[in] unit the time unit of the timestamp
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> timestamp(TimeUnit::type unit);
896 | |
/// \brief Create a TimestampType instance from its unit and timezone
///
/// \param[in] unit the time unit of the timestamp
/// \param[in] timezone the timezone, passed as a string
/// \return the new type as a shared_ptr to DataType
ARROW_EXPORT
std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone);
900 | |
901 | /// \brief Create a 32-bit time type instance |
902 | /// |
903 | /// Unit can be either SECOND or MILLI |
904 | std::shared_ptr<DataType> ARROW_EXPORT time32(TimeUnit::type unit); |
905 | |
906 | /// \brief Create a 64-bit time type instance |
907 | /// |
908 | /// Unit can be either MICRO or NANO |
909 | std::shared_ptr<DataType> ARROW_EXPORT time64(TimeUnit::type unit); |
910 | |
911 | /// \brief Create a StructType instance |
912 | std::shared_ptr<DataType> ARROW_EXPORT |
913 | struct_(const std::vector<std::shared_ptr<Field>>& fields); |
914 | |
915 | /// \brief Create a UnionType instance |
916 | std::shared_ptr<DataType> ARROW_EXPORT |
917 | union_(const std::vector<std::shared_ptr<Field>>& child_fields, |
918 | const std::vector<uint8_t>& type_codes, UnionMode::type mode = UnionMode::SPARSE); |
919 | |
920 | /// \brief Create a UnionType instance |
921 | std::shared_ptr<DataType> ARROW_EXPORT |
922 | union_(const std::vector<std::shared_ptr<Array>>& children, |
923 | UnionMode::type mode = UnionMode::SPARSE); |
924 | |
925 | /// \brief Create a DictionaryType instance |
926 | std::shared_ptr<DataType> ARROW_EXPORT |
927 | dictionary(const std::shared_ptr<DataType>& index_type, |
928 | const std::shared_ptr<Array>& values, bool ordered = false); |
929 | |
930 | /// @} |
931 | |
932 | /// \defgroup schema-factories Factory functions for fields and schemas |
933 | /// |
934 | /// Factory functions for fields and schemas |
935 | /// @{ |
936 | |
937 | /// \brief Create a Field instance |
938 | /// |
939 | /// \param name the field name |
940 | /// \param type the field value type |
941 | /// \param nullable whether the values are nullable, default true |
942 | /// \param metadata any custom key-value metadata, default null |
943 | std::shared_ptr<Field> ARROW_EXPORT field( |
944 | const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true, |
945 | const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR); |
946 | |
/// \brief Create a Schema instance from a vector of fields passed by const reference
///
/// \param fields the schema's fields
/// \param metadata any custom key-value metadata, default null
/// \return shared_ptr to the new Schema
ARROW_EXPORT
std::shared_ptr<Schema> schema(
    const std::vector<std::shared_ptr<Field>>& fields,
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
956 | |
/// \brief Create a Schema instance from a vector of fields passed as an rvalue
///
/// \param fields the schema's fields (rvalue reference)
/// \param metadata any custom key-value metadata, default null
/// \return shared_ptr to the new Schema
ARROW_EXPORT
std::shared_ptr<Schema> schema(
    std::vector<std::shared_ptr<Field>>&& fields,
    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
966 | |
967 | /// @} |
968 | |
969 | } // namespace arrow |
970 | |
971 | #endif // ARROW_TYPE_H |
972 | |