1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #pragma once |
19 | |
20 | #include <memory> |
21 | |
22 | #include "arrow/array/builder_adaptive.h" // IWYU pragma: export |
23 | #include "arrow/array/builder_base.h" // IWYU pragma: export |
24 | |
25 | namespace arrow { |
26 | |
27 | // ---------------------------------------------------------------------- |
28 | // Dictionary builder |
29 | |
30 | namespace internal { |
31 | |
/// \brief Trait selecting the C++ type that represents one value of a given
/// value type
///
/// The primary template forwards the nested `c_type` member of ValueType.
/// Value types that do not expose a `c_type` member cannot use this primary
/// template and need their own specialization.
template <typename ValueType>
struct DictionaryScalar {
  using type = typename ValueType::c_type;
};
36 | |
37 | template <> |
38 | struct DictionaryScalar<BinaryType> { |
39 | using type = util::string_view; |
40 | }; |
41 | |
42 | template <> |
43 | struct DictionaryScalar<StringType> { |
44 | using type = util::string_view; |
45 | }; |
46 | |
47 | template <> |
48 | struct DictionaryScalar<FixedSizeBinaryType> { |
49 | using type = util::string_view; |
50 | }; |
51 | |
52 | } // namespace internal |
53 | |
54 | /// \brief Array builder for created encoded DictionaryArray from dense array |
55 | /// |
56 | /// Unlike other builders, dictionary builder does not completely reset the state |
57 | /// on Finish calls. The arrays built after the initial Finish call will reuse |
58 | /// the previously created encoding and build a delta dictionary when new terms |
59 | /// occur. |
60 | /// |
61 | /// data |
62 | template <typename T> |
63 | class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { |
64 | public: |
65 | using Scalar = typename internal::DictionaryScalar<T>::type; |
66 | |
67 | // WARNING: the type given below is the value type, not the DictionaryType. |
68 | // The DictionaryType is instantiated on the Finish() call. |
69 | DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); |
70 | |
71 | template <typename T1 = T> |
72 | explicit DictionaryBuilder( |
73 | typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool) |
74 | : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {} |
75 | |
76 | ~DictionaryBuilder() override; |
77 | |
78 | /// \brief Append a scalar value |
79 | Status Append(const Scalar& value); |
80 | |
81 | /// \brief Append a fixed-width string (only for FixedSizeBinaryType) |
82 | template <typename T1 = T> |
83 | Status Append(typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value, |
84 | const uint8_t*>::type value) { |
85 | return Append(util::string_view(reinterpret_cast<const char*>(value), byte_width_)); |
86 | } |
87 | |
88 | /// \brief Append a fixed-width string (only for FixedSizeBinaryType) |
89 | template <typename T1 = T> |
90 | Status Append(typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value, |
91 | const char*>::type value) { |
92 | return Append(util::string_view(value, byte_width_)); |
93 | } |
94 | |
95 | /// \brief Append a scalar null value |
96 | Status AppendNull(); |
97 | |
98 | /// \brief Append a whole dense array to the builder |
99 | Status AppendArray(const Array& array); |
100 | |
101 | void Reset() override; |
102 | Status Resize(int64_t capacity) override; |
103 | Status FinishInternal(std::shared_ptr<ArrayData>* out) override; |
104 | |
105 | /// is the dictionary builder in the delta building mode |
106 | bool is_building_delta() { return delta_offset_ > 0; } |
107 | |
108 | protected: |
109 | class MemoTableImpl; |
110 | std::unique_ptr<MemoTableImpl> memo_table_; |
111 | |
112 | int32_t delta_offset_; |
113 | // Only used for FixedSizeBinaryType |
114 | int32_t byte_width_; |
115 | |
116 | AdaptiveIntBuilder values_builder_; |
117 | }; |
118 | |
119 | template <> |
120 | class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder { |
121 | public: |
122 | DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); |
123 | explicit DictionaryBuilder(MemoryPool* pool); |
124 | |
125 | /// \brief Append a scalar null value |
126 | Status AppendNull(); |
127 | |
128 | /// \brief Append a whole dense array to the builder |
129 | Status AppendArray(const Array& array); |
130 | |
131 | Status Resize(int64_t capacity) override; |
132 | Status FinishInternal(std::shared_ptr<ArrayData>* out) override; |
133 | |
134 | protected: |
135 | AdaptiveIntBuilder values_builder_; |
136 | }; |
137 | |
138 | class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> { |
139 | public: |
140 | using DictionaryBuilder::Append; |
141 | using DictionaryBuilder::DictionaryBuilder; |
142 | |
143 | Status Append(const uint8_t* value, int32_t length) { |
144 | return Append(reinterpret_cast<const char*>(value), length); |
145 | } |
146 | |
147 | Status Append(const char* value, int32_t length) { |
148 | return Append(util::string_view(value, length)); |
149 | } |
150 | }; |
151 | |
152 | /// \brief Dictionary array builder with convenience methods for strings |
153 | class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> { |
154 | public: |
155 | using DictionaryBuilder::Append; |
156 | using DictionaryBuilder::DictionaryBuilder; |
157 | |
158 | Status Append(const uint8_t* value, int32_t length) { |
159 | return Append(reinterpret_cast<const char*>(value), length); |
160 | } |
161 | |
162 | Status Append(const char* value, int32_t length) { |
163 | return Append(util::string_view(value, length)); |
164 | } |
165 | }; |
166 | |
167 | } // namespace arrow |
168 | |