1#include "duckdb/function/cast/vector_cast_helpers.hpp"
2
3namespace duckdb {
4
5// ------- Helper functions for splitting string nested types -------
6static bool IsNull(const char *buf, idx_t start_pos, Vector &child, idx_t row_idx) {
7 if (buf[start_pos] == 'N' && buf[start_pos + 1] == 'U' && buf[start_pos + 2] == 'L' && buf[start_pos + 3] == 'L') {
8 FlatVector::SetNull(vector&: child, idx: row_idx, is_null: true);
9 return true;
10 }
11 return false;
12}
13
14inline static void SkipWhitespace(const char *buf, idx_t &pos, idx_t len) {
15 while (pos < len && StringUtil::CharacterIsSpace(c: buf[pos])) {
16 pos++;
17 }
18}
19
20static bool SkipToCloseQuotes(idx_t &pos, const char *buf, idx_t &len) {
21 char quote = buf[pos];
22 pos++;
23
24 while (pos < len) {
25 if (buf[pos] == quote) {
26 return true;
27 }
28 pos++;
29 }
30 return false;
31}
32
33static bool SkipToClose(idx_t &idx, const char *buf, idx_t &len, idx_t &lvl, char close_bracket) {
34 idx++;
35
36 while (idx < len) {
37 if (buf[idx] == '"' || buf[idx] == '\'') {
38 if (!SkipToCloseQuotes(pos&: idx, buf, len)) {
39 return false;
40 }
41 } else if (buf[idx] == '{') {
42 if (!SkipToClose(idx, buf, len, lvl, close_bracket: '}')) {
43 return false;
44 }
45 } else if (buf[idx] == '[') {
46 if (!SkipToClose(idx, buf, len, lvl, close_bracket: ']')) {
47 return false;
48 }
49 lvl++;
50 } else if (buf[idx] == close_bracket) {
51 if (close_bracket == ']') {
52 lvl--;
53 }
54 return true;
55 }
56 idx++;
57 }
58 return false;
59}
60
61static idx_t StringTrim(const char *buf, idx_t &start_pos, idx_t pos) {
62 idx_t trailing_whitespace = 0;
63 while (StringUtil::CharacterIsSpace(c: buf[pos - trailing_whitespace - 1])) {
64 trailing_whitespace++;
65 }
66 if ((buf[start_pos] == '"' && buf[pos - trailing_whitespace - 1] == '"') ||
67 (buf[start_pos] == '\'' && buf[pos - trailing_whitespace - 1] == '\'')) {
68 start_pos++;
69 trailing_whitespace++;
70 }
71 return (pos - trailing_whitespace);
72}
73
74struct CountPartOperation {
75 idx_t count = 0;
76
77 bool HandleKey(const char *buf, idx_t start_pos, idx_t pos) {
78 count++;
79 return true;
80 }
81 void HandleValue(const char *buf, idx_t start_pos, idx_t pos) {
82 count++;
83 }
84};
85
86// ------- LIST SPLIT -------
87struct SplitStringListOperation {
88 SplitStringListOperation(string_t *child_data, idx_t &child_start, Vector &child)
89 : child_data(child_data), child_start(child_start), child(child) {
90 }
91
92 string_t *child_data;
93 idx_t &child_start;
94 Vector &child;
95
96 void HandleValue(const char *buf, idx_t start_pos, idx_t pos) {
97 if ((pos - start_pos) == 4 && IsNull(buf, start_pos, child, row_idx: child_start)) {
98 child_start++;
99 return;
100 }
101 child_data[child_start] = StringVector::AddString(vector&: child, data: buf + start_pos, len: pos - start_pos);
102 child_start++;
103 }
104};
105
106template <class OP>
107static bool SplitStringListInternal(const string_t &input, OP &state) {
108 const char *buf = input.GetData();
109 idx_t len = input.GetSize();
110 idx_t lvl = 1;
111 idx_t pos = 0;
112
113 SkipWhitespace(buf, pos, len);
114 if (pos == len || buf[pos] != '[') {
115 return false;
116 }
117
118 SkipWhitespace(buf, pos&: ++pos, len);
119 idx_t start_pos = pos;
120 while (pos < len) {
121 if (buf[pos] == '[') {
122 if (!SkipToClose(idx&: pos, buf, len, lvl&: ++lvl, close_bracket: ']')) {
123 return false;
124 }
125 } else if ((buf[pos] == '"' || buf[pos] == '\'') && pos == start_pos) {
126 SkipToCloseQuotes(pos, buf, len);
127 } else if (buf[pos] == '{') {
128 idx_t struct_lvl = 0;
129 SkipToClose(idx&: pos, buf, len, lvl&: struct_lvl, close_bracket: '}');
130 } else if (buf[pos] == ',' || buf[pos] == ']') {
131 idx_t trailing_whitespace = 0;
132 while (StringUtil::CharacterIsSpace(c: buf[pos - trailing_whitespace - 1])) {
133 trailing_whitespace++;
134 }
135 if (!(buf[pos] == ']' && start_pos == pos)) {
136 state.HandleValue(buf, start_pos, pos - trailing_whitespace);
137 } // else the list is empty
138 if (buf[pos] == ']') {
139 lvl--;
140 break;
141 }
142 SkipWhitespace(buf, pos&: ++pos, len);
143 start_pos = pos;
144 continue;
145 }
146 pos++;
147 }
148 SkipWhitespace(buf, pos&: ++pos, len);
149 return (pos == len && lvl == 0);
150}
151
152bool VectorStringToList::SplitStringList(const string_t &input, string_t *child_data, idx_t &child_start,
153 Vector &child) {
154 SplitStringListOperation state(child_data, child_start, child);
155 return SplitStringListInternal<SplitStringListOperation>(input, state);
156}
157
158idx_t VectorStringToList::CountPartsList(const string_t &input) {
159 CountPartOperation state;
160 SplitStringListInternal<CountPartOperation>(input, state);
161 return state.count;
162}
163
164// ------- MAP SPLIT -------
165struct SplitStringMapOperation {
166 SplitStringMapOperation(string_t *child_key_data, string_t *child_val_data, idx_t &child_start, Vector &varchar_key,
167 Vector &varchar_val)
168 : child_key_data(child_key_data), child_val_data(child_val_data), child_start(child_start),
169 varchar_key(varchar_key), varchar_val(varchar_val) {
170 }
171
172 string_t *child_key_data;
173 string_t *child_val_data;
174 idx_t &child_start;
175 Vector &varchar_key;
176 Vector &varchar_val;
177
178 bool HandleKey(const char *buf, idx_t start_pos, idx_t pos) {
179 if ((pos - start_pos) == 4 && IsNull(buf, start_pos, child&: varchar_key, row_idx: child_start)) {
180 FlatVector::SetNull(vector&: varchar_val, idx: child_start, is_null: true);
181 child_start++;
182 return false;
183 }
184 child_key_data[child_start] = StringVector::AddString(vector&: varchar_key, data: buf + start_pos, len: pos - start_pos);
185 return true;
186 }
187
188 void HandleValue(const char *buf, idx_t start_pos, idx_t pos) {
189 if ((pos - start_pos) == 4 && IsNull(buf, start_pos, child&: varchar_val, row_idx: child_start)) {
190 child_start++;
191 return;
192 }
193 child_val_data[child_start] = StringVector::AddString(vector&: varchar_val, data: buf + start_pos, len: pos - start_pos);
194 child_start++;
195 }
196};
197
198template <class OP>
199static bool FindKeyOrValueMap(const char *buf, idx_t len, idx_t &pos, OP &state, bool key) {
200 auto start_pos = pos;
201 idx_t lvl = 0;
202 while (pos < len) {
203 if (buf[pos] == '"' || buf[pos] == '\'') {
204 SkipToCloseQuotes(pos, buf, len);
205 } else if (buf[pos] == '{') {
206 SkipToClose(idx&: pos, buf, len, lvl, close_bracket: '}');
207 } else if (buf[pos] == '[') {
208 SkipToClose(idx&: pos, buf, len, lvl, close_bracket: ']');
209 } else if (key && buf[pos] == '=') {
210 idx_t end_pos = StringTrim(buf, start_pos, pos);
211 return state.HandleKey(buf, start_pos, end_pos); // put string in KEY_child_vector
212 } else if (!key && (buf[pos] == ',' || buf[pos] == '}')) {
213 idx_t end_pos = StringTrim(buf, start_pos, pos);
214 state.HandleValue(buf, start_pos, end_pos); // put string in VALUE_child_vector
215 return true;
216 }
217 pos++;
218 }
219 return false;
220}
221
222template <class OP>
223static bool SplitStringMapInternal(const string_t &input, OP &state) {
224 const char *buf = input.GetData();
225 idx_t len = input.GetSize();
226 idx_t pos = 0;
227
228 SkipWhitespace(buf, pos, len);
229 if (pos == len || buf[pos] != '{') {
230 return false;
231 }
232 SkipWhitespace(buf, pos&: ++pos, len);
233 if (pos == len) {
234 return false;
235 }
236 if (buf[pos] == '}') {
237 SkipWhitespace(buf, pos&: ++pos, len);
238 return (pos == len);
239 }
240 while (pos < len) {
241 if (!FindKeyOrValueMap(buf, len, pos, state, true)) {
242 return false;
243 }
244 SkipWhitespace(buf, pos&: ++pos, len);
245 if (!FindKeyOrValueMap(buf, len, pos, state, false)) {
246 return false;
247 }
248 SkipWhitespace(buf, pos&: ++pos, len);
249 }
250 return true;
251}
252
253bool VectorStringToMap::SplitStringMap(const string_t &input, string_t *child_key_data, string_t *child_val_data,
254 idx_t &child_start, Vector &varchar_key, Vector &varchar_val) {
255 SplitStringMapOperation state(child_key_data, child_val_data, child_start, varchar_key, varchar_val);
256 return SplitStringMapInternal<SplitStringMapOperation>(input, state);
257}
258
259idx_t VectorStringToMap::CountPartsMap(const string_t &input) {
260 CountPartOperation state;
261 SplitStringMapInternal<CountPartOperation>(input, state);
262 return state.count;
263}
264
265// ------- STRUCT SPLIT -------
266static bool FindKeyStruct(const char *buf, idx_t len, idx_t &pos) {
267 while (pos < len) {
268 if (buf[pos] == ':') {
269 return true;
270 }
271 pos++;
272 }
273 return false;
274}
275
276static bool FindValueStruct(const char *buf, idx_t len, idx_t &pos, Vector &varchar_child, idx_t &row_idx,
277 ValidityMask *child_mask) {
278 auto start_pos = pos;
279 idx_t lvl = 0;
280 while (pos < len) {
281 if (buf[pos] == '"' || buf[pos] == '\'') {
282 SkipToCloseQuotes(pos, buf, len);
283 } else if (buf[pos] == '{') {
284 SkipToClose(idx&: pos, buf, len, lvl, close_bracket: '}');
285 } else if (buf[pos] == '[') {
286 SkipToClose(idx&: pos, buf, len, lvl, close_bracket: ']');
287 } else if (buf[pos] == ',' || buf[pos] == '}') {
288 idx_t end_pos = StringTrim(buf, start_pos, pos);
289 if ((end_pos - start_pos) == 4 && IsNull(buf, start_pos, child&: varchar_child, row_idx)) {
290 return true;
291 }
292 FlatVector::GetData<string_t>(vector&: varchar_child)[row_idx] =
293 StringVector::AddString(vector&: varchar_child, data: buf + start_pos, len: end_pos - start_pos);
294 child_mask->SetValid(row_idx); // any child not set to valid will remain invalid
295 return true;
296 }
297 pos++;
298 }
299 return false;
300}
301
302bool VectorStringToStruct::SplitStruct(const string_t &input, vector<unique_ptr<Vector>> &varchar_vectors,
303 idx_t &row_idx, string_map_t<idx_t> &child_names,
304 vector<ValidityMask *> &child_masks) {
305 const char *buf = input.GetData();
306 idx_t len = input.GetSize();
307 idx_t pos = 0;
308 idx_t child_idx;
309
310 SkipWhitespace(buf, pos, len);
311 if (pos == len || buf[pos] != '{') {
312 return false;
313 }
314 SkipWhitespace(buf, pos&: ++pos, len);
315 if (buf[pos] == '}') {
316 pos++;
317 } else {
318 while (pos < len) {
319 auto key_start = pos;
320 if (!FindKeyStruct(buf, len, pos)) {
321 return false;
322 }
323 auto key_end = StringTrim(buf, start_pos&: key_start, pos);
324 string_t found_key(buf + key_start, key_end - key_start);
325
326 auto it = child_names.find(x: found_key);
327 if (it == child_names.end()) {
328 return false; // false key
329 }
330 child_idx = it->second;
331 SkipWhitespace(buf, pos&: ++pos, len);
332 if (!FindValueStruct(buf, len, pos, varchar_child&: *varchar_vectors[child_idx], row_idx, child_mask: child_masks[child_idx])) {
333 return false;
334 }
335 SkipWhitespace(buf, pos&: ++pos, len);
336 }
337 }
338 SkipWhitespace(buf, pos, len);
339 return (pos == len);
340}
341
342} // namespace duckdb
343