1 | #pragma once |
2 | |
3 | #include <DataTypes/DataTypesNumber.h> |
4 | #include <DataTypes/DataTypeString.h> |
5 | #include <DataTypes/DataTypeFixedString.h> |
6 | #include <Columns/ColumnString.h> |
7 | #include <Common/Volnitsky.h> |
8 | #include <Functions/IFunctionImpl.h> |
9 | #include <Functions/FunctionHelpers.h> |
10 | #include <IO/ReadBufferFromMemory.h> |
11 | #include <IO/ReadHelpers.h> |
12 | |
13 | |
14 | /** Functions for retrieving "visit parameters". |
15 | * Visit parameters in Yandex.Metrika are a special kind of JSONs. |
16 | * These functions are applicable to almost any JSONs. |
17 | * Implemented via templates from FunctionsStringSearch.h. |
18 | * |
19 | * Check if there is a parameter |
20 | * visitParamHas |
21 | * |
22 | * Retrieve the numeric value of the parameter |
23 | * visitParamExtractUInt |
24 | * visitParamExtractInt |
25 | * visitParamExtractFloat |
26 | * visitParamExtractBool |
27 | * |
28 | * Retrieve the string value of the parameter |
29 | * visitParamExtractString - unescape value |
30 | * visitParamExtractRaw |
31 | */ |
32 | |
33 | namespace DB |
34 | { |
35 | |
36 | namespace ErrorCodes |
37 | { |
38 | extern const int ILLEGAL_COLUMN; |
39 | } |
40 | |
41 | |
42 | template <typename NumericType> |
43 | struct |
44 | { |
45 | using = NumericType; |
46 | |
47 | static ResultType (const UInt8 * begin, const UInt8 * end) |
48 | { |
49 | ReadBufferFromMemory in(begin, end - begin); |
50 | |
51 | /// Read numbers in double quotes |
52 | if (!in.eof() && *in.position() == '"') |
53 | ++in.position(); |
54 | |
55 | ResultType x = 0; |
56 | if (!in.eof()) |
57 | { |
58 | if constexpr (std::is_floating_point_v<NumericType>) |
59 | tryReadFloatText(x, in); |
60 | else |
61 | tryReadIntText(x, in); |
62 | } |
63 | return x; |
64 | } |
65 | }; |
66 | |
67 | |
68 | /** Searches for occurrences of a field in the visit parameter and calls ParamExtractor |
69 | * for each occurrence of the field, passing it a pointer to the part of the string, |
70 | * where the occurrence of the field value begins. |
71 | * ParamExtractor must parse and return the value of the desired type. |
72 | * |
73 | * If a field was not found or an incorrect value is associated with the field, |
74 | * then the default value used - 0. |
75 | */ |
76 | template <typename ParamExtractor> |
77 | struct |
78 | { |
79 | using = typename ParamExtractor::ResultType; |
80 | |
81 | /// It is assumed that `res` is the correct size and initialized with zeros. |
82 | static void (const ColumnString::Chars & data, const ColumnString::Offsets & offsets, |
83 | std::string needle, |
84 | PaddedPODArray<ResultType> & res) |
85 | { |
86 | /// We are looking for a parameter simply as a substring of the form "name" |
87 | needle = "\"" + needle + "\":" ; |
88 | |
89 | const UInt8 * begin = data.data(); |
90 | const UInt8 * pos = begin; |
91 | const UInt8 * end = pos + data.size(); |
92 | |
93 | /// The current index in the string array. |
94 | size_t i = 0; |
95 | |
96 | Volnitsky searcher(needle.data(), needle.size(), end - pos); |
97 | |
98 | /// We will search for the next occurrence in all strings at once. |
99 | while (pos < end && end != (pos = searcher.search(pos, end - pos))) |
100 | { |
101 | /// Let's determine which index it belongs to. |
102 | while (begin + offsets[i] <= pos) |
103 | { |
104 | res[i] = 0; |
105 | ++i; |
106 | } |
107 | |
108 | /// We check that the entry does not pass through the boundaries of strings. |
109 | if (pos + needle.size() < begin + offsets[i]) |
110 | res[i] = ParamExtractor::extract(pos + needle.size(), begin + offsets[i] - 1); /// don't include terminating zero |
111 | else |
112 | res[i] = 0; |
113 | |
114 | pos = begin + offsets[i]; |
115 | ++i; |
116 | } |
117 | |
118 | if (res.size() > i) |
119 | memset(&res[i], 0, (res.size() - i) * sizeof(res[0])); |
120 | } |
121 | |
122 | static void (const std::string & data, std::string needle, ResultType & res) |
123 | { |
124 | needle = "\"" + needle + "\":" ; |
125 | size_t pos = data.find(needle); |
126 | if (pos == std::string::npos) |
127 | res = 0; |
128 | else |
129 | res = ParamExtractor::extract( |
130 | reinterpret_cast<const UInt8 *>(data.data() + pos + needle.size()), |
131 | reinterpret_cast<const UInt8 *>(data.data() + data.size()) |
132 | ); |
133 | } |
134 | |
135 | template <typename... Args> static void (Args &&...) |
136 | { |
137 | throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support non-constant needle argument" , ErrorCodes::ILLEGAL_COLUMN); |
138 | } |
139 | |
140 | template <typename... Args> static void (Args &&...) |
141 | { |
142 | throw Exception("Functions 'visitParamHas' and 'visitParamExtract*' doesn't support non-constant needle argument" , ErrorCodes::ILLEGAL_COLUMN); |
143 | } |
144 | }; |
145 | |
146 | |
147 | /** For the case where the type of field to extract is a string. |
148 | */ |
149 | template <typename ParamExtractor> |
150 | struct |
151 | { |
152 | static void (const ColumnString::Chars & data, const ColumnString::Offsets & offsets, |
153 | std::string needle, |
154 | ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) |
155 | { |
156 | /// Constant 5 is taken from a function that performs a similar task FunctionsStringSearch.h::ExtractImpl |
157 | res_data.reserve(data.size() / 5); |
158 | res_offsets.resize(offsets.size()); |
159 | |
160 | /// We are looking for a parameter simply as a substring of the form "name" |
161 | needle = "\"" + needle + "\":" ; |
162 | |
163 | const UInt8 * begin = data.data(); |
164 | const UInt8 * pos = begin; |
165 | const UInt8 * end = pos + data.size(); |
166 | |
167 | /// The current index in the string array. |
168 | size_t i = 0; |
169 | |
170 | Volnitsky searcher(needle.data(), needle.size(), end - pos); |
171 | |
172 | /// We will search for the next occurrence in all strings at once. |
173 | while (pos < end && end != (pos = searcher.search(pos, end - pos))) |
174 | { |
175 | /// Determine which index it belongs to. |
176 | while (begin + offsets[i] <= pos) |
177 | { |
178 | res_data.push_back(0); |
179 | res_offsets[i] = res_data.size(); |
180 | ++i; |
181 | } |
182 | |
183 | /// We check that the entry does not pass through the boundaries of strings. |
184 | if (pos + needle.size() < begin + offsets[i]) |
185 | ParamExtractor::extract(pos + needle.size(), begin + offsets[i], res_data); |
186 | |
187 | pos = begin + offsets[i]; |
188 | |
189 | res_data.push_back(0); |
190 | res_offsets[i] = res_data.size(); |
191 | ++i; |
192 | } |
193 | |
194 | while (i < res_offsets.size()) |
195 | { |
196 | res_data.push_back(0); |
197 | res_offsets[i] = res_data.size(); |
198 | ++i; |
199 | } |
200 | } |
201 | }; |
202 | |
203 | } |
204 | |