1 | #pragma once |
2 | |
3 | #include <Columns/ColumnString.h> |
4 | #include <Common/memcpySmall.h> |
5 | |
6 | |
7 | namespace DB |
8 | { |
9 | |
10 | /** URL processing functions. See implementation in separate .cpp files. |
11 | * These functions do not strictly follow the RFC; instead, they are simplified as much as possible for performance reasons. |
12 | * |
13 | * Functions for extracting parts of a URL. |
14 | * If the URL has no such part, an empty string is returned. |
15 | * |
16 | * domain |
17 | * domainWithoutWWW |
18 | * topLevelDomain |
19 | * protocol |
20 | * path |
21 | * queryString |
22 | * fragment |
23 | * queryStringAndFragment |
24 | * |
25 | * Functions for removing parts of a URL. |
26 | * If the URL has no such part, it is returned unchanged. |
27 | * |
28 | * cutWWW |
29 | * cutFragment |
30 | * cutQueryString |
31 | * cutQueryStringAndFragment |
32 | * |
33 | * Extract the value of a parameter from the query string or from the fragment identifier. Return an empty string if the URL has no such parameter. |
34 | * If there are several parameters with the same name, return the value of the first one. The value is not %-decoded. |
35 | * |
36 | * extractURLParameter(URL, name) |
37 | * |
38 | * Extract all parameters from URL in form of array of strings name=value. |
39 | * extractURLParameters(URL) |
40 | * |
41 | * Extract names of all parameters from URL in form of array of strings. |
42 | * extractURLParameterNames(URL) |
43 | * |
44 | * Remove specified parameter from URL. |
45 | * cutURLParameter(URL, name) |
46 | * |
47 | * Get array of URL 'hierarchy' as in Yandex.Metrica tree-like reports. See docs. |
48 | * URLHierarchy(URL) |
49 | */ |
50 | |
51 | namespace ErrorCodes |
52 | { |
53 | extern const int ILLEGAL_COLUMN; |
54 | } |
55 | |
56 | using Pos = const char *; |
57 | |
58 | |
59 | /** Select part of string using the Extractor. |
60 | */ |
61 | template <typename Extractor> |
62 | struct |
63 | { |
64 | static void (const ColumnString::Chars & data, const ColumnString::Offsets & offsets, |
65 | ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) |
66 | { |
67 | size_t size = offsets.size(); |
68 | res_offsets.resize(size); |
69 | res_data.reserve(size * Extractor::getReserveLengthForElement()); |
70 | |
71 | size_t prev_offset = 0; |
72 | size_t res_offset = 0; |
73 | |
74 | /// Matched part. |
75 | Pos start; |
76 | size_t length; |
77 | |
78 | for (size_t i = 0; i < size; ++i) |
79 | { |
80 | Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length); |
81 | |
82 | res_data.resize(res_data.size() + length + 1); |
83 | memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length); |
84 | res_offset += length + 1; |
85 | res_data[res_offset - 1] = 0; |
86 | |
87 | res_offsets[i] = res_offset; |
88 | prev_offset = offsets[i]; |
89 | } |
90 | } |
91 | |
92 | static void (const std::string & data, |
93 | std::string & res_data) |
94 | { |
95 | Pos start; |
96 | size_t length; |
97 | Extractor::execute(data.data(), data.size(), start, length); |
98 | res_data.assign(start, length); |
99 | } |
100 | |
101 | static void (const ColumnString::Chars &, size_t, ColumnString::Chars &) |
102 | { |
103 | throw Exception("Column of type FixedString is not supported by URL functions" , ErrorCodes::ILLEGAL_COLUMN); |
104 | } |
105 | }; |
106 | |
107 | |
108 | /** Delete part of string using the Extractor. |
109 | */ |
110 | template <typename Extractor> |
111 | struct CutSubstringImpl |
112 | { |
113 | static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, |
114 | ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) |
115 | { |
116 | res_data.reserve(data.size()); |
117 | size_t size = offsets.size(); |
118 | res_offsets.resize(size); |
119 | |
120 | size_t prev_offset = 0; |
121 | size_t res_offset = 0; |
122 | |
123 | /// Matched part. |
124 | Pos start; |
125 | size_t length; |
126 | |
127 | for (size_t i = 0; i < size; ++i) |
128 | { |
129 | const char * current = reinterpret_cast<const char *>(&data[prev_offset]); |
130 | Extractor::execute(current, offsets[i] - prev_offset - 1, start, length); |
131 | size_t start_index = start - reinterpret_cast<const char *>(data.data()); |
132 | |
133 | res_data.resize(res_data.size() + offsets[i] - prev_offset - length); |
134 | memcpySmallAllowReadWriteOverflow15( |
135 | &res_data[res_offset], current, start - current); |
136 | memcpySmallAllowReadWriteOverflow15( |
137 | &res_data[res_offset + start - current], start + length, offsets[i] - start_index - length); |
138 | res_offset += offsets[i] - prev_offset - length; |
139 | |
140 | res_offsets[i] = res_offset; |
141 | prev_offset = offsets[i]; |
142 | } |
143 | } |
144 | |
145 | static void constant(const std::string & data, |
146 | std::string & res_data) |
147 | { |
148 | Pos start; |
149 | size_t length; |
150 | Extractor::execute(data.data(), data.size(), start, length); |
151 | res_data.reserve(data.size() - length); |
152 | res_data.append(data.data(), start); |
153 | res_data.append(start + length, data.data() + data.size()); |
154 | } |
155 | |
156 | static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) |
157 | { |
158 | throw Exception("Column of type FixedString is not supported by URL functions" , ErrorCodes::ILLEGAL_COLUMN); |
159 | } |
160 | }; |
161 | |
162 | } |
163 | |