1#pragma once
2
3#include <Columns/ColumnString.h>
4#include <Common/memcpySmall.h>
5
6
7namespace DB
8{
9
10/** URL processing functions. See implementation in separate .cpp files.
 * These functions do not strictly follow the RFC; instead, they are simplified as much as possible for performance reasons.
12 *
 * Functions for extracting parts of a URL.
 * If the URL does not contain the requested part, an empty string is returned.
15 *
16 * domain
17 * domainWithoutWWW
18 * topLevelDomain
19 * protocol
20 * path
21 * queryString
22 * fragment
23 * queryStringAndFragment
24 *
 * Functions that remove parts of a URL.
 * If the URL does not contain the part to be removed, it is returned unchanged.
27 *
28 * cutWWW
29 * cutFragment
30 * cutQueryString
31 * cutQueryStringAndFragment
32 *
33 * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
34 * If there are many parameters with same name - return value of first one. Value is not %-decoded.
35 *
36 * extractURLParameter(URL, name)
37 *
38 * Extract all parameters from URL in form of array of strings name=value.
39 * extractURLParameters(URL)
40 *
41 * Extract names of all parameters from URL in form of array of strings.
42 * extractURLParameterNames(URL)
43 *
44 * Remove specified parameter from URL.
45 * cutURLParameter(URL, name)
46 *
47 * Get array of URL 'hierarchy' as in Yandex.Metrica tree-like reports. See docs.
48 * URLHierarchy(URL)
49 */
50
/// Error codes referenced by the templates in this header.
/// They are only declared here (`extern`); the definitions are provided elsewhere.
/// ILLEGAL_COLUMN is thrown by the `vector_fixed` stubs, which reject FixedString columns.
51namespace ErrorCodes
52{
53 extern const int ILLEGAL_COLUMN;
54}
55
/// Position inside a (read-only) character buffer.
/// The implementations below pass a `Pos` to Extractor::execute as the out-parameter that receives the beginning of the matched part.
56using Pos = const char *;
57
58
59/** Select part of string using the Extractor.
60 */
61template <typename Extractor>
62struct ExtractSubstringImpl
63{
64 static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
65 ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
66 {
67 size_t size = offsets.size();
68 res_offsets.resize(size);
69 res_data.reserve(size * Extractor::getReserveLengthForElement());
70
71 size_t prev_offset = 0;
72 size_t res_offset = 0;
73
74 /// Matched part.
75 Pos start;
76 size_t length;
77
78 for (size_t i = 0; i < size; ++i)
79 {
80 Extractor::execute(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1, start, length);
81
82 res_data.resize(res_data.size() + length + 1);
83 memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], start, length);
84 res_offset += length + 1;
85 res_data[res_offset - 1] = 0;
86
87 res_offsets[i] = res_offset;
88 prev_offset = offsets[i];
89 }
90 }
91
92 static void constant(const std::string & data,
93 std::string & res_data)
94 {
95 Pos start;
96 size_t length;
97 Extractor::execute(data.data(), data.size(), start, length);
98 res_data.assign(start, length);
99 }
100
101 static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
102 {
103 throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
104 }
105};
106
107
108/** Delete part of string using the Extractor.
109 */
110template <typename Extractor>
111struct CutSubstringImpl
112{
113 static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
114 ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
115 {
116 res_data.reserve(data.size());
117 size_t size = offsets.size();
118 res_offsets.resize(size);
119
120 size_t prev_offset = 0;
121 size_t res_offset = 0;
122
123 /// Matched part.
124 Pos start;
125 size_t length;
126
127 for (size_t i = 0; i < size; ++i)
128 {
129 const char * current = reinterpret_cast<const char *>(&data[prev_offset]);
130 Extractor::execute(current, offsets[i] - prev_offset - 1, start, length);
131 size_t start_index = start - reinterpret_cast<const char *>(data.data());
132
133 res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
134 memcpySmallAllowReadWriteOverflow15(
135 &res_data[res_offset], current, start - current);
136 memcpySmallAllowReadWriteOverflow15(
137 &res_data[res_offset + start - current], start + length, offsets[i] - start_index - length);
138 res_offset += offsets[i] - prev_offset - length;
139
140 res_offsets[i] = res_offset;
141 prev_offset = offsets[i];
142 }
143 }
144
145 static void constant(const std::string & data,
146 std::string & res_data)
147 {
148 Pos start;
149 size_t length;
150 Extractor::execute(data.data(), data.size(), start, length);
151 res_data.reserve(data.size() - length);
152 res_data.append(data.data(), start);
153 res_data.append(start + length, data.data() + data.size());
154 }
155
156 static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
157 {
158 throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN);
159 }
160};
161
162}
163