1#include <Functions/FunctionFactory.h>
2#include <Functions/FunctionsStringArray.h>
3
4namespace DB
5{
6
7class URLHierarchyImpl
8{
9private:
10 Pos begin;
11 Pos pos;
12 Pos end;
13
14public:
15 static constexpr auto name = "URLHierarchy";
16 static String getName() { return name; }
17
18 static size_t getNumberOfArguments() { return 1; }
19
20 static void checkArguments(const DataTypes & arguments)
21 {
22 if (!isString(arguments[0]))
23 throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
24 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
25 }
26
27 void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
28
29 /// Returns the position of the argument that is the column of rows
30 size_t getStringsArgumentPosition()
31 {
32 return 0;
33 }
34
35 /// Called for each next string.
36 void set(Pos pos_, Pos end_)
37 {
38 begin = pos = pos_;
39 end = end_;
40 }
41
42 /// Get the next token, if any, or return false.
43 bool get(Pos & token_begin, Pos & token_end)
44 {
45 /// Code from URLParser.
46 if (pos == end)
47 return false;
48
49 if (pos == begin)
50 {
51 /// Let's parse everything that goes before the path
52
53 /// Assume that the protocol has already been changed to lowercase.
54 while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9')))
55 ++pos;
56
57 /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
58 * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there
59 * For the rest, simply return the full URL as the only element of the hierarchy.
60 */
61 if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
62 {
63 pos = end;
64 token_begin = begin;
65 token_end = end;
66 return true;
67 }
68
69 /// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#`
70 while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
71 ++pos;
72
73 if (pos != end)
74 ++pos;
75
76 token_begin = begin;
77 token_end = pos;
78
79 return true;
80 }
81
82 /// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
83 while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
84 ++pos;
85 if (pos == end)
86 return false;
87 while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
88 ++pos;
89
90 if (pos != end)
91 ++pos;
92
93 token_begin = begin;
94 token_end = pos;
95
96 return true;
97 }
98};
99
100
101struct NameURLHierarchy { static constexpr auto name = "URLHierarchy"; };
102using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>;
103
104void registerFunctionURLHierarchy(FunctionFactory & factory)
105{
106 factory.registerFunction<FunctionURLHierarchy>();
107}
108
109}
110