1#include <Functions/FunctionFactory.h>
2#include <Functions/FunctionsStringArray.h>
3
4namespace DB
5{
6
7class URLPathHierarchyImpl
8{
9private:
10 Pos begin;
11 Pos pos;
12 Pos end;
13 Pos start;
14
15public:
16 static constexpr auto name = "URLPathHierarchy";
17 static String getName() { return name; }
18
19 static size_t getNumberOfArguments() { return 1; }
20
21 static void checkArguments(const DataTypes & arguments)
22 {
23 if (!isString(arguments[0]))
24 throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
25 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
26 }
27
28 void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
29
30 /// Returns the position of the argument that is the column of rows
31 size_t getStringsArgumentPosition()
32 {
33 return 0;
34 }
35
36 /// Called for each next string.
37 void set(Pos pos_, Pos end_)
38 {
39 begin = pos = pos_;
40 start = begin;
41 end = end_;
42 }
43
44 /// Get the next token, if any, or return false.
45 bool get(Pos & token_begin, Pos & token_end)
46 {
47 /// Code from URLParser.
48
49 if (pos == end)
50 return false;
51
52 if (pos == begin)
53 {
54 /// Let's parse everything that goes before the path
55
56 /// Assume that the protocol has already been changed to lowercase.
57 while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9')))
58 ++pos;
59
60 /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes.
61 * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there.
62 * For the rest, just return an empty array.
63 */
64 if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end))
65 {
66 pos = end;
67 return false;
68 }
69
70 /// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or `?` or `#`
71 while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
72 ++pos;
73
74 start = pos;
75
76 if (pos != end)
77 ++pos;
78 }
79
80 /// We go to the next `/` or `?` or `#`, skipping all those at the beginning.
81 while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#'))
82 ++pos;
83 if (pos == end)
84 return false;
85 while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#'))
86 ++pos;
87
88 if (pos != end)
89 ++pos;
90
91 token_begin = start;
92 token_end = pos;
93
94 return true;
95 }
96};
97
98
/// Tag type that exposes the function name "URLPathHierarchy" as a compile-time constant.
struct NameURLPathHierarchy
{
    static constexpr auto name = "URLPathHierarchy";
};
100using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>;
101
102void registerFunctionURLPathHierarchy(FunctionFactory & factory)
103{
104 factory.registerFunction<FunctionURLPathHierarchy>();
105}
106
107}
108