1 | #include <Functions/FunctionFactory.h> |
2 | #include <Functions/FunctionsStringArray.h> |
3 | |
4 | namespace DB |
5 | { |
6 | |
7 | class URLHierarchyImpl |
8 | { |
9 | private: |
10 | Pos begin; |
11 | Pos pos; |
12 | Pos end; |
13 | |
14 | public: |
15 | static constexpr auto name = "URLHierarchy" ; |
16 | static String getName() { return name; } |
17 | |
18 | static size_t getNumberOfArguments() { return 1; } |
19 | |
20 | static void checkArguments(const DataTypes & arguments) |
21 | { |
22 | if (!isString(arguments[0])) |
23 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
24 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
25 | } |
26 | |
27 | void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} |
28 | |
29 | /// Returns the position of the argument that is the column of rows |
30 | size_t getStringsArgumentPosition() |
31 | { |
32 | return 0; |
33 | } |
34 | |
35 | /// Called for each next string. |
36 | void set(Pos pos_, Pos end_) |
37 | { |
38 | begin = pos = pos_; |
39 | end = end_; |
40 | } |
41 | |
42 | /// Get the next token, if any, or return false. |
43 | bool get(Pos & token_begin, Pos & token_end) |
44 | { |
45 | /// Code from URLParser. |
46 | if (pos == end) |
47 | return false; |
48 | |
49 | if (pos == begin) |
50 | { |
51 | /// Let's parse everything that goes before the path |
52 | |
53 | /// Assume that the protocol has already been changed to lowercase. |
54 | while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9'))) |
55 | ++pos; |
56 | |
57 | /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes. |
58 | * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there |
59 | * For the rest, simply return the full URL as the only element of the hierarchy. |
60 | */ |
61 | if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end)) |
62 | { |
63 | pos = end; |
64 | token_begin = begin; |
65 | token_end = end; |
66 | return true; |
67 | } |
68 | |
69 | /// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#` |
70 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
71 | ++pos; |
72 | |
73 | if (pos != end) |
74 | ++pos; |
75 | |
76 | token_begin = begin; |
77 | token_end = pos; |
78 | |
79 | return true; |
80 | } |
81 | |
82 | /// We go to the next `/` or `?` or `#`, skipping all those at the beginning. |
83 | while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#')) |
84 | ++pos; |
85 | if (pos == end) |
86 | return false; |
87 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
88 | ++pos; |
89 | |
90 | if (pos != end) |
91 | ++pos; |
92 | |
93 | token_begin = begin; |
94 | token_end = pos; |
95 | |
96 | return true; |
97 | } |
98 | }; |
99 | |
100 | |
101 | struct NameURLHierarchy { static constexpr auto name = "URLHierarchy" ; }; |
102 | using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>; |
103 | |
104 | void registerFunctionURLHierarchy(FunctionFactory & factory) |
105 | { |
106 | factory.registerFunction<FunctionURLHierarchy>(); |
107 | } |
108 | |
109 | } |
110 | |