1 | #include <Functions/FunctionFactory.h> |
2 | #include <Functions/FunctionsStringArray.h> |
3 | |
4 | namespace DB |
5 | { |
6 | |
7 | class URLPathHierarchyImpl |
8 | { |
9 | private: |
10 | Pos begin; |
11 | Pos pos; |
12 | Pos end; |
13 | Pos start; |
14 | |
15 | public: |
16 | static constexpr auto name = "URLPathHierarchy" ; |
17 | static String getName() { return name; } |
18 | |
19 | static size_t getNumberOfArguments() { return 1; } |
20 | |
21 | static void checkArguments(const DataTypes & arguments) |
22 | { |
23 | if (!isString(arguments[0])) |
24 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
25 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
26 | } |
27 | |
28 | void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} |
29 | |
30 | /// Returns the position of the argument that is the column of rows |
31 | size_t getStringsArgumentPosition() |
32 | { |
33 | return 0; |
34 | } |
35 | |
36 | /// Called for each next string. |
37 | void set(Pos pos_, Pos end_) |
38 | { |
39 | begin = pos = pos_; |
40 | start = begin; |
41 | end = end_; |
42 | } |
43 | |
44 | /// Get the next token, if any, or return false. |
45 | bool get(Pos & token_begin, Pos & token_end) |
46 | { |
47 | /// Code from URLParser. |
48 | |
49 | if (pos == end) |
50 | return false; |
51 | |
52 | if (pos == begin) |
53 | { |
54 | /// Let's parse everything that goes before the path |
55 | |
56 | /// Assume that the protocol has already been changed to lowercase. |
57 | while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9'))) |
58 | ++pos; |
59 | |
60 | /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes. |
61 | * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there. |
62 | * For the rest, just return an empty array. |
63 | */ |
64 | if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end)) |
65 | { |
66 | pos = end; |
67 | return false; |
68 | } |
69 | |
70 | /// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or `?` or `#` |
71 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
72 | ++pos; |
73 | |
74 | start = pos; |
75 | |
76 | if (pos != end) |
77 | ++pos; |
78 | } |
79 | |
80 | /// We go to the next `/` or `?` or `#`, skipping all those at the beginning. |
81 | while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#')) |
82 | ++pos; |
83 | if (pos == end) |
84 | return false; |
85 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
86 | ++pos; |
87 | |
88 | if (pos != end) |
89 | ++pos; |
90 | |
91 | token_begin = start; |
92 | token_end = pos; |
93 | |
94 | return true; |
95 | } |
96 | }; |
97 | |
98 | |
/// Tag type that exposes the function name as a compile-time string constant.
struct NameURLPathHierarchy
{
    static constexpr auto name = "URLPathHierarchy";
};
100 | using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>; |
101 | |
102 | void registerFunctionURLPathHierarchy(FunctionFactory & factory) |
103 | { |
104 | factory.registerFunction<FunctionURLPathHierarchy>(); |
105 | } |
106 | |
107 | } |
108 | |