| 1 | #include <Functions/FunctionFactory.h> |
| 2 | #include <Functions/FunctionsStringArray.h> |
| 3 | |
| 4 | namespace DB |
| 5 | { |
| 6 | |
| 7 | class URLPathHierarchyImpl |
| 8 | { |
| 9 | private: |
| 10 | Pos begin; |
| 11 | Pos pos; |
| 12 | Pos end; |
| 13 | Pos start; |
| 14 | |
| 15 | public: |
| 16 | static constexpr auto name = "URLPathHierarchy" ; |
| 17 | static String getName() { return name; } |
| 18 | |
| 19 | static size_t getNumberOfArguments() { return 1; } |
| 20 | |
| 21 | static void checkArguments(const DataTypes & arguments) |
| 22 | { |
| 23 | if (!isString(arguments[0])) |
| 24 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
| 25 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 26 | } |
| 27 | |
| 28 | void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} |
| 29 | |
| 30 | /// Returns the position of the argument that is the column of rows |
| 31 | size_t getStringsArgumentPosition() |
| 32 | { |
| 33 | return 0; |
| 34 | } |
| 35 | |
| 36 | /// Called for each next string. |
| 37 | void set(Pos pos_, Pos end_) |
| 38 | { |
| 39 | begin = pos = pos_; |
| 40 | start = begin; |
| 41 | end = end_; |
| 42 | } |
| 43 | |
| 44 | /// Get the next token, if any, or return false. |
| 45 | bool get(Pos & token_begin, Pos & token_end) |
| 46 | { |
| 47 | /// Code from URLParser. |
| 48 | |
| 49 | if (pos == end) |
| 50 | return false; |
| 51 | |
| 52 | if (pos == begin) |
| 53 | { |
| 54 | /// Let's parse everything that goes before the path |
| 55 | |
| 56 | /// Assume that the protocol has already been changed to lowercase. |
| 57 | while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9'))) |
| 58 | ++pos; |
| 59 | |
| 60 | /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes. |
| 61 | * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there. |
| 62 | * For the rest, just return an empty array. |
| 63 | */ |
| 64 | if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end)) |
| 65 | { |
| 66 | pos = end; |
| 67 | return false; |
| 68 | } |
| 69 | |
| 70 | /// The domain for simplicity is everything that after the protocol and the two slashes, until the next slash or `?` or `#` |
| 71 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
| 72 | ++pos; |
| 73 | |
| 74 | start = pos; |
| 75 | |
| 76 | if (pos != end) |
| 77 | ++pos; |
| 78 | } |
| 79 | |
| 80 | /// We go to the next `/` or `?` or `#`, skipping all those at the beginning. |
| 81 | while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#')) |
| 82 | ++pos; |
| 83 | if (pos == end) |
| 84 | return false; |
| 85 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
| 86 | ++pos; |
| 87 | |
| 88 | if (pos != end) |
| 89 | ++pos; |
| 90 | |
| 91 | token_begin = start; |
| 92 | token_end = pos; |
| 93 | |
| 94 | return true; |
| 95 | } |
| 96 | }; |
| 97 | |
| 98 | |
/// Tag type that carries the function name as a compile-time constant.
struct NameURLPathHierarchy
{
    static constexpr auto name = "URLPathHierarchy";
};
| 100 | using FunctionURLPathHierarchy = FunctionTokens<URLPathHierarchyImpl>; |
| 101 | |
| 102 | void registerFunctionURLPathHierarchy(FunctionFactory & factory) |
| 103 | { |
| 104 | factory.registerFunction<FunctionURLPathHierarchy>(); |
| 105 | } |
| 106 | |
| 107 | } |
| 108 | |