| 1 | #include <Functions/FunctionFactory.h> |
| 2 | #include <Functions/FunctionsStringArray.h> |
| 3 | |
| 4 | namespace DB |
| 5 | { |
| 6 | |
| 7 | class URLHierarchyImpl |
| 8 | { |
| 9 | private: |
| 10 | Pos begin; |
| 11 | Pos pos; |
| 12 | Pos end; |
| 13 | |
| 14 | public: |
| 15 | static constexpr auto name = "URLHierarchy" ; |
| 16 | static String getName() { return name; } |
| 17 | |
| 18 | static size_t getNumberOfArguments() { return 1; } |
| 19 | |
| 20 | static void checkArguments(const DataTypes & arguments) |
| 21 | { |
| 22 | if (!isString(arguments[0])) |
| 23 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
| 24 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 25 | } |
| 26 | |
| 27 | void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} |
| 28 | |
| 29 | /// Returns the position of the argument that is the column of rows |
| 30 | size_t getStringsArgumentPosition() |
| 31 | { |
| 32 | return 0; |
| 33 | } |
| 34 | |
| 35 | /// Called for each next string. |
| 36 | void set(Pos pos_, Pos end_) |
| 37 | { |
| 38 | begin = pos = pos_; |
| 39 | end = end_; |
| 40 | } |
| 41 | |
| 42 | /// Get the next token, if any, or return false. |
| 43 | bool get(Pos & token_begin, Pos & token_end) |
| 44 | { |
| 45 | /// Code from URLParser. |
| 46 | if (pos == end) |
| 47 | return false; |
| 48 | |
| 49 | if (pos == begin) |
| 50 | { |
| 51 | /// Let's parse everything that goes before the path |
| 52 | |
| 53 | /// Assume that the protocol has already been changed to lowercase. |
| 54 | while (pos < end && ((*pos > 'a' && *pos < 'z') || (*pos > '0' && *pos < '9'))) |
| 55 | ++pos; |
| 56 | |
| 57 | /** We will calculate the hierarchy only for URLs in which there is a protocol, and after it there are two slashes. |
| 58 | * (http, file - fit, mailto, magnet - do not fit), and after two slashes still at least something is there |
| 59 | * For the rest, simply return the full URL as the only element of the hierarchy. |
| 60 | */ |
| 61 | if (pos == begin || pos == end || !(*pos++ == ':' && pos < end && *pos++ == '/' && pos < end && *pos++ == '/' && pos < end)) |
| 62 | { |
| 63 | pos = end; |
| 64 | token_begin = begin; |
| 65 | token_end = end; |
| 66 | return true; |
| 67 | } |
| 68 | |
| 69 | /// The domain for simplicity is everything that after the protocol and two slashes, until the next slash or `?` or `#` |
| 70 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
| 71 | ++pos; |
| 72 | |
| 73 | if (pos != end) |
| 74 | ++pos; |
| 75 | |
| 76 | token_begin = begin; |
| 77 | token_end = pos; |
| 78 | |
| 79 | return true; |
| 80 | } |
| 81 | |
| 82 | /// We go to the next `/` or `?` or `#`, skipping all those at the beginning. |
| 83 | while (pos < end && (*pos == '/' || *pos == '?' || *pos == '#')) |
| 84 | ++pos; |
| 85 | if (pos == end) |
| 86 | return false; |
| 87 | while (pos < end && !(*pos == '/' || *pos == '?' || *pos == '#')) |
| 88 | ++pos; |
| 89 | |
| 90 | if (pos != end) |
| 91 | ++pos; |
| 92 | |
| 93 | token_begin = begin; |
| 94 | token_end = pos; |
| 95 | |
| 96 | return true; |
| 97 | } |
| 98 | }; |
| 99 | |
| 100 | |
| 101 | struct NameURLHierarchy { static constexpr auto name = "URLHierarchy" ; }; |
| 102 | using FunctionURLHierarchy = FunctionTokens<URLHierarchyImpl>; |
| 103 | |
| 104 | void registerFunctionURLHierarchy(FunctionFactory & factory) |
| 105 | { |
| 106 | factory.registerFunction<FunctionURLHierarchy>(); |
| 107 | } |
| 108 | |
| 109 | } |
| 110 | |