| 1 | #pragma once |
| 2 | |
| 3 | #include "protocol.h" |
| 4 | #include <common/find_symbols.h> |
| 5 | #include <cstring> |
| 6 | #include <Common/StringUtils/StringUtils.h> |
| 7 | |
| 8 | namespace DB |
| 9 | { |
| 10 | |
| 11 | namespace |
| 12 | { |
| 13 | |
| 14 | inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host) |
| 15 | { |
| 16 | if (!dot_pos || start_of_host >= pos || pos - dot_pos == 1) |
| 17 | return StringRef{}; |
| 18 | |
| 19 | auto after_dot = *(dot_pos + 1); |
| 20 | if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#') |
| 21 | return StringRef{}; |
| 22 | |
| 23 | return StringRef(start_of_host, pos - start_of_host); |
| 24 | } |
| 25 | |
| 26 | } |
| 27 | |
| 28 | /// Extracts host from given url. |
| 29 | inline StringRef getURLHost(const char * data, size_t size) |
| 30 | { |
| 31 | Pos pos = data; |
| 32 | Pos end = data + size; |
| 33 | |
| 34 | if (*pos == '/' && *(pos + 1) == '/') |
| 35 | { |
| 36 | pos += 2; |
| 37 | } |
| 38 | else |
| 39 | { |
| 40 | Pos scheme_end = data + std::min(size, 16UL); |
| 41 | for (++pos; pos < scheme_end; ++pos) |
| 42 | { |
| 43 | if (!isAlphaNumericASCII(*pos)) |
| 44 | { |
| 45 | switch (*pos) |
| 46 | { |
| 47 | case '.': |
| 48 | case '-': |
| 49 | case '+': |
| 50 | break; |
| 51 | case ' ': /// restricted symbols |
| 52 | case '\t': |
| 53 | case '<': |
| 54 | case '>': |
| 55 | case '%': |
| 56 | case '{': |
| 57 | case '}': |
| 58 | case '|': |
| 59 | case '\\': |
| 60 | case '^': |
| 61 | case '~': |
| 62 | case '[': |
| 63 | case ']': |
| 64 | case ';': |
| 65 | case '=': |
| 66 | case '&': |
| 67 | return StringRef{}; |
| 68 | default: |
| 69 | goto exloop; |
| 70 | } |
| 71 | } |
| 72 | } |
| 73 | exloop: if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/') |
| 74 | pos += 3; |
| 75 | else |
| 76 | pos = data; |
| 77 | } |
| 78 | |
| 79 | Pos dot_pos = nullptr; |
| 80 | auto start_of_host = pos; |
| 81 | for (; pos < end; ++pos) |
| 82 | { |
| 83 | switch (*pos) |
| 84 | { |
| 85 | case '.': |
| 86 | dot_pos = pos; |
| 87 | break; |
| 88 | case ':': /// end symbols |
| 89 | case '/': |
| 90 | case '?': |
| 91 | case '#': |
| 92 | return checkAndReturnHost(pos, dot_pos, start_of_host); |
| 93 | case '@': /// myemail@gmail.com |
| 94 | start_of_host = pos + 1; |
| 95 | break; |
| 96 | case ' ': /// restricted symbols in whole URL |
| 97 | case '\t': |
| 98 | case '<': |
| 99 | case '>': |
| 100 | case '%': |
| 101 | case '{': |
| 102 | case '}': |
| 103 | case '|': |
| 104 | case '\\': |
| 105 | case '^': |
| 106 | case '~': |
| 107 | case '[': |
| 108 | case ']': |
| 109 | case ';': |
| 110 | case '=': |
| 111 | case '&': |
| 112 | return StringRef{}; |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | return checkAndReturnHost(pos, dot_pos, start_of_host); |
| 117 | } |
| 118 | |
| 119 | template <bool without_www> |
| 120 | struct ExtractDomain |
| 121 | { |
| 122 | static size_t getReserveLengthForElement() { return 15; } |
| 123 | |
| 124 | static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size) |
| 125 | { |
| 126 | StringRef host = getURLHost(data, size); |
| 127 | |
| 128 | if (host.size == 0) |
| 129 | { |
| 130 | res_data = data; |
| 131 | res_size = 0; |
| 132 | } |
| 133 | else |
| 134 | { |
| 135 | if (without_www && host.size > 4 && !strncmp(host.data, "www." , 4)) |
| 136 | host = { host.data + 4, host.size - 4 }; |
| 137 | |
| 138 | res_data = host.data; |
| 139 | res_size = host.size; |
| 140 | } |
| 141 | } |
| 142 | }; |
| 143 | |
| 144 | } |
| 145 | |