1 | #pragma once |
2 | |
3 | #include "protocol.h" |
4 | #include <common/find_symbols.h> |
5 | #include <cstring> |
6 | #include <Common/StringUtils/StringUtils.h> |
7 | |
8 | namespace DB |
9 | { |
10 | |
11 | namespace |
12 | { |
13 | |
14 | inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host) |
15 | { |
16 | if (!dot_pos || start_of_host >= pos || pos - dot_pos == 1) |
17 | return StringRef{}; |
18 | |
19 | auto after_dot = *(dot_pos + 1); |
20 | if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#') |
21 | return StringRef{}; |
22 | |
23 | return StringRef(start_of_host, pos - start_of_host); |
24 | } |
25 | |
26 | } |
27 | |
28 | /// Extracts host from given url. |
29 | inline StringRef getURLHost(const char * data, size_t size) |
30 | { |
31 | Pos pos = data; |
32 | Pos end = data + size; |
33 | |
34 | if (*pos == '/' && *(pos + 1) == '/') |
35 | { |
36 | pos += 2; |
37 | } |
38 | else |
39 | { |
40 | Pos scheme_end = data + std::min(size, 16UL); |
41 | for (++pos; pos < scheme_end; ++pos) |
42 | { |
43 | if (!isAlphaNumericASCII(*pos)) |
44 | { |
45 | switch (*pos) |
46 | { |
47 | case '.': |
48 | case '-': |
49 | case '+': |
50 | break; |
51 | case ' ': /// restricted symbols |
52 | case '\t': |
53 | case '<': |
54 | case '>': |
55 | case '%': |
56 | case '{': |
57 | case '}': |
58 | case '|': |
59 | case '\\': |
60 | case '^': |
61 | case '~': |
62 | case '[': |
63 | case ']': |
64 | case ';': |
65 | case '=': |
66 | case '&': |
67 | return StringRef{}; |
68 | default: |
69 | goto exloop; |
70 | } |
71 | } |
72 | } |
73 | exloop: if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/') |
74 | pos += 3; |
75 | else |
76 | pos = data; |
77 | } |
78 | |
79 | Pos dot_pos = nullptr; |
80 | auto start_of_host = pos; |
81 | for (; pos < end; ++pos) |
82 | { |
83 | switch (*pos) |
84 | { |
85 | case '.': |
86 | dot_pos = pos; |
87 | break; |
88 | case ':': /// end symbols |
89 | case '/': |
90 | case '?': |
91 | case '#': |
92 | return checkAndReturnHost(pos, dot_pos, start_of_host); |
93 | case '@': /// myemail@gmail.com |
94 | start_of_host = pos + 1; |
95 | break; |
96 | case ' ': /// restricted symbols in whole URL |
97 | case '\t': |
98 | case '<': |
99 | case '>': |
100 | case '%': |
101 | case '{': |
102 | case '}': |
103 | case '|': |
104 | case '\\': |
105 | case '^': |
106 | case '~': |
107 | case '[': |
108 | case ']': |
109 | case ';': |
110 | case '=': |
111 | case '&': |
112 | return StringRef{}; |
113 | } |
114 | } |
115 | |
116 | return checkAndReturnHost(pos, dot_pos, start_of_host); |
117 | } |
118 | |
119 | template <bool without_www> |
120 | struct ExtractDomain |
121 | { |
122 | static size_t getReserveLengthForElement() { return 15; } |
123 | |
124 | static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size) |
125 | { |
126 | StringRef host = getURLHost(data, size); |
127 | |
128 | if (host.size == 0) |
129 | { |
130 | res_data = data; |
131 | res_size = 0; |
132 | } |
133 | else |
134 | { |
135 | if (without_www && host.size > 4 && !strncmp(host.data, "www." , 4)) |
136 | host = { host.data + 4, host.size - 4 }; |
137 | |
138 | res_data = host.data; |
139 | res_size = host.size; |
140 | } |
141 | } |
142 | }; |
143 | |
144 | } |
145 | |