1#pragma once
2
3#include "protocol.h"
4#include <common/find_symbols.h>
5#include <cstring>
6#include <Common/StringUtils/StringUtils.h>
7
8namespace DB
9{
10
11namespace
12{
13
14inline StringRef checkAndReturnHost(const Pos & pos, const Pos & dot_pos, const Pos & start_of_host)
15{
16 if (!dot_pos || start_of_host >= pos || pos - dot_pos == 1)
17 return StringRef{};
18
19 auto after_dot = *(dot_pos + 1);
20 if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#')
21 return StringRef{};
22
23 return StringRef(start_of_host, pos - start_of_host);
24}
25
26}
27
28/// Extracts host from given url.
29inline StringRef getURLHost(const char * data, size_t size)
30{
31 Pos pos = data;
32 Pos end = data + size;
33
34 if (*pos == '/' && *(pos + 1) == '/')
35 {
36 pos += 2;
37 }
38 else
39 {
40 Pos scheme_end = data + std::min(size, 16UL);
41 for (++pos; pos < scheme_end; ++pos)
42 {
43 if (!isAlphaNumericASCII(*pos))
44 {
45 switch (*pos)
46 {
47 case '.':
48 case '-':
49 case '+':
50 break;
51 case ' ': /// restricted symbols
52 case '\t':
53 case '<':
54 case '>':
55 case '%':
56 case '{':
57 case '}':
58 case '|':
59 case '\\':
60 case '^':
61 case '~':
62 case '[':
63 case ']':
64 case ';':
65 case '=':
66 case '&':
67 return StringRef{};
68 default:
69 goto exloop;
70 }
71 }
72 }
73exloop: if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
74 pos += 3;
75 else
76 pos = data;
77 }
78
79 Pos dot_pos = nullptr;
80 auto start_of_host = pos;
81 for (; pos < end; ++pos)
82 {
83 switch (*pos)
84 {
85 case '.':
86 dot_pos = pos;
87 break;
88 case ':': /// end symbols
89 case '/':
90 case '?':
91 case '#':
92 return checkAndReturnHost(pos, dot_pos, start_of_host);
93 case '@': /// myemail@gmail.com
94 start_of_host = pos + 1;
95 break;
96 case ' ': /// restricted symbols in whole URL
97 case '\t':
98 case '<':
99 case '>':
100 case '%':
101 case '{':
102 case '}':
103 case '|':
104 case '\\':
105 case '^':
106 case '~':
107 case '[':
108 case ']':
109 case ';':
110 case '=':
111 case '&':
112 return StringRef{};
113 }
114 }
115
116 return checkAndReturnHost(pos, dot_pos, start_of_host);
117}
118
119template <bool without_www>
120struct ExtractDomain
121{
122 static size_t getReserveLengthForElement() { return 15; }
123
124 static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
125 {
126 StringRef host = getURLHost(data, size);
127
128 if (host.size == 0)
129 {
130 res_data = data;
131 res_size = 0;
132 }
133 else
134 {
135 if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
136 host = { host.data + 4, host.size - 4 };
137
138 res_data = host.data;
139 res_size = host.size;
140 }
141 }
142};
143
144}
145