1 | /* |
2 | * Lightweight URL & URI parser (RFC 1738, RFC 3986) |
3 | * https://github.com/corporateshark/LUrlParser |
4 | * |
5 | * The MIT License (MIT) |
6 | * |
7 | * Copyright (C) 2015 Sergey Kosarevsky (sk@linderdaum.com) |
8 | * |
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | * of this software and associated documentation files (the "Software"), to deal |
11 | * in the Software without restriction, including without limitation the rights |
12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
13 | * copies of the Software, and to permit persons to whom the Software is |
14 | * furnished to do so, subject to the following conditions: |
15 | * |
16 | * The above copyright notice and this permission notice shall be included in all |
17 | * copies or substantial portions of the Software. |
18 | * |
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
25 | * SOFTWARE. |
26 | * |
27 | * IXUrlParser.cpp |
28 | * Author: Benjamin Sergeant |
29 | * Copyright (c) 2019 Machine Zone, Inc. All rights reserved. |
30 | */ |
31 | |
32 | #include "IXUrlParser.h" |
33 | |
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <cstring>
37 | |
namespace
{
    /// Outcome of clParseURL::ParseURL().
    enum LUrlParserError
    {
        LUrlParserError_Ok = 0,                  ///< the URL was parsed successfully
        LUrlParserError_Uninitialized = 1,       ///< default-constructed, nothing parsed yet
        LUrlParserError_NoUrlCharacter = 2,      ///< no ':' terminating the scheme was found
        LUrlParserError_InvalidSchemeName = 3,   ///< scheme has a character outside [A-Za-z+-.]
        LUrlParserError_NoDoubleSlash = 4,       ///< "<scheme>:" is not followed by "//"
        LUrlParserError_NoAtSign = 5,            ///< user info is not terminated by '@'
        LUrlParserError_UnexpectedEndOfLine = 6, ///< not returned by ParseURL() below
        LUrlParserError_NoSlash = 7,             ///< unexpected character after <host>[:<port>]
    };

    /// Result of splitting a URL of the form
    ///   <scheme>://[<user>[:<password>]@]<host>[:<port>][/<path>][?<query>][#<fragment>]
    /// into its components. Components that are absent are left empty.
    /// No percent-decoding is performed on any component.
    class clParseURL
    {
    public:
        LUrlParserError m_ErrorCode;
        std::string m_Scheme;   // lower-cased by ParseURL()
        std::string m_Host;     // keeps the surrounding '[' ']' for bracketed (IPv6) hosts
        std::string m_Port;     // raw text after "<host>:", not validated; see GetPort()
        std::string m_Path;     // without the leading '/'
        std::string m_Query;    // without the leading '?'
        std::string m_Fragment; // without the leading '#'
        std::string m_UserName;
        std::string m_Password;

        clParseURL()
            : m_ErrorCode(LUrlParserError_Uninitialized)
        {
        }

        /// return 'true' if the parsing was successful
        bool IsValid() const
        {
            return m_ErrorCode == LUrlParserError_Ok;
        }

        /// helper to convert the port number to int, return 'true' if the port is valid (within
        /// the 1..65535 range). OutPort may be null; it is written only when 'true' is returned.
        bool GetPort(int* OutPort) const;

        /// parse the URL; on failure the returned object carries only the error code
        static clParseURL ParseURL(const std::string& URL);

    private:
        explicit clParseURL(LUrlParserError ErrorCode)
            : m_ErrorCode(ErrorCode)
        {
        }
    };

    /// return 'true' if every character of SchemeName is a letter, '+', '-' or '.'
    /// (an empty name is accepted, as before)
    static bool IsSchemeValid(const std::string& SchemeName)
    {
        for (const char c : SchemeName)
        {
            // <cctype> classifiers are undefined for negative arguments other than EOF,
            // so non-ASCII bytes must be widened through unsigned char first.
            const bool bIsLetter = std::isalpha(static_cast<unsigned char>(c)) != 0;

            if (!bIsLetter && c != '+' && c != '-' && c != '.') return false;
        }

        return true;
    }

    /// return 'true' for the delimiters that terminate the authority
    /// (<user>:<password>@<host>:<port>) part of a URL, see RFC 3986 section 3.2
    static bool IsAuthorityEnd(char c)
    {
        return c == '/' || c == '?' || c == '#';
    }

    bool clParseURL::GetPort(int* OutPort) const
    {
        if (!IsValid())
        {
            return false;
        }

        // strtol() rather than atoi(): atoi() has undefined behavior when the value does not
        // fit in an int, whereas strtol() saturates at LONG_MAX/LONG_MIN, which the range
        // check below rejects. Leading whitespace and trailing garbage are tolerated exactly
        // as they were with atoi().
        const long Port = std::strtol(m_Port.c_str(), nullptr, 10);

        if (Port <= 0 || Port > 65535)
        {
            return false;
        }

        if (OutPort)
        {
            *OutPort = static_cast<int>(Port);
        }

        return true;
    }

    // based on RFC 1738 and RFC 3986
    clParseURL clParseURL::ParseURL(const std::string& URL)
    {
        clParseURL Result;

        // The parser walks the NUL-terminated buffer: CurrentString is the start of the
        // component being read, LocalString scans forward to its end.
        const char* CurrentString = URL.c_str();

        /*
         *	<scheme>:<scheme-specific-part>
         *	<scheme> := [a-z\+\-\.]+
         *	For resiliency, programs interpreting URLs should treat upper case letters as
         *	equivalent to lower case in scheme names
         */

        // try to read scheme
        {
            const char* LocalString = std::strchr(CurrentString, ':');

            if (!LocalString)
            {
                return clParseURL(LUrlParserError_NoUrlCharacter);
            }

            // save the scheme name
            Result.m_Scheme.assign(CurrentString, LocalString);

            if (!IsSchemeValid(Result.m_Scheme))
            {
                return clParseURL(LUrlParserError_InvalidSchemeName);
            }

            // scheme should be lowercase (unsigned char for the same reason as in IsSchemeValid)
            std::transform(Result.m_Scheme.begin(),
                           Result.m_Scheme.end(),
                           Result.m_Scheme.begin(),
                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

            // skip ':'
            CurrentString = LocalString + 1;
        }

        /*
         *	//<user>:<password>@<host>:<port>/<url-path>
         *	any ":", "@" and "/" inside <user> and <password> must be encoded by the caller;
         *	this parser does not decode them
         */

        // skip "//" (the second test is only evaluated when the first character was '/',
        // so the terminating NUL is never stepped over)
        if (*CurrentString++ != '/') return clParseURL(LUrlParserError_NoDoubleSlash);
        if (*CurrentString++ != '/') return clParseURL(LUrlParserError_NoDoubleSlash);

        // check if the user name and password are specified: an '@' counts only if it shows
        // up before the authority ends. Stopping at '?' and '#' as well as '/' keeps an '@'
        // inside a query or fragment ("http://host?x=a@b") from being taken for user info.
        bool bHasUserName = false;

        const char* LocalString = CurrentString;

        while (*LocalString && !IsAuthorityEnd(*LocalString))
        {
            if (*LocalString == '@')
            {
                bHasUserName = true;
                break;
            }

            LocalString++;
        }

        // user name and password
        LocalString = CurrentString;

        if (bHasUserName)
        {
            // read user name
            while (*LocalString && *LocalString != ':' && *LocalString != '@')
                LocalString++;

            Result.m_UserName.assign(CurrentString, LocalString);

            // proceed with the current pointer
            CurrentString = LocalString;

            if (*CurrentString == ':')
            {
                // skip ':'
                CurrentString++;

                // read password
                LocalString = CurrentString;

                while (*LocalString && *LocalString != '@')
                    LocalString++;

                Result.m_Password.assign(CurrentString, LocalString);

                CurrentString = LocalString;
            }

            // skip '@'
            if (*CurrentString != '@')
            {
                return clParseURL(LUrlParserError_NoAtSign);
            }

            CurrentString++;
        }

        // a host starting with '[' (IPv6 literal) may itself contain ':'
        const bool bHasBracket = (*CurrentString == '[');

        // go ahead, read the host name
        LocalString = CurrentString;

        while (*LocalString)
        {
            if (bHasBracket && *LocalString == ']')
            {
                // end of IPv6 address, the closing bracket is kept in the host
                LocalString++;
                break;
            }
            else if (!bHasBracket && (*LocalString == ':' || IsAuthorityEnd(*LocalString)))
            {
                // port number, path, query or fragment follows
                break;
            }

            LocalString++;
        }

        Result.m_Host.assign(CurrentString, LocalString);

        CurrentString = LocalString;

        // is port number specified?
        if (*CurrentString == ':')
        {
            CurrentString++;

            // read port number
            LocalString = CurrentString;

            while (*LocalString && !IsAuthorityEnd(*LocalString))
                LocalString++;

            Result.m_Port.assign(CurrentString, LocalString);

            CurrentString = LocalString;
        }

        // end of string
        if (!*CurrentString)
        {
            Result.m_ErrorCode = LUrlParserError_Ok;

            return Result;
        }

        // skip '/'; a query or fragment may also follow the authority directly
        // ("ws://host:8080?token=abc"), in which case the path is simply empty
        if (*CurrentString == '/')
        {
            CurrentString++;
        }
        else if (*CurrentString != '?' && *CurrentString != '#')
        {
            return clParseURL(LUrlParserError_NoSlash);
        }

        // parse the path
        LocalString = CurrentString;

        while (*LocalString && *LocalString != '#' && *LocalString != '?')
            LocalString++;

        Result.m_Path.assign(CurrentString, LocalString);

        CurrentString = LocalString;

        // check for query
        if (*CurrentString == '?')
        {
            // skip '?'
            CurrentString++;

            // read query
            LocalString = CurrentString;

            while (*LocalString && *LocalString != '#')
                LocalString++;

            Result.m_Query.assign(CurrentString, LocalString);

            CurrentString = LocalString;
        }

        // check for fragment
        if (*CurrentString == '#')
        {
            // skip '#'; everything up to the end of the string is the fragment
            CurrentString++;

            Result.m_Fragment.assign(CurrentString);
        }

        Result.m_ErrorCode = LUrlParserError_Ok;

        return Result;
    }
} // namespace
337 | |
338 | namespace ix |
339 | { |
340 | bool UrlParser::parse(const std::string& url, |
341 | std::string& protocol, |
342 | std::string& host, |
343 | std::string& path, |
344 | std::string& query, |
345 | int& port) |
346 | { |
347 | clParseURL res = clParseURL::ParseURL(url); |
348 | |
349 | if (!res.IsValid()) |
350 | { |
351 | return false; |
352 | } |
353 | |
354 | protocol = res.m_Scheme; |
355 | host = res.m_Host; |
356 | path = res.m_Path; |
357 | query = res.m_Query; |
358 | |
359 | if (!res.GetPort(&port)) |
360 | { |
361 | if (protocol == "ws" || protocol == "http" ) |
362 | { |
363 | port = 80; |
364 | } |
365 | else if (protocol == "wss" || protocol == "https" ) |
366 | { |
367 | port = 443; |
368 | } |
369 | else |
370 | { |
371 | // Invalid protocol. Should be caught by regex check |
372 | // but this missing branch trigger cpplint linter. |
373 | return false; |
374 | } |
375 | } |
376 | |
377 | if (path.empty()) |
378 | { |
379 | path = "/" ; |
380 | } |
381 | else if (path[0] != '/') |
382 | { |
383 | path = '/' + path; |
384 | } |
385 | |
386 | if (!query.empty()) |
387 | { |
388 | path += "?" ; |
389 | path += query; |
390 | } |
391 | |
392 | return true; |
393 | } |
394 | |
395 | } // namespace ix |
396 | |