1/*
2 * Lightweight URL & URI parser (RFC 1738, RFC 3986)
3 * https://github.com/corporateshark/LUrlParser
4 *
5 * The MIT License (MIT)
6 *
7 * Copyright (C) 2015 Sergey Kosarevsky (sk@linderdaum.com)
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in all
17 * copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 * SOFTWARE.
26 *
27 * IXUrlParser.cpp
28 * Author: Benjamin Sergeant
29 * Copyright (c) 2019 Machine Zone, Inc. All rights reserved.
30 */
31
32#include "IXUrlParser.h"
33
34#include <algorithm>
35#include <cstdlib>
36#include <cstring>
37
namespace
{
    /// Result code of clParseURL::ParseURL(). LUrlParserError_Ok is the only success value.
    /// The numeric values are kept stable for compatibility with the original LUrlParser.
    enum LUrlParserError
    {
        /// the URL was parsed successfully
        LUrlParserError_Ok = 0,
        /// state of a default-constructed clParseURL, nothing has been parsed yet
        LUrlParserError_Uninitialized = 1,
        /// the input contains no ':' that could terminate a scheme
        LUrlParserError_NoUrlCharacter = 2,
        /// the scheme is empty or contains a character outside [A-Za-z+-.]
        LUrlParserError_InvalidSchemeName = 3,
        /// the scheme is not followed by "//"
        LUrlParserError_NoDoubleSlash = 4,
        /// retained for compatibility; not produced by ParseURL() below
        LUrlParserError_NoAtSign = 5,
        /// retained for compatibility; not produced by ParseURL() below
        LUrlParserError_UnexpectedEndOfLine = 6,
        /// a bracketed host is followed by something other than ':', '/', '?', '#' or the end
        LUrlParserError_NoSlash = 7,
    };

    /// Decomposed URL of the form
    ///   <scheme>://[<user>[:<password>]@]<host>[:<port>][/<path>][?<query>][#<fragment>]
    ///
    /// All components are stored without their delimiters: m_Path has no leading '/',
    /// m_Query no leading '?', m_Fragment no leading '#'. A bracketed (IPv6) host keeps
    /// its brackets. Components that are absent from the URL are left empty.
    class clParseURL
    {
    public:
        LUrlParserError m_ErrorCode = LUrlParserError_Uninitialized;
        std::string m_Scheme; // always lowercase after a successful parse
        std::string m_Host;
        std::string m_Port; // raw text after ':'; use GetPort() for a validated number
        std::string m_Path;
        std::string m_Query;
        std::string m_Fragment;
        std::string m_UserName;
        std::string m_Password;

        clParseURL() = default;

        /// return 'true' if the parsing was successful
        bool IsValid() const
        {
            return m_ErrorCode == LUrlParserError_Ok;
        }

        /// helper to convert the port number to int, return 'true' if the URL is valid and
        /// the port consists only of decimal digits with a value within the 1..65535 range;
        /// OutPort may be null and is written only on success
        bool GetPort(int* OutPort) const;

        /// parse the URL; inspect IsValid() / m_ErrorCode on the returned object
        static clParseURL ParseURL(const std::string& URL);

    private:
        /// build a failed result that carries only an error code
        explicit clParseURL(LUrlParserError ErrorCode)
            : m_ErrorCode(ErrorCode)
        {
        }
    };

    /// ASCII letter test. Unlike isalpha() it is locale independent and well defined
    /// for every char value, including negative ones.
    bool IsAsciiLetter(char c)
    {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /// return 'true' if SchemeName is non-empty and made only of ASCII letters, '+', '-' and '.'
    bool IsSchemeValid(const std::string& SchemeName)
    {
        if (SchemeName.empty())
        {
            return false;
        }

        return std::all_of(SchemeName.begin(), SchemeName.end(), [](char c) {
            return IsAsciiLetter(c) || c == '+' || c == '-' || c == '.';
        });
    }

    bool clParseURL::GetPort(int* OutPort) const
    {
        if (!IsValid() || m_Port.empty())
        {
            return false;
        }

        int Port = 0;

        for (const char c : m_Port)
        {
            // strict decimal digits only: no sign, no whitespace, no trailing junk
            if (c < '0' || c > '9')
            {
                return false;
            }

            Port = Port * 10 + (c - '0');

            // bail out early; this also keeps the accumulator from ever overflowing
            if (Port > 65535)
            {
                return false;
            }
        }

        if (Port == 0)
        {
            return false;
        }

        if (OutPort)
        {
            *OutPort = Port;
        }

        return true;
    }

    // based on RFC 1738 and RFC 3986
    clParseURL clParseURL::ParseURL(const std::string& URL)
    {
        clParseURL Result;

        // scanning works on the C string, so parsing stops at the first NUL character
        const char* Cursor = URL.c_str();

        /*
         * <scheme>:<scheme-specific-part>
         * <scheme> := [a-z\+\-\.]+
         * For resiliency, programs interpreting URLs should treat upper case letters as
         * equivalent to lower case in scheme names
         */
        const char* SchemeEnd = strchr(Cursor, ':');

        if (!SchemeEnd)
        {
            return clParseURL(LUrlParserError_NoUrlCharacter);
        }

        Result.m_Scheme.assign(Cursor, SchemeEnd);

        if (!IsSchemeValid(Result.m_Scheme))
        {
            return clParseURL(LUrlParserError_InvalidSchemeName);
        }

        // scheme should be lowercase
        std::transform(Result.m_Scheme.begin(),
                       Result.m_Scheme.end(),
                       Result.m_Scheme.begin(),
                       [](char c) {
                           return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
                       });

        // skip ':'
        Cursor = SchemeEnd + 1;

        /*
         * //<user>:<password>@<host>:<port>/<url-path>
         */

        // skip "//" (strncmp never reads past the terminating NUL)
        if (strncmp(Cursor, "//", 2) != 0)
        {
            return clParseURL(LUrlParserError_NoDoubleSlash);
        }

        Cursor += 2;

        // the authority runs up to the first '/', '?' or '#', or to the end of the string
        const char* AuthorityEnd = Cursor + strcspn(Cursor, "/?#");

        // user name and password are present only if the authority itself contains '@'
        const char* AtSign = std::find(Cursor, AuthorityEnd, '@');

        if (AtSign != AuthorityEnd)
        {
            const char* UserEnd = std::find(Cursor, AtSign, ':');

            Result.m_UserName.assign(Cursor, UserEnd);

            if (UserEnd != AtSign)
            {
                // everything between the first ':' and the '@'
                Result.m_Password.assign(UserEnd + 1, AtSign);
            }

            // skip '@'
            Cursor = AtSign + 1;
        }

        // read the host name; a bracketed IPv6 literal may itself contain ':'
        const char* HostEnd = nullptr;

        if (*Cursor == '[')
        {
            const char* ClosingBracket = std::find(Cursor, AuthorityEnd, ']');

            // keep the brackets in the host; an unterminated literal takes the whole authority
            HostEnd = (ClosingBracket == AuthorityEnd) ? AuthorityEnd : ClosingBracket + 1;
        }
        else
        {
            HostEnd = std::find(Cursor, AuthorityEnd, ':');
        }

        Result.m_Host.assign(Cursor, HostEnd);

        Cursor = HostEnd;

        // is port number specified?
        if (Cursor != AuthorityEnd)
        {
            if (*Cursor != ':')
            {
                // only reachable with junk right after a bracketed host, e.g. "[::1]x"
                return clParseURL(LUrlParserError_NoSlash);
            }

            Result.m_Port.assign(Cursor + 1, AuthorityEnd);

            Cursor = AuthorityEnd;
        }

        // parse the path (stored without its leading '/')
        if (*Cursor == '/')
        {
            Cursor++;

            const size_t PathLength = strcspn(Cursor, "?#");

            Result.m_Path.assign(Cursor, PathLength);

            Cursor += PathLength;
        }

        // check for query
        if (*Cursor == '?')
        {
            Cursor++;

            const size_t QueryLength = strcspn(Cursor, "#");

            Result.m_Query.assign(Cursor, QueryLength);

            Cursor += QueryLength;
        }

        // check for fragment: everything after '#'
        if (*Cursor == '#')
        {
            Result.m_Fragment.assign(Cursor + 1);
        }

        Result.m_ErrorCode = LUrlParserError_Ok;

        return Result;
    }
} // namespace
337
338namespace ix
339{
340 bool UrlParser::parse(const std::string& url,
341 std::string& protocol,
342 std::string& host,
343 std::string& path,
344 std::string& query,
345 int& port)
346 {
347 clParseURL res = clParseURL::ParseURL(url);
348
349 if (!res.IsValid())
350 {
351 return false;
352 }
353
354 protocol = res.m_Scheme;
355 host = res.m_Host;
356 path = res.m_Path;
357 query = res.m_Query;
358
359 if (!res.GetPort(&port))
360 {
361 if (protocol == "ws" || protocol == "http")
362 {
363 port = 80;
364 }
365 else if (protocol == "wss" || protocol == "https")
366 {
367 port = 443;
368 }
369 else
370 {
371 // Invalid protocol. Should be caught by regex check
372 // but this missing branch trigger cpplint linter.
373 return false;
374 }
375 }
376
377 if (path.empty())
378 {
379 path = "/";
380 }
381 else if (path[0] != '/')
382 {
383 path = '/' + path;
384 }
385
386 if (!query.empty())
387 {
388 path += "?";
389 path += query;
390 }
391
392 return true;
393 }
394
395} // namespace ix
396