1 | #include <assert.h> |
2 | #include <stdio.h> |
3 | #include <string.h> |
4 | |
5 | #include "houdini.h" |
6 | |
7 | /* |
8 | * The following characters will not be escaped: |
9 | * |
10 | * -_.+!*'(),%#@?=;:/,+&$ alphanum |
11 | * |
12 | * Note that this character set is the addition of: |
13 | * |
14 | * - The characters which are safe to be in an URL |
15 | * - The characters which are *not* safe to be in |
16 | * an URL because they are RESERVED characters. |
17 | * |
18 | * We assume (lazily) that any RESERVED char that |
19 | * appears inside an URL is actually meant to |
20 | * have its native function (i.e. as an URL |
21 | * component/separator) and hence needs no escaping. |
22 | * |
23 | * There are two exceptions: the characters & (amp) |
24 | * and ' (single quote) do not appear in the table. |
25 | * They are meant to appear in the URL as components, |
26 | * yet they require special HTML-entity escaping |
27 | * to generate valid HTML markup. |
28 | * |
29 | * All other characters will be escaped to %XX. |
30 | * |
31 | */ |
/*
 * Lookup table indexed by byte value (0x00-0xFF): a non-zero entry means
 * the byte is copied into an href unchanged, a zero entry means it needs
 * some form of escaping. Laid out as 16 rows of 16 entries so that an
 * entry can be located from the byte's hexadecimal value.
 */
static const char HREF_SAFE[] = {
    /* 0x00 - 0x0F: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1F: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2F: SP ! " # $ % & ' ( ) * + , - . / */
    0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 - 0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    /* 0x40 - 0x4F: @ A B C D E F G H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    /* 0x60 - 0x6F: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7F: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    /* 0x80 - 0xFF: bytes outside ASCII are never safe */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
45 | |
46 | int houdini_escape_href(cmark_strbuf *ob, const uint8_t *src, bufsize_t size) { |
47 | static const uint8_t hex_chars[] = "0123456789ABCDEF" ; |
48 | bufsize_t i = 0, org; |
49 | uint8_t hex_str[3]; |
50 | |
51 | hex_str[0] = '%'; |
52 | |
53 | while (i < size) { |
54 | org = i; |
55 | while (i < size && HREF_SAFE[src[i]] != 0) |
56 | i++; |
57 | |
58 | if (likely(i > org)) |
59 | cmark_strbuf_put(ob, src + org, i - org); |
60 | |
61 | /* escaping */ |
62 | if (i >= size) |
63 | break; |
64 | |
65 | switch (src[i]) { |
66 | /* amp appears all the time in URLs, but needs |
67 | * HTML-entity escaping to be inside an href */ |
68 | case '&': |
69 | cmark_strbuf_puts(ob, "&" ); |
70 | break; |
71 | |
72 | /* the single quote is a valid URL character |
73 | * according to the standard; it needs HTML |
74 | * entity escaping too */ |
75 | case '\'': |
76 | cmark_strbuf_puts(ob, "'" ); |
77 | break; |
78 | |
79 | /* the space can be escaped to %20 or a plus |
80 | * sign. we're going with the generic escape |
81 | * for now. the plus thing is more commonly seen |
82 | * when building GET strings */ |
83 | #if 0 |
84 | case ' ': |
85 | cmark_strbuf_putc(ob, '+'); |
86 | break; |
87 | #endif |
88 | |
89 | /* every other character goes with a %XX escaping */ |
90 | default: |
91 | hex_str[1] = hex_chars[(src[i] >> 4) & 0xF]; |
92 | hex_str[2] = hex_chars[src[i] & 0xF]; |
93 | cmark_strbuf_put(ob, hex_str, 3); |
94 | } |
95 | |
96 | i++; |
97 | } |
98 | |
99 | return 1; |
100 | } |
101 | |