1#include <assert.h>
2#include <stdio.h>
3#include <string.h>
4
5#include "houdini.h"
6
7/*
 * The following characters will not be %XX-escaped:
 *
 * -_.+!*'(),%#@?=;:/&$ alphanum
11 *
12 * Note that this character set is the addition of:
13 *
 * - The characters which are safe to be in a URL
 * - The characters which are *not* safe to be in
 * a URL because they are RESERVED characters.
17 *
 * We assume (lazily) that any RESERVED char that
 * appears inside a URL is actually meant to
 * have its native function (i.e. as a URL
 * component/separator) and hence needs no escaping.
22 *
23 * There are two exceptions: the characters & (amp)
24 * and ' (single quote) do not appear in the table.
25 * They are meant to appear in the URL as components,
26 * yet they require special HTML-entity escaping
27 * to generate valid HTML markup.
28 *
29 * All other characters will be escaped to %XX.
30 *
31 */
/*
 * Lookup table indexed by byte value: 1 means the byte is copied to the
 * output unchanged, 0 means it needs some form of escaping.
 * Laid out as 16 entries per row, one row per high nibble.
 */
static const char HREF_SAFE[] = {
    /* 0x00-0x0F: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1F: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . / */
    0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
    /* 0x40-0x4F: @ A-O */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5F: P-Z, then four brackets/punctuation (unsafe), then _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
    /* 0x60-0x6F: backtick (unsafe), a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7F: p-z, then braces, pipe, tilde, DEL (all unsafe) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
    /* 0x80-0xFF: every non-ASCII byte is unsafe */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
45
46int houdini_escape_href(cmark_strbuf *ob, const uint8_t *src, bufsize_t size) {
47 static const uint8_t hex_chars[] = "0123456789ABCDEF";
48 bufsize_t i = 0, org;
49 uint8_t hex_str[3];
50
51 hex_str[0] = '%';
52
53 while (i < size) {
54 org = i;
55 while (i < size && HREF_SAFE[src[i]] != 0)
56 i++;
57
58 if (likely(i > org))
59 cmark_strbuf_put(ob, src + org, i - org);
60
61 /* escaping */
62 if (i >= size)
63 break;
64
65 switch (src[i]) {
66 /* amp appears all the time in URLs, but needs
67 * HTML-entity escaping to be inside an href */
68 case '&':
69 cmark_strbuf_puts(ob, "&amp;");
70 break;
71
72 /* the single quote is a valid URL character
73 * according to the standard; it needs HTML
74 * entity escaping too */
75 case '\'':
76 cmark_strbuf_puts(ob, "&#x27;");
77 break;
78
79/* the space can be escaped to %20 or a plus
80 * sign. we're going with the generic escape
81 * for now. the plus thing is more commonly seen
82 * when building GET strings */
83#if 0
84 case ' ':
85 cmark_strbuf_putc(ob, '+');
86 break;
87#endif
88
89 /* every other character goes with a %XX escaping */
90 default:
91 hex_str[1] = hex_chars[(src[i] >> 4) & 0xF];
92 hex_str[2] = hex_chars[src[i] & 0xF];
93 cmark_strbuf_put(ob, hex_str, 3);
94 }
95
96 i++;
97 }
98
99 return 1;
100}
101