1/*
2 * Copyright (c) 2015, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "config.h"
30
31#include "utf8_validate.h"
32
33#include "ue2common.h"
34#include "util/unicode_def.h"
35
36#include <cstring>
37
38namespace ue2 {
39
40static
41bool hasValidContBytes(const u8 *s, size_t num) {
42 /* continuer bytes must all be of the form 10xx xxxx */
43 for (size_t i = 0; i < num; i++) {
44 if ((s[i] & 0xc0) != UTF_CONT_BYTE_HEADER) {
45 return false;
46 }
47 }
48 return true;
49}
50
51static
52bool isAllowedCodepoint(u32 val) {
53 if (val >= 0xd800 && val <= 0xdfff) {
54 return false; // High and low surrogate halves
55 }
56 if (val > 0x10ffff) {
57 return false; // As per limit in RFC 3629
58 }
59
60 return true;
61}
62
63bool isValidUtf8(const char *expression, const size_t len) {
64 if (!expression) {
65 return true;
66 }
67
68 const u8 *s = (const u8 *)expression;
69 u32 val;
70
71 size_t i = 0;
72 while (i < len) {
73 DEBUG_PRINTF("byte %zu: 0x%02x\n", i, s[i]);
74 // One octet.
75 if (s[i] < 0x7f) {
76 DEBUG_PRINTF("one octet\n");
77 i++;
78 continue;
79 }
80
81 // Two octets.
82 if ((s[i] & 0xe0) == UTF_TWO_BYTE_HEADER) {
83 DEBUG_PRINTF("two octets\n");
84 if (i + 2 > len) {
85 break;
86 }
87 if (!hasValidContBytes(&s[i] + 1, 1)) {
88 break;
89 }
90 val = ((s[i] & 0x1f) << 6) | (s[i + 1] & UTF_CONT_BYTE_VALUE_MASK);
91 DEBUG_PRINTF("val=0x%x\n", val);
92 if (val < 1U << 7) {
93 DEBUG_PRINTF("overlong encoding\n");
94 break;
95 }
96 if (!isAllowedCodepoint(val)) {
97 DEBUG_PRINTF("codepoint not allowed\n");
98 break;
99 }
100 i += 2;
101 continue;
102 }
103
104 // Three octets.
105 if ((s[i] & 0xf0) == UTF_THREE_BYTE_HEADER) {
106 DEBUG_PRINTF("three octets\n");
107 if (i + 3 > len) {
108 break;
109 }
110 if (!hasValidContBytes(&s[i] + 1, 2)) {
111 break;
112 }
113 val = ((s[i] & 0xf) << 12) |
114 ((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
115 (s[i + 2] & UTF_CONT_BYTE_VALUE_MASK);
116 if (val < 1U << 11) {
117 DEBUG_PRINTF("overlong encoding\n");
118 break;
119 }
120 if (!isAllowedCodepoint(val)) {
121 DEBUG_PRINTF("codepoint not allowed\n");
122 break;
123 }
124 i += 3;
125 continue;
126 }
127
128 // Four octets.
129 if ((s[i] & 0xf8) == UTF_FOUR_BYTE_HEADER) {
130 DEBUG_PRINTF("four octets\n");
131 if (i + 4 > len) {
132 break;
133 }
134 if (!hasValidContBytes(&s[i] + 1, 3)) {
135 break;
136 }
137 val = ((s[i] & 0xf) << 18) |
138 ((s[i + 1] & UTF_CONT_BYTE_VALUE_MASK) << 12) |
139 ((s[i + 2] & UTF_CONT_BYTE_VALUE_MASK) << 6) |
140 (s[i + 3] & UTF_CONT_BYTE_VALUE_MASK);
141 if (val < 1U << 16) {
142 DEBUG_PRINTF("overlong encoding\n");
143 break;
144 }
145 if (!isAllowedCodepoint(val)) {
146 DEBUG_PRINTF("codepoint not allowed\n");
147 break;
148 }
149 i += 4;
150 continue;
151 }
152
153 // Something else?
154 DEBUG_PRINTF("bad byte 0x%02x\n", s[i]);
155 break;
156 }
157
158 DEBUG_PRINTF("i=%zu, len=%zu\n", i, len);
159 return i == len;
160}
161
162} // namespace ue2
163