1/* Copyright (c) 2018, Google Inc.
2 *
3 * Permission to use, copy, modify, and/or distribute this software for any
4 * purpose with or without fee is hereby granted, provided that the above
5 * copyright notice and this permission notice appear in all copies.
6 *
7 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15#include <openssl/bytestring.h>
16
17#include "internal.h"
18
19
// is_valid_code_point returns one if |v| is a code point that may be encoded
// and zero otherwise. Section references are to Unicode 9.0.0.
static int is_valid_code_point(uint32_t v) {
  // The Unicode code space ends at 0x10ffff (3.4 D9).
  if (v > 0x10ffff) {
    return 0;
  }

  // Surrogate code points are never valid on their own (3.2 C1).
  if (v >= 0xd800 && v <= 0xdfff) {
    return 0;
  }

  // The block 0xfdd0-0xfdef is permanently reserved (3.4 D14).
  if (v >= 0xfdd0 && v <= 0xfdef) {
    return 0;
  }

  // So is every value whose low sixteen bits are 0xfffe or 0xffff (3.4 D14).
  // Masking off the lowest bit lets one comparison cover both.
  if ((v & 0xfffe) == 0xfffe) {
    return 0;
  }

  return 1;
}
34
// BOTTOM_BITS returns a byte with the bottom |n| bits set. |n| must be between
// zero and eight, inclusive; the result is truncated to a |uint8_t|. The whole
// expansion is parenthesized so that it acts as a single operand regardless of
// the operators surrounding the macro invocation.
#define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))

// TOP_BITS returns a byte with the top |n| bits set. |n| must be between zero
// and eight, inclusive. It is computed as the complement of the bottom
// |8 - n| bits, truncated back to a |uint8_t|.
#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))
40
41int cbs_get_utf8(CBS *cbs, uint32_t *out) {
42 uint8_t c;
43 if (!CBS_get_u8(cbs, &c)) {
44 return 0;
45 }
46 if (c <= 0x7f) {
47 *out = c;
48 return 1;
49 }
50 uint32_t v, lower_bound;
51 size_t len;
52 if ((c & TOP_BITS(3)) == TOP_BITS(2)) {
53 v = c & BOTTOM_BITS(5);
54 len = 1;
55 lower_bound = 0x80;
56 } else if ((c & TOP_BITS(4)) == TOP_BITS(3)) {
57 v = c & BOTTOM_BITS(4);
58 len = 2;
59 lower_bound = 0x800;
60 } else if ((c & TOP_BITS(5)) == TOP_BITS(4)) {
61 v = c & BOTTOM_BITS(3);
62 len = 3;
63 lower_bound = 0x10000;
64 } else {
65 return 0;
66 }
67 for (size_t i = 0; i < len; i++) {
68 if (!CBS_get_u8(cbs, &c) ||
69 (c & TOP_BITS(2)) != TOP_BITS(1)) {
70 return 0;
71 }
72 v <<= 6;
73 v |= c & BOTTOM_BITS(6);
74 }
75 if (!is_valid_code_point(v) ||
76 v < lower_bound) {
77 return 0;
78 }
79 *out = v;
80 return 1;
81}
82
83int cbs_get_latin1(CBS *cbs, uint32_t *out) {
84 uint8_t c;
85 if (!CBS_get_u8(cbs, &c)) {
86 return 0;
87 }
88 *out = c;
89 return 1;
90}
91
92int cbs_get_ucs2_be(CBS *cbs, uint32_t *out) {
93 // Note UCS-2 (used by BMPString) does not support surrogates.
94 uint16_t c;
95 if (!CBS_get_u16(cbs, &c) ||
96 !is_valid_code_point(c)) {
97 return 0;
98 }
99 *out = c;
100 return 1;
101}
102
103int cbs_get_utf32_be(CBS *cbs, uint32_t *out) {
104 return CBS_get_u32(cbs, out) && is_valid_code_point(*out);
105}
106
// cbb_get_utf8_len returns the number of bytes, from one to four, in the UTF-8
// encoding of |u|. It does not check that |u| is a valid code point; any value
// above 0xffff is reported as four bytes.
size_t cbb_get_utf8_len(uint32_t u) {
  // Test from the widest encoding down to the narrowest.
  if (u > 0xffff) {
    return 4;
  }
  if (u > 0x7ff) {
    return 3;
  }
  return u > 0x7f ? 2 : 1;
}
119
120int cbb_add_utf8(CBB *cbb, uint32_t u) {
121 if (!is_valid_code_point(u)) {
122 return 0;
123 }
124 if (u <= 0x7f) {
125 return CBB_add_u8(cbb, (uint8_t)u);
126 }
127 if (u <= 0x7ff) {
128 return CBB_add_u8(cbb, TOP_BITS(2) | (u >> 6)) &&
129 CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
130 }
131 if (u <= 0xffff) {
132 return CBB_add_u8(cbb, TOP_BITS(3) | (u >> 12)) &&
133 CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
134 CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
135 }
136 if (u <= 0x10ffff) {
137 return CBB_add_u8(cbb, TOP_BITS(4) | (u >> 18)) &&
138 CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6))) &&
139 CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
140 CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
141 }
142 return 0;
143}
144
145int cbb_add_latin1(CBB *cbb, uint32_t u) {
146 return u <= 0xff && CBB_add_u8(cbb, (uint8_t)u);
147}
148
149int cbb_add_ucs2_be(CBB *cbb, uint32_t u) {
150 return u <= 0xffff && is_valid_code_point(u) && CBB_add_u16(cbb, (uint16_t)u);
151}
152
153int cbb_add_utf32_be(CBB *cbb, uint32_t u) {
154 return is_valid_code_point(u) && CBB_add_u32(cbb, u);
155}
156