1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_UTIL_UTF8_H |
19 | #define ARROW_UTIL_UTF8_H |
20 | |
21 | #include <cassert> |
22 | #include <cstdint> |
23 | #include <cstring> |
24 | #include <memory> |
25 | |
26 | #include "arrow/util/macros.h" |
27 | #include "arrow/util/string_view.h" |
28 | #include "arrow/util/visibility.h" |
29 | |
30 | namespace arrow { |
31 | namespace util { |
32 | |
33 | namespace internal { |
34 | |
35 | // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
36 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
37 | |
38 | // A compact state table allowing UTF8 decoding using two dependent |
39 | // lookups per byte. The first lookup determines the character class |
40 | // and the second lookup reads the next state. |
41 | // In this table states are multiples of 12. |
42 | ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12]; |
43 | |
44 | // Success / reject states when looked up in the small table |
45 | static constexpr uint8_t kUTF8DecodeAccept = 0; |
46 | static constexpr uint8_t kUTF8DecodeReject = 12; |
47 | |
48 | // An expanded state table allowing transitions using a single lookup |
49 | // at the expense of a larger memory footprint (but on non-random data, |
50 | // not all the table will end up accessed and cached). |
51 | // In this table states are multiples of 256. |
52 | ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256]; |
53 | |
54 | // Success / reject states when looked up in the large table |
55 | static constexpr uint16_t kUTF8ValidateAccept = 0; |
56 | static constexpr uint16_t kUTF8ValidateReject = 256; |
57 | |
58 | static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) { |
59 | uint8_t type = utf8_small_table[byte]; |
60 | |
61 | *codep = (state != kUTF8DecodeAccept) ? (byte & 0x3fu) | (*codep << 6) |
62 | : (0xff >> type) & (byte); |
63 | |
64 | state = utf8_small_table[256 + state + type]; |
65 | return state; |
66 | } |
67 | |
68 | static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) { |
69 | return utf8_large_table[state + byte]; |
70 | } |
71 | |
72 | #ifndef NDEBUG |
73 | ARROW_EXPORT void CheckUTF8Initialized(); |
74 | #endif |
75 | |
76 | } // namespace internal |
77 | |
78 | // This function needs to be called before doing UTF8 validation. |
79 | ARROW_EXPORT void InitializeUTF8(); |
80 | |
81 | inline bool ValidateUTF8(const uint8_t* data, int64_t size) { |
82 | static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL; |
83 | // For some reason, defining this variable outside the loop helps clang |
84 | uint64_t mask; |
85 | |
86 | #ifndef NDEBUG |
87 | internal::CheckUTF8Initialized(); |
88 | #endif |
89 | |
90 | while (size >= 8) { |
91 | // XXX This is doing an unaligned access. Contemporary architectures |
92 | // (x86-64, AArch64, PPC64) support it natively and often have good |
93 | // performance nevertheless. |
94 | memcpy(&mask, data, 8); |
95 | if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) { |
96 | // 8 bytes of pure ASCII, move forward |
97 | size -= 8; |
98 | data += 8; |
99 | continue; |
100 | } |
101 | // Non-ASCII run detected. |
102 | // We process at least 4 bytes, to avoid too many spurious 64-bit reads |
103 | // in case the non-ASCII bytes are at the end of the tested 64-bit word. |
104 | // We also only check for rejection at the end since that state is stable |
105 | // (once in reject state, we always remain in reject state). |
106 | // It is guaranteed that size >= 8 when arriving here, which allows |
107 | // us to avoid size checks. |
108 | uint16_t state = internal::kUTF8ValidateAccept; |
109 | // Byte 0 |
110 | state = internal::ValidateOneUTF8Byte(*data++, state); |
111 | --size; |
112 | // Byte 1 |
113 | state = internal::ValidateOneUTF8Byte(*data++, state); |
114 | --size; |
115 | // Byte 2 |
116 | state = internal::ValidateOneUTF8Byte(*data++, state); |
117 | --size; |
118 | // Byte 3 |
119 | state = internal::ValidateOneUTF8Byte(*data++, state); |
120 | --size; |
121 | // Byte 4 |
122 | state = internal::ValidateOneUTF8Byte(*data++, state); |
123 | --size; |
124 | if (state == internal::kUTF8ValidateAccept) { |
125 | continue; // Got full char, switch back to ASCII detection |
126 | } |
127 | // Byte 5 |
128 | state = internal::ValidateOneUTF8Byte(*data++, state); |
129 | --size; |
130 | if (state == internal::kUTF8ValidateAccept) { |
131 | continue; // Got full char, switch back to ASCII detection |
132 | } |
133 | // Byte 6 |
134 | state = internal::ValidateOneUTF8Byte(*data++, state); |
135 | --size; |
136 | if (state == internal::kUTF8ValidateAccept) { |
137 | continue; // Got full char, switch back to ASCII detection |
138 | } |
139 | // Byte 7 |
140 | state = internal::ValidateOneUTF8Byte(*data++, state); |
141 | --size; |
142 | if (state == internal::kUTF8ValidateAccept) { |
143 | continue; // Got full char, switch back to ASCII detection |
144 | } |
145 | // kUTF8ValidateAccept not reached along 4 transitions has to mean a rejection |
146 | assert(state == internal::kUTF8ValidateReject); |
147 | return false; |
148 | } |
149 | |
150 | // Validate string tail one byte at a time |
151 | // Note the state table is designed so that, once in the reject state, |
152 | // we remain in that state until the end. So we needn't check for |
153 | // rejection at each char (we don't gain much by short-circuiting here). |
154 | uint16_t state = internal::kUTF8ValidateAccept; |
155 | while (size-- > 0) { |
156 | state = internal::ValidateOneUTF8Byte(*data++, state); |
157 | } |
158 | return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept); |
159 | } |
160 | |
161 | inline bool ValidateUTF8(const util::string_view& str) { |
162 | const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data()); |
163 | const size_t length = str.size(); |
164 | |
165 | return ValidateUTF8(data, length); |
166 | } |
167 | |
168 | } // namespace util |
169 | } // namespace arrow |
170 | |
171 | #endif |
172 | |