1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef ARROW_UTIL_UTF8_H
19#define ARROW_UTIL_UTF8_H
20
21#include <cassert>
22#include <cstdint>
23#include <cstring>
24#include <memory>
25
26#include "arrow/util/macros.h"
27#include "arrow/util/string_view.h"
28#include "arrow/util/visibility.h"
29
30namespace arrow {
31namespace util {
32
33namespace internal {
34
35// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
36// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
37
38// A compact state table allowing UTF8 decoding using two dependent
39// lookups per byte. The first lookup determines the character class
40// and the second lookup reads the next state.
41// In this table states are multiples of 12.
42ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12];
43
44// Success / reject states when looked up in the small table
45static constexpr uint8_t kUTF8DecodeAccept = 0;
46static constexpr uint8_t kUTF8DecodeReject = 12;
47
48// An expanded state table allowing transitions using a single lookup
49// at the expense of a larger memory footprint (but on non-random data,
50// not all the table will end up accessed and cached).
51// In this table states are multiples of 256.
52ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
53
54// Success / reject states when looked up in the large table
55static constexpr uint16_t kUTF8ValidateAccept = 0;
56static constexpr uint16_t kUTF8ValidateReject = 256;
57
58static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) {
59 uint8_t type = utf8_small_table[byte];
60
61 *codep = (state != kUTF8DecodeAccept) ? (byte & 0x3fu) | (*codep << 6)
62 : (0xff >> type) & (byte);
63
64 state = utf8_small_table[256 + state + type];
65 return state;
66}
67
68static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) {
69 return utf8_large_table[state + byte];
70}
71
72#ifndef NDEBUG
73ARROW_EXPORT void CheckUTF8Initialized();
74#endif
75
76} // namespace internal
77
78// This function needs to be called before doing UTF8 validation.
79ARROW_EXPORT void InitializeUTF8();
80
81inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
82 static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
83 // For some reason, defining this variable outside the loop helps clang
84 uint64_t mask;
85
86#ifndef NDEBUG
87 internal::CheckUTF8Initialized();
88#endif
89
90 while (size >= 8) {
91 // XXX This is doing an unaligned access. Contemporary architectures
92 // (x86-64, AArch64, PPC64) support it natively and often have good
93 // performance nevertheless.
94 memcpy(&mask, data, 8);
95 if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) {
96 // 8 bytes of pure ASCII, move forward
97 size -= 8;
98 data += 8;
99 continue;
100 }
101 // Non-ASCII run detected.
102 // We process at least 4 bytes, to avoid too many spurious 64-bit reads
103 // in case the non-ASCII bytes are at the end of the tested 64-bit word.
104 // We also only check for rejection at the end since that state is stable
105 // (once in reject state, we always remain in reject state).
106 // It is guaranteed that size >= 8 when arriving here, which allows
107 // us to avoid size checks.
108 uint16_t state = internal::kUTF8ValidateAccept;
109 // Byte 0
110 state = internal::ValidateOneUTF8Byte(*data++, state);
111 --size;
112 // Byte 1
113 state = internal::ValidateOneUTF8Byte(*data++, state);
114 --size;
115 // Byte 2
116 state = internal::ValidateOneUTF8Byte(*data++, state);
117 --size;
118 // Byte 3
119 state = internal::ValidateOneUTF8Byte(*data++, state);
120 --size;
121 // Byte 4
122 state = internal::ValidateOneUTF8Byte(*data++, state);
123 --size;
124 if (state == internal::kUTF8ValidateAccept) {
125 continue; // Got full char, switch back to ASCII detection
126 }
127 // Byte 5
128 state = internal::ValidateOneUTF8Byte(*data++, state);
129 --size;
130 if (state == internal::kUTF8ValidateAccept) {
131 continue; // Got full char, switch back to ASCII detection
132 }
133 // Byte 6
134 state = internal::ValidateOneUTF8Byte(*data++, state);
135 --size;
136 if (state == internal::kUTF8ValidateAccept) {
137 continue; // Got full char, switch back to ASCII detection
138 }
139 // Byte 7
140 state = internal::ValidateOneUTF8Byte(*data++, state);
141 --size;
142 if (state == internal::kUTF8ValidateAccept) {
143 continue; // Got full char, switch back to ASCII detection
144 }
145 // kUTF8ValidateAccept not reached along 4 transitions has to mean a rejection
146 assert(state == internal::kUTF8ValidateReject);
147 return false;
148 }
149
150 // Validate string tail one byte at a time
151 // Note the state table is designed so that, once in the reject state,
152 // we remain in that state until the end. So we needn't check for
153 // rejection at each char (we don't gain much by short-circuiting here).
154 uint16_t state = internal::kUTF8ValidateAccept;
155 while (size-- > 0) {
156 state = internal::ValidateOneUTF8Byte(*data++, state);
157 }
158 return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
159}
160
161inline bool ValidateUTF8(const util::string_view& str) {
162 const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data());
163 const size_t length = str.size();
164
165 return ValidateUTF8(data, length);
166}
167
168} // namespace util
169} // namespace arrow
170
171#endif
172