utf8.h source code [arrow/arrow/util/utf8.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#ifndef ARROW_UTIL_UTF8_H
19	#define ARROW_UTIL_UTF8_H
20
21	#include <cassert>
22	#include <cstdint>
23	#include <cstring>
24	#include <memory>
25
26	#include "arrow/util/macros.h"
27	#include "arrow/util/string_view.h"
28	#include "arrow/util/visibility.h"
29
30	namespace arrow {
31	namespace util {
32
33	namespace internal {
34
35	// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
36	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
37
38	// A compact state table allowing UTF8 decoding using two dependent
39	// lookups per byte. The first lookup determines the character class
40	// and the second lookup reads the next state.
41	// In this table states are multiples of 12.
42	ARROW_EXPORT extern const uint8_t utf8_small_table[`256` + `9` * `12`];
43
44	// Success / reject states when looked up in the small table
45	static constexpr uint8_t kUTF8DecodeAccept = `0`;
46	static constexpr uint8_t kUTF8DecodeReject = `12`;
47
48	// An expanded state table allowing transitions using a single lookup
49	// at the expense of a larger memory footprint (but on non-random data,
50	// not all the table will end up accessed and cached).
51	// In this table states are multiples of 256.
52	ARROW_EXPORT extern uint16_t utf8_large_table[`9` * `256`];
53
54	// Success / reject states when looked up in the large table
55	static constexpr uint16_t kUTF8ValidateAccept = `0`;
56	static constexpr uint16_t kUTF8ValidateReject = `256`;
57
58	static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) {
59	uint8_t type = utf8_small_table[byte];
60
61	codep = (state != kUTF8DecodeAccept) ? (byte & `0x3fu`) \| (codep << `6`)
62	: (`0xff` >> type) & (byte);
63
64	state = utf8_small_table[`256` + state + type];
65	return state;
66	}
67
68	static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) {
69	return utf8_large_table[state + byte];
70	}
71
72	#ifndef NDEBUG
73	ARROW_EXPORT void CheckUTF8Initialized();
74	#endif
75
76	} // namespace internal
77
78	// This function needs to be called before doing UTF8 validation.
79	ARROW_EXPORT void InitializeUTF8();
80
81	inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
82	static constexpr uint64_t high_bits_64 = `0x8080808080808080ULL`;
83	// For some reason, defining this variable outside the loop helps clang
84	uint64_t mask;
85
86	#ifndef NDEBUG
87	internal::CheckUTF8Initialized();
88	#endif
89
90	while (size >= `8`) {
91	// XXX This is doing an unaligned access. Contemporary architectures
92	// (x86-64, AArch64, PPC64) support it natively and often have good
93	// performance nevertheless.
94	memcpy(&mask, data, `8`);
95	if (ARROW_PREDICT_TRUE((mask & high_bits_64) == `0`)) {
96	// 8 bytes of pure ASCII, move forward
97	size -= `8`;
98	data += `8`;
99	continue;
100	}
101	// Non-ASCII run detected.
102	// We process at least 4 bytes, to avoid too many spurious 64-bit reads
103	// in case the non-ASCII bytes are at the end of the tested 64-bit word.
104	// We also only check for rejection at the end since that state is stable
105	// (once in reject state, we always remain in reject state).
106	// It is guaranteed that size >= 8 when arriving here, which allows
107	// us to avoid size checks.
108	uint16_t state = internal::kUTF8ValidateAccept;
109	// Byte 0
110	state = internal::ValidateOneUTF8Byte(*data++, state);
111	--size;
112	// Byte 1
113	state = internal::ValidateOneUTF8Byte(*data++, state);
114	--size;
115	// Byte 2
116	state = internal::ValidateOneUTF8Byte(*data++, state);
117	--size;
118	// Byte 3
119	state = internal::ValidateOneUTF8Byte(*data++, state);
120	--size;
121	// Byte 4
122	state = internal::ValidateOneUTF8Byte(*data++, state);
123	--size;
124	if (state == internal::kUTF8ValidateAccept) {
125	continue; // Got full char, switch back to ASCII detection
126	}
127	// Byte 5
128	state = internal::ValidateOneUTF8Byte(*data++, state);
129	--size;
130	if (state == internal::kUTF8ValidateAccept) {
131	continue; // Got full char, switch back to ASCII detection
132	}
133	// Byte 6
134	state = internal::ValidateOneUTF8Byte(*data++, state);
135	--size;
136	if (state == internal::kUTF8ValidateAccept) {
137	continue; // Got full char, switch back to ASCII detection
138	}
139	// Byte 7
140	state = internal::ValidateOneUTF8Byte(*data++, state);
141	--size;
142	if (state == internal::kUTF8ValidateAccept) {
143	continue; // Got full char, switch back to ASCII detection
144	}
145	// kUTF8ValidateAccept not reached along 4 transitions has to mean a rejection
146	assert(state == internal::kUTF8ValidateReject);
147	return false;
148	}
149
150	// Validate string tail one byte at a time
151	// Note the state table is designed so that, once in the reject state,
152	// we remain in that state until the end. So we needn't check for
153	// rejection at each char (we don't gain much by short-circuiting here).
154	uint16_t state = internal::kUTF8ValidateAccept;
155	while (size-- > `0`) {
156	state = internal::ValidateOneUTF8Byte(*data++, state);
157	}
158	return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
159	}
160
161	inline bool ValidateUTF8(const util::string_view& str) {
162	const uint8_t* data = reinterpret_cast<const uint8_t*>(str.data());
163	const size_t length = str.size();
164
165	return ValidateUTF8(data, length);
166	}
167
168	} // namespace util
169	} // namespace arrow
170
171	#endif
172

Browse the source code of arrow/arrow/util/utf8.h