1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <mutex> |
19 | |
20 | #include "arrow/util/logging.h" |
21 | #include "arrow/util/utf8.h" |
22 | |
23 | namespace arrow { |
24 | namespace util { |
25 | namespace internal { |
26 | |
27 | // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
28 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
29 | |
30 | // clang-format off |
// Compact UTF-8 DFA table (see the Hoehrmann credit above).
// Layout: 256 entries mapping each byte value to a character class (0..11),
// followed by 9 rows of 12 entries mapping (state, character class) to the
// next state. States are stored pre-multiplied by 12 so that a state value
// can be used directly as a row offset into the transition part.
const uint8_t utf8_small_table[] = { // NOLINT
  // The first part of the table maps bytes to character classes in order
  // to reduce the size of the transition table and create bitmasks.
  // Each row below covers 32 consecutive byte values (0x00-0x1f, 0x20-0x3f, ...).
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // Character classes are between 0 and 11, states are multiples of 12.
  // Each full line below holds two 12-entry rows (two consecutive states).
  // The row for state 12 maps every class back to 12, i.e. it is never left
  // (presumably the "invalid input" state -- confirm against the decoder).
  0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT
  12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT
};
52 | // clang-format on |
53 | |
// Expanded transition table with one entry per (state, byte value) pair:
// 9 states x 256 byte values. It is filled in by InitializeLargeTable().
// Only element 0 is explicitly initialized (to 0xffff); the remaining
// elements are zero-initialized. InitializeLargeTable() overwrites element 0
// with 0, which is the condition CheckUTF8Initialized() tests for.
uint16_t utf8_large_table[9 * 256] = {0xffff};
55 | |
56 | static void InitializeLargeTable() { |
57 | for (uint32_t state = 0; state < 9; ++state) { |
58 | for (uint32_t byte = 0; byte < 256; ++byte) { |
59 | uint32_t byte_class = utf8_small_table[byte]; |
60 | uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12; |
61 | DCHECK_LT(next_state, 9); |
62 | utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256); |
63 | } |
64 | } |
65 | } |
66 | |
67 | #ifndef NDEBUG |
// Sanity check compiled only when NDEBUG is not defined: trips a DCHECK if
// element 0 of utf8_large_table is not 0, which is the case as long as the
// table still holds its static initializer (0xffff) because
// InitializeLargeTable() has not run yet.
ARROW_EXPORT void CheckUTF8Initialized() {
  DCHECK_EQ(utf8_large_table[0], 0)
      << "InitializeUTF8() must be called before calling UTF8 routines";
}
72 | #endif |
73 | |
74 | } // namespace internal |
75 | |
76 | static std::once_flag utf8_initialized; |
77 | |
78 | void InitializeUTF8() { |
79 | std::call_once(utf8_initialized, internal::InitializeLargeTable); |
80 | } |
81 | |
82 | } // namespace util |
83 | } // namespace arrow |
84 | |