1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <mutex>
19
20#include "arrow/util/logging.h"
21#include "arrow/util/utf8.h"
22
23namespace arrow {
24namespace util {
25namespace internal {
26
27// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
28// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
29
30// clang-format off
// Compact UTF-8 validation DFA (after Bjoern Hoehrmann's decoder, see the
// attribution above), stored as one flat array of 256 + 9 * 12 = 364 entries:
//   [0, 256)    byte value -> character class (0..11)
//   [256, 364)  transition rows, 12 entries per state; the row for state
//               index s starts at 256 + s * 12 and is indexed by character
//               class. Each entry is the next state, stored as a multiple
//               of 12.
// InitializeLargeTable() reads this table to fill utf8_large_table.
31const uint8_t utf8_small_table[] = { // NOLINT
32 // The first part of the table maps bytes to character classes,
33 // to reduce the size of the transition table and create bitmasks.
   // Each row below covers 32 consecutive byte values (8 rows x 32 = 256).
34 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
35 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
36 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
37 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // NOLINT
38 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // NOLINT
39 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // NOLINT
40 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // NOLINT
41 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // NOLINT
42
43 // The second part is a transition table that maps a combination
44 // of a state of the automaton and a character class to a state.
45 // Character classes are between 0 and 11, states are multiples of 12.
   // Each full row below holds the transitions of two states (2 x 12 entries);
   // the final row holds the ninth state.
46 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // NOLINT
47 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // NOLINT
48 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // NOLINT
49 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // NOLINT
50 12,36,12,12,12,12,12,12,12,12,12,12, // NOLINT
51};
52// clang-format on
53
// Expanded form of the DFA: 9 states x 256 byte values, indexed as
// state * 256 + byte and filled in by InitializeLargeTable().
// Only element 0 is given an explicit initializer (0xffff); the remaining
// elements are zero-initialized. CheckUTF8Initialized() tests element 0
// against 0 to detect that the table has not been filled in yet.
54uint16_t utf8_large_table[9 * 256] = {0xffff};
55
56static void InitializeLargeTable() {
57 for (uint32_t state = 0; state < 9; ++state) {
58 for (uint32_t byte = 0; byte < 256; ++byte) {
59 uint32_t byte_class = utf8_small_table[byte];
60 uint8_t next_state = utf8_small_table[256 + state * 12 + byte_class] / 12;
61 DCHECK_LT(next_state, 9);
62 utf8_large_table[state * 256 + byte] = static_cast<uint16_t>(next_state * 256);
63 }
64 }
65}
66
67#ifndef NDEBUG
// Sanity check that the large UTF-8 table has been built. This definition
// sits inside an `#ifndef NDEBUG` region, so it is only compiled when NDEBUG
// is not defined.
// utf8_large_table is defined with 0xffff in element 0, so the check below
// fails (with the attached message) until that element has been overwritten
// with 0 by the table initialization.
68ARROW_EXPORT void CheckUTF8Initialized() {
69 DCHECK_EQ(utf8_large_table[0], 0)
70 << "InitializeUTF8() must be called before calling UTF8 routines";
71}
72#endif
73
74} // namespace internal
75
76static std::once_flag utf8_initialized;
77
78void InitializeUTF8() {
79 std::call_once(utf8_initialized, internal::InitializeLargeTable);
80}
81
82} // namespace util
83} // namespace arrow
84