1// Copyright 2017 The Abseil Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "absl/strings/ascii.h"
16
17namespace absl {
18namespace ascii_internal {
19
// The table below was generated by the Python code that follows (bit 0x02 is
// currently unused).
// TODO(mbar): Move the Python table-generation code to BUILD and link to it
// from here.
22
// NOTE: The kPropertyBits table defined in this file was generated by
// Python 2 code of the following form. (Bit 0x02 is currently unused and
// available.)
26//
//   def Hex2(n):
//     return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
//   def IsPunct(ch):
//     return (ord(ch) >= 32 and ord(ch) < 127 and
//             not ch.isspace() and not ch.isalnum())
//   def IsBlank(ch):
//     return ch in ' \t'
//   def IsCntrl(ch):
//     return ord(ch) < 32 or ord(ch) == 127
//   def IsXDigit(ch):
//     return ch.isdigit() or ch.lower() in 'abcdef'
//   for i in range(128):
//     ch = chr(i)
//     mask = ((ch.isalpha() and 0x01 or 0) |
//             (ch.isalnum() and 0x04 or 0) |
//             (ch.isspace() and 0x08 or 0) |
//             (IsPunct(ch) and 0x10 or 0) |
//             (IsBlank(ch) and 0x20 or 0) |
//             (IsCntrl(ch) and 0x40 or 0) |
//             (IsXDigit(ch) and 0x80 or 0))
//     print Hex2(mask) + ',',
//     if i % 16 == 7:
//       print ' //', Hex2(i & 0x78)
//     elif i % 16 == 15:
//       print
52
53// clang-format off
// Per-character property bitmasks, indexed by unsigned character value.
//
// The bits are intentionally left unnamed because their values are tightly
// coupled to this implementation. As produced by the generator script, they
// mean:
//   0x01 alphabetic     0x04 alphanumeric   0x08 whitespace
//   0x10 punctuation    0x20 blank          0x40 control
//   0x80 hexadecimal digit                  (0x02 is unused)
//
// Only the first 128 entries are listed; the entries for all characters above
// ASCII 127 are zero-initialized.
const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00 - 0x07
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,  // 0x08 - 0x0f
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10 - 0x17
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x18 - 0x1f
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20 - 0x27
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x28 - 0x2f
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30 - 0x37
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x38 - 0x3f
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40 - 0x47
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x48 - 0x4f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50 - 0x57
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x58 - 0x5f
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60 - 0x67
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x68 - 0x6f
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70 - 0x77
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,  // 0x78 - 0x7f
};
77
// Lookup table backing ascii_tolower(), indexed by unsigned character value.
// Entries 0x41..0x5a ('A'..'Z') hold the matching lower-case letter; every
// other entry holds the character whose value equals its own index.
const char kToLower[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',  // 0x00
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',  // 0x08
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',  // 0x10
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',  // 0x18
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',  // 0x20
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',  // 0x28
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',  // 0x30
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',  // 0x38
    '\x40', 'a',    'b',    'c',    'd',    'e',    'f',    'g',     // 0x40
    'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',     // 0x48
    'p',    'q',    'r',    's',    't',    'u',    'v',    'w',     // 0x50
    'x',    'y',    'z',    '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',  // 0x58
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',  // 0x60
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',  // 0x68
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',  // 0x70
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',  // 0x78
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',  // 0x80
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',  // 0x88
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',  // 0x90
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',  // 0x98
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',  // 0xa0
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',  // 0xa8
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',  // 0xb0
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',  // 0xb8
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',  // 0xc0
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',  // 0xc8
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',  // 0xd0
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',  // 0xd8
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',  // 0xe0
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',  // 0xe8
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',  // 0xf0
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',  // 0xf8
};
115
// Lookup table backing ascii_toupper(), indexed by unsigned character value.
// Entries 0x61..0x7a ('a'..'z') hold the matching upper-case letter; every
// other entry holds the character whose value equals its own index.
const char kToUpper[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',  // 0x00
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',  // 0x08
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',  // 0x10
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',  // 0x18
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',  // 0x20
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',  // 0x28
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',  // 0x30
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',  // 0x38
    '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',  // 0x40
    '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',  // 0x48
    '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',  // 0x50
    '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',  // 0x58
    '\x60', 'A',    'B',    'C',    'D',    'E',    'F',    'G',     // 0x60
    'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',     // 0x68
    'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',     // 0x70
    'X',    'Y',    'Z',    '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',  // 0x78
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',  // 0x80
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',  // 0x88
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',  // 0x90
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',  // 0x98
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',  // 0xa0
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',  // 0xa8
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',  // 0xb0
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',  // 0xb8
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',  // 0xc0
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',  // 0xc8
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',  // 0xd0
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',  // 0xd8
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',  // 0xe0
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',  // 0xe8
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',  // 0xf0
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',  // 0xf8
};
153// clang-format on
154
155} // namespace ascii_internal
156
157void AsciiStrToLower(std::string* s) {
158 for (auto& ch : *s) {
159 ch = absl::ascii_tolower(ch);
160 }
161}
162
163void AsciiStrToUpper(std::string* s) {
164 for (auto& ch : *s) {
165 ch = absl::ascii_toupper(ch);
166 }
167}
168
169void RemoveExtraAsciiWhitespace(std::string* str) {
170 auto stripped = StripAsciiWhitespace(*str);
171
172 if (stripped.empty()) {
173 str->clear();
174 return;
175 }
176
177 auto input_it = stripped.begin();
178 auto input_end = stripped.end();
179 auto output_it = &(*str)[0];
180 bool is_ws = false;
181
182 for (; input_it < input_end; ++input_it) {
183 if (is_ws) {
184 // Consecutive whitespace? Keep only the last.
185 is_ws = absl::ascii_isspace(*input_it);
186 if (is_ws) --output_it;
187 } else {
188 is_ws = absl::ascii_isspace(*input_it);
189 }
190
191 *output_it = *input_it;
192 ++output_it;
193 }
194
195 str->erase(output_it - &(*str)[0]);
196}
197
198} // namespace absl
199