ascii.cc source code [Abseil/strings/ascii.cc]

1	// Copyright 2017 The Abseil Authors.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// https://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	#include "absl/strings/ascii.h"
16
17	namespace absl {
18	namespace ascii_internal {
19
20	// # Table generated by this Python code (bit 0x02 is currently unused):
21	// TODO(mbar) Move Python code for generation of table to BUILD and link here.
22
// NOTE: The kPropertyBits table used within this code was generated by
// Python 2 code of the following form. (Bit 0x02 is currently unused and
// available.)
//
// def Hex2(n):
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
// def IsPunct(ch):
//   return (ord(ch) >= 32 and ord(ch) < 127 and
//           not ch.isspace() and not ch.isalnum())
// def IsBlank(ch):
//   return ch in ' \t'
// def IsCntrl(ch):
//   return ord(ch) < 32 or ord(ch) == 127
// def IsXDigit(ch):
//   return ch.isdigit() or ch.lower() in 'abcdef'
// for i in range(128):
//   ch = chr(i)
//   mask = ((ch.isalpha() and 0x01 or 0) |
//           (ch.isalnum() and 0x04 or 0) |
//           (ch.isspace() and 0x08 or 0) |
//           (IsPunct(ch) and 0x10 or 0) |
//           (IsBlank(ch) and 0x20 or 0) |
//           (IsCntrl(ch) and 0x40 or 0) |
//           (IsXDigit(ch) and 0x80 or 0))
//   print Hex2(mask) + ',',
//   if i % 16 == 7:
//     print ' //', Hex2(i & 0x78)
//   elif i % 16 == 15:
//     print
52
53	// clang-format off
// Array of bitfields holding character information, indexed by character code
// (0-255). Each bit value corresponds to a particular character feature. For
// readability, and because the value of these bits is tightly coupled to this
// implementation, the individual bits are not named in code. The values in
// the table use the following assignment:
//
//   0x01  alphabetic            0x10  punctuation
//   0x02  (unused)              0x20  blank (space or tab)
//   0x04  alphanumeric          0x40  control character
//   0x08  whitespace            0x80  hexadecimal digit
//
// Only the first 128 entries are listed explicitly; the bitfields for all
// characters above ASCII 127 are omitted from the initializer and are
// therefore zero-initialized.
const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
};
77
// Array of characters for the ascii_tolower() function, indexed by character
// code (0-255). For values 'A' through 'Z', the entry is the corresponding
// lower-case character; for every other value, the entry is the identity of
// the index (including all values above ASCII 127).
const char kToLower[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
       'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
       'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
       'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
115
// Array of characters for the ascii_toupper() function, indexed by character
// code (0-255). For values 'a' through 'z', the entry is the corresponding
// upper-case character; for every other value, the entry is the identity of
// the index (including all values above ASCII 127).
const char kToUpper[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
    '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
    '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
    '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
    '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
    '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
       'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
       'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
       'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
153	// clang-format on
154
155	} // namespace ascii_internal
156
157	void AsciiStrToLower(std::string* s) {
158	for (auto& ch : *s) {
159	ch = absl::ascii_tolower(ch);
160	}
161	}
162
163	void AsciiStrToUpper(std::string* s) {
164	for (auto& ch : *s) {
165	ch = absl::ascii_toupper(ch);
166	}
167	}
168
169	void RemoveExtraAsciiWhitespace(std::string* str) {
170	auto stripped = StripAsciiWhitespace(*str);
171
172	if (stripped.empty()) {
173	str->clear();
174	return;
175	}
176
177	auto input_it = stripped.begin();
178	auto input_end = stripped.end();
179	auto output_it = &(*str)[`0`];
180	bool is_ws = false;
181
182	for (; input_it < input_end; ++input_it) {
183	if (is_ws) {
184	// Consecutive whitespace? Keep only the last.
185	is_ws = absl::ascii_isspace(*input_it);
186	if (is_ws) --output_it;
187	} else {
188	is_ws = absl::ascii_isspace(*input_it);
189	}
190
191	output_it = input_it;
192	++output_it;
193	}
194
195	str->erase(output_it - &(*str)[`0`]);
196	}
197
198	} // namespace absl
199

Browse the source code of Abseil/strings/ascii.cc