gconv_charset.c source code [Glibc/iconv/gconv_charset.c]

/* Charset name normalization.
   Copyright (C) 2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
18
19
20	#include <stdlib.h>
21	#include <ctype.h>
22	#include <locale.h>
23	#include <stdbool.h>
24	#include <string.h>
25	#include <sys/stat.h>
26	#include "gconv_int.h"
27	#include "gconv_charset.h"
28
29
/* Return a pointer to the last suffix in the conversion code string S, or
   NULL if S does not contain one.  A suffix is a '/' or ',' followed by
   arbitrary text that contains neither '/' nor ','.  The string is not
   edited in any way.  The caller is expected to parse the suffix and remove
   it (by e.g. truncating the string at the returned position) before the
   next call.  */
static char *
find_suffix (char *s)
{
  /* The conversion code is in the form of a triplet, separated by '/' chars.
     The third component of the triplet contains suffixes.  If we don't have
     two slashes, we don't have a suffix.  */
  size_t slash_count = 0;
  char *suffix_term = NULL;

  /* Walk the string with a pointer instead of an int index so that
     arbitrarily long inputs cannot overflow a signed counter.  */
  for (char *p = s; *p != '\0'; p++)
    switch (*p)
      {
      case '/':
        slash_count++;
        /* Fallthrough */
      case ',':
        /* Remember the most recent separator; the last one seen is where
           the final suffix starts.  */
        suffix_term = p;
        break;
      default:
        break;
      }

  if (slash_count >= 2)
    return suffix_term;

  return NULL;
}
60
61
/* Working state used while parsing a single iconv_open conversion code.  */
struct gconv_parsed_code
{
  /* Writable conversion code string; gconv_parse_code truncates it in
     place as suffixes are removed.  */
  char *code;
  /* Set to true when a TRANSLIT suffix was found in CODE.  */
  bool translit;
  /* Set to true when an IGNORE suffix was found in CODE.  */
  bool ignore;
};
68
69
70	/ This function parses an iconv_open encoding PC.CODE, strips any suffixes*
71	(such as TRANSLIT or IGNORE) from it and sets corresponding flags in it. /*
72	static void
73	gconv_parse_code (struct gconv_parsed_code *pc)
74	{
75	pc->translit = false;
76	pc->ignore = false;
77
78	while (`1`)
79	{
80	/ First drop any trailing whitespaces and separators. /
81	size_t len = strlen (pc->code);
82	while ((len > `0`)
83	&& (isspace (pc->code[len - `1`])
84	\|\| pc->code[len - `1`] == `','`
85	\|\| pc->code[len - `1`] == `'/'`))
86	len--;
87
88	pc->code[len] = `'\0'`;
89
90	if (len == `0`)
91	return;
92
93	char * suffix = find_suffix (pc->code);
94	if (suffix == NULL)
95	{
96	/ At this point, we have processed and removed all suffixes from the*
97	code and what remains of the code is suffix free. /*
98	return;
99	}
100	else
101	{
102	/ A suffix is processed from the end of the code array going*
103	backwards, one suffix at a time. The suffix is an index into the
104	code character array and points to: one past the end of the code
105	and any unprocessed suffixes, and to the beginning of the suffix
106	currently being processed during this iteration. We must process
107	this suffix and then drop it from the code by terminating the
108	preceding text with NULL.
109
110	We want to allow and recognize suffixes such as:
111
112	"/TRANSLIT" i.e. single suffix
113	"//TRANSLIT" i.e. single suffix and multiple separators
114	"//TRANSLIT/IGNORE" i.e. suffixes separated by "/"
115	"/TRANSLIT//IGNORE" i.e. suffixes separated by "//"
116	"//IGNORE,TRANSLIT" i.e. suffixes separated by ","
117	"//IGNORE," i.e. trailing ","
118	"//TRANSLIT/" i.e. trailing "/"
119	"//TRANSLIT//" i.e. trailing "//"
120	"/" i.e. empty suffix.
121
122	Unknown suffixes are silently discarded and ignored. /*
123
124	if ((__strcasecmp_l (suffix,
125	GCONV_TRIPLE_SEPARATOR
126	GCONV_TRANSLIT_SUFFIX,
127	_nl_C_locobj_ptr) == `0`)
128	\|\| (__strcasecmp_l (suffix,
129	GCONV_SUFFIX_SEPARATOR
130	GCONV_TRANSLIT_SUFFIX,
131	_nl_C_locobj_ptr) == `0`))
132	pc->translit = true;
133
134	if ((__strcasecmp_l (suffix,
135	GCONV_TRIPLE_SEPARATOR
136	GCONV_IGNORE_ERRORS_SUFFIX,
137	_nl_C_locobj_ptr) == `0`)
138	\|\| (__strcasecmp_l (suffix,
139	GCONV_SUFFIX_SEPARATOR
140	GCONV_IGNORE_ERRORS_SUFFIX,
141	_nl_C_locobj_ptr) == `0`))
142	pc->ignore = true;
143
144	/ We just processed this suffix. We can now drop it from the*
145	code string by truncating it at the suffix's position. /*
146	suffix[`0`] = `'\0'`;
147	}
148	}
149	}
150
151
152	/ This function accepts the charset names of the source and destination of the*
153	conversion and populates conv_spec with an equivalent conversion*
154	specification that may later be used by __gconv_open. The charset names
155	might contain options in the form of suffixes that alter the conversion,
156	e.g. "ISO-10646/UTF-8/TRANSLIT". It processes the charset names, ignoring
157	and truncating any suffix options in fromcode, and processing and truncating
158	any suffix options in tocode. Supported suffix options ("TRANSLIT" or
159	"IGNORE") when found in tocode lead to the corresponding flag in conv_spec*
160	to be set to true. Unrecognized suffix options are silently discarded. If
161	the function succeeds, it returns conv_spec back to the caller. It returns
162	NULL upon failure. conv_spec must be allocated and freed by the caller. /*
163	struct gconv_spec *
164	__gconv_create_spec (struct gconv_spec conv_spec, const* char *fromcode,
165	const char *tocode)
166	{
167	struct gconv_parsed_code pfc, ptc;
168	struct gconv_spec *ret = NULL;
169
170	pfc.code = __strdup (fromcode);
171	ptc.code = __strdup (tocode);
172
173	if ((pfc.code == NULL)
174	\|\| (ptc.code == NULL))
175	goto out;
176
177	gconv_parse_code (&pfc);
178	gconv_parse_code (&ptc);
179
180	/ We ignore suffixes in the fromcode because that is how the current*
181	implementation has always handled them. Only suffixes in the tocode are
182	processed and handled. The reality is that invalid input in the input
183	character set should only be ignored if the fromcode specifies IGNORE.
184	The current implementation ignores invalid intput in the input character
185	set if the tocode contains IGNORE. We preserve this behavior for
186	backwards compatibility. In the future we may split the handling of
187	IGNORE to allow a finer grained specification of ignorning invalid input
188	and/or ignoring invalid output. /*
189	conv_spec->translit = ptc.translit;
190	conv_spec->ignore = ptc.ignore;
191
192	/ 3 extra bytes because 1 extra for '\0', and 2 extra so strip might*
193	be able to add one or two trailing '/' characters if necessary. /*
194	conv_spec->fromcode = malloc (strlen (fromcode) + `3`);
195	if (conv_spec->fromcode == NULL)
196	goto out;
197
198	conv_spec->tocode = malloc (strlen (tocode) + `3`);
199	if (conv_spec->tocode == NULL)
200	{
201	free (conv_spec->fromcode);
202	conv_spec->fromcode = NULL;
203	goto out;
204	}
205
206	/ Strip unrecognized characters and ensure that the code has two '/'*
207	characters as per conversion code triplet specification. /*
208	strip (conv_spec->fromcode, pfc.code);
209	strip (conv_spec->tocode, ptc.code);
210	ret = conv_spec;
211
212	out:
213	free (pfc.code);
214	free (ptc.code);
215
216	return ret;
217	}
218	libc_hidden_def (__gconv_create_spec)
219

Browse the source code of Glibc/iconv/gconv_charset.c