xmlstring.c source code [ClickHouse/contrib/libxml2/xmlstring.c]

1	/*
2	* string.c : an XML string utilities module
3	*
4	* This module provides various utility functions for manipulating
5	* the xmlChar* type. All functions named xmlStr* have been moved here
6	* from the parser.c file (their original home).
7	*
8	* See Copyright for the status of this software.
9	*
10	* UTF8 string routines from:
11	* William Brack <wbrack@mmm.com.hk>
12	*
13	* daniel@veillard.com
14	*/
15
16	#define IN_LIBXML
17	#include "libxml.h"
18
19	#include <stdlib.h>
20	#include <string.h>
21	#include <libxml/xmlmemory.h>
22	#include <libxml/parserInternals.h>
23	#include <libxml/xmlstring.h>
24
25	/************************************************************************
26	* *
27	* Commodity functions to handle xmlChars *
28	* *
29	************************************************************************/
30
31	/**
32	* xmlStrndup:
33	* @cur: the input xmlChar *
34	* @len: the len of @cur
35	*
36	* a strndup for array of xmlChar's
37	*
38	* Returns a new xmlChar * or NULL
39	*/
40	xmlChar *
41	xmlStrndup(const xmlChar cur, int* len) {
42	xmlChar *ret;
43
44	if ((cur == NULL) \|\| (len < `0`)) return(NULL);
45	ret = (xmlChar ) xmlMallocAtomic((len + `1`) sizeof(xmlChar));
46	if (ret == NULL) {
47	xmlErrMemory(NULL, NULL);
48	return(NULL);
49	}
50	memcpy(ret, cur, len * sizeof(xmlChar));
51	ret[len] = `0`;
52	return(ret);
53	}
54
55	/**
56	* xmlStrdup:
57	* @cur: the input xmlChar *
58	*
59	* a strdup for array of xmlChar's. Since they are supposed to be
60	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
61	* a termination mark of '0'.
62	*
63	* Returns a new xmlChar * or NULL
64	*/
65	xmlChar *
66	xmlStrdup(const xmlChar *cur) {
67	const xmlChar *p = cur;
68
69	if (cur == NULL) return(NULL);
70	while (p != `0`) p++; /* non input consuming /
71	return(xmlStrndup(cur, p - cur));
72	}
73
74	/**
75	* xmlCharStrndup:
76	* @cur: the input char *
77	* @len: the len of @cur
78	*
79	* a strndup for char's to xmlChar's
80	*
81	* Returns a new xmlChar * or NULL
82	*/
83
84	xmlChar *
85	xmlCharStrndup(const char cur, int* len) {
86	int i;
87	xmlChar *ret;
88
89	if ((cur == NULL) \|\| (len < `0`)) return(NULL);
90	ret = (xmlChar ) xmlMallocAtomic((len + `1`) sizeof(xmlChar));
91	if (ret == NULL) {
92	xmlErrMemory(NULL, NULL);
93	return(NULL);
94	}
95	for (i = `0`;i < len;i++) {
96	ret[i] = (xmlChar) cur[i];
97	if (ret[i] == `0`) return(ret);
98	}
99	ret[len] = `0`;
100	return(ret);
101	}
102
103	/**
104	* xmlCharStrdup:
105	* @cur: the input char *
106	*
107	* a strdup for char's to xmlChar's
108	*
109	* Returns a new xmlChar * or NULL
110	*/
111
112	xmlChar *
113	xmlCharStrdup(const char *cur) {
114	const char *p = cur;
115
116	if (cur == NULL) return(NULL);
117	while (p != `'\0'`) p++; /* non input consuming /
118	return(xmlCharStrndup(cur, p - cur));
119	}
120
121	/**
122	* xmlStrcmp:
123	* @str1: the first xmlChar *
124	* @str2: the second xmlChar *
125	*
126	* a strcmp for xmlChar's
127	*
128	* Returns the integer result of the comparison
129	*/
130
131	int
132	xmlStrcmp(const xmlChar str1, const* xmlChar *str2) {
133	register int tmp;
134
135	if (str1 == str2) return(`0`);
136	if (str1 == NULL) return(-`1`);
137	if (str2 == NULL) return(`1`);
138	do {
139	tmp = str1++ - str2;
140	if (tmp != `0`) return(tmp);
141	} while (*str2++ != `0`);
142	return `0`;
143	}
144
145	/**
146	* xmlStrEqual:
147	* @str1: the first xmlChar *
148	* @str2: the second xmlChar *
149	*
150	* Check if both strings are equal of have same content.
151	* Should be a bit more readable and faster than xmlStrcmp()
152	*
153	* Returns 1 if they are equal, 0 if they are different
154	*/
155
156	int
157	xmlStrEqual(const xmlChar str1, const* xmlChar *str2) {
158	if (str1 == str2) return(`1`);
159	if (str1 == NULL) return(`0`);
160	if (str2 == NULL) return(`0`);
161	do {
162	if (str1++ != str2) return(`0`);
163	} while (*str2++);
164	return(`1`);
165	}
166
167	/**
168	* xmlStrQEqual:
169	* @pref: the prefix of the QName
170	* @name: the localname of the QName
171	* @str: the second xmlChar *
172	*
173	* Check if a QName is Equal to a given string
174	*
175	* Returns 1 if they are equal, 0 if they are different
176	*/
177
178	int
179	xmlStrQEqual(const xmlChar pref, const* xmlChar name, const* xmlChar *str) {
180	if (pref == NULL) return(xmlStrEqual(name, str));
181	if (name == NULL) return(`0`);
182	if (str == NULL) return(`0`);
183
184	do {
185	if (pref++ != str) return(`0`);
186	} while ((str++) && (pref));
187	if (str++ != `':'`) return*(`0`);
188	do {
189	if (name++ != str) return(`0`);
190	} while (*str++);
191	return(`1`);
192	}
193
194	/**
195	* xmlStrncmp:
196	* @str1: the first xmlChar *
197	* @str2: the second xmlChar *
198	* @len: the max comparison length
199	*
200	* a strncmp for xmlChar's
201	*
202	* Returns the integer result of the comparison
203	*/
204
205	int
206	xmlStrncmp(const xmlChar str1, const* xmlChar str2, int* len) {
207	register int tmp;
208
209	if (len <= `0`) return(`0`);
210	if (str1 == str2) return(`0`);
211	if (str1 == NULL) return(-`1`);
212	if (str2 == NULL) return(`1`);
213	#ifdef __GNUC__
214	tmp = strncmp((const char )str1, (const* char *)str2, len);
215	return tmp;
216	#else
217	do {
218	tmp = str1++ - str2;
219	if (tmp != `0` \|\| --len == `0`) return(tmp);
220	} while (*str2++ != `0`);
221	return `0`;
222	#endif
223	}
224
225	static const xmlChar casemap[`256`] = {
226	`0x00`,`0x01`,`0x02`,`0x03`,`0x04`,`0x05`,`0x06`,`0x07`,
227	`0x08`,`0x09`,`0x0A`,`0x0B`,`0x0C`,`0x0D`,`0x0E`,`0x0F`,
228	`0x10`,`0x11`,`0x12`,`0x13`,`0x14`,`0x15`,`0x16`,`0x17`,
229	`0x18`,`0x19`,`0x1A`,`0x1B`,`0x1C`,`0x1D`,`0x1E`,`0x1F`,
230	`0x20`,`0x21`,`0x22`,`0x23`,`0x24`,`0x25`,`0x26`,`0x27`,
231	`0x28`,`0x29`,`0x2A`,`0x2B`,`0x2C`,`0x2D`,`0x2E`,`0x2F`,
232	`0x30`,`0x31`,`0x32`,`0x33`,`0x34`,`0x35`,`0x36`,`0x37`,
233	`0x38`,`0x39`,`0x3A`,`0x3B`,`0x3C`,`0x3D`,`0x3E`,`0x3F`,
234	`0x40`,`0x61`,`0x62`,`0x63`,`0x64`,`0x65`,`0x66`,`0x67`,
235	`0x68`,`0x69`,`0x6A`,`0x6B`,`0x6C`,`0x6D`,`0x6E`,`0x6F`,
236	`0x70`,`0x71`,`0x72`,`0x73`,`0x74`,`0x75`,`0x76`,`0x77`,
237	`0x78`,`0x79`,`0x7A`,`0x7B`,`0x5C`,`0x5D`,`0x5E`,`0x5F`,
238	`0x60`,`0x61`,`0x62`,`0x63`,`0x64`,`0x65`,`0x66`,`0x67`,
239	`0x68`,`0x69`,`0x6A`,`0x6B`,`0x6C`,`0x6D`,`0x6E`,`0x6F`,
240	`0x70`,`0x71`,`0x72`,`0x73`,`0x74`,`0x75`,`0x76`,`0x77`,
241	`0x78`,`0x79`,`0x7A`,`0x7B`,`0x7C`,`0x7D`,`0x7E`,`0x7F`,
242	`0x80`,`0x81`,`0x82`,`0x83`,`0x84`,`0x85`,`0x86`,`0x87`,
243	`0x88`,`0x89`,`0x8A`,`0x8B`,`0x8C`,`0x8D`,`0x8E`,`0x8F`,
244	`0x90`,`0x91`,`0x92`,`0x93`,`0x94`,`0x95`,`0x96`,`0x97`,
245	`0x98`,`0x99`,`0x9A`,`0x9B`,`0x9C`,`0x9D`,`0x9E`,`0x9F`,
246	`0xA0`,`0xA1`,`0xA2`,`0xA3`,`0xA4`,`0xA5`,`0xA6`,`0xA7`,
247	`0xA8`,`0xA9`,`0xAA`,`0xAB`,`0xAC`,`0xAD`,`0xAE`,`0xAF`,
248	`0xB0`,`0xB1`,`0xB2`,`0xB3`,`0xB4`,`0xB5`,`0xB6`,`0xB7`,
249	`0xB8`,`0xB9`,`0xBA`,`0xBB`,`0xBC`,`0xBD`,`0xBE`,`0xBF`,
250	`0xC0`,`0xC1`,`0xC2`,`0xC3`,`0xC4`,`0xC5`,`0xC6`,`0xC7`,
251	`0xC8`,`0xC9`,`0xCA`,`0xCB`,`0xCC`,`0xCD`,`0xCE`,`0xCF`,
252	`0xD0`,`0xD1`,`0xD2`,`0xD3`,`0xD4`,`0xD5`,`0xD6`,`0xD7`,
253	`0xD8`,`0xD9`,`0xDA`,`0xDB`,`0xDC`,`0xDD`,`0xDE`,`0xDF`,
254	`0xE0`,`0xE1`,`0xE2`,`0xE3`,`0xE4`,`0xE5`,`0xE6`,`0xE7`,
255	`0xE8`,`0xE9`,`0xEA`,`0xEB`,`0xEC`,`0xED`,`0xEE`,`0xEF`,
256	`0xF0`,`0xF1`,`0xF2`,`0xF3`,`0xF4`,`0xF5`,`0xF6`,`0xF7`,
257	`0xF8`,`0xF9`,`0xFA`,`0xFB`,`0xFC`,`0xFD`,`0xFE`,`0xFF`
258	};
259
260	/**
261	* xmlStrcasecmp:
262	* @str1: the first xmlChar *
263	* @str2: the second xmlChar *
264	*
265	* a strcasecmp for xmlChar's
266	*
267	* Returns the integer result of the comparison
268	*/
269
270	int
271	xmlStrcasecmp(const xmlChar str1, const* xmlChar *str2) {
272	register int tmp;
273
274	if (str1 == str2) return(`0`);
275	if (str1 == NULL) return(-`1`);
276	if (str2 == NULL) return(`1`);
277	do {
278	tmp = casemap[str1++] - casemap[str2];
279	if (tmp != `0`) return(tmp);
280	} while (*str2++ != `0`);
281	return `0`;
282	}
283
284	/**
285	* xmlStrncasecmp:
286	* @str1: the first xmlChar *
287	* @str2: the second xmlChar *
288	* @len: the max comparison length
289	*
290	* a strncasecmp for xmlChar's
291	*
292	* Returns the integer result of the comparison
293	*/
294
295	int
296	xmlStrncasecmp(const xmlChar str1, const* xmlChar str2, int* len) {
297	register int tmp;
298
299	if (len <= `0`) return(`0`);
300	if (str1 == str2) return(`0`);
301	if (str1 == NULL) return(-`1`);
302	if (str2 == NULL) return(`1`);
303	do {
304	tmp = casemap[str1++] - casemap[str2];
305	if (tmp != `0` \|\| --len == `0`) return(tmp);
306	} while (*str2++ != `0`);
307	return `0`;
308	}
309
310	/**
311	* xmlStrchr:
312	* @str: the xmlChar * array
313	* @val: the xmlChar to search
314	*
315	* a strchr for xmlChar's
316	*
317	* Returns the xmlChar * for the first occurrence or NULL.
318	*/
319
320	const xmlChar *
321	xmlStrchr(const xmlChar *str, xmlChar val) {
322	if (str == NULL) return(NULL);
323	while (str != `0`) { /* non input consuming /
324	if (str == val) return((xmlChar ) str);
325	str++;
326	}
327	return(NULL);
328	}
329
330	/**
331	* xmlStrstr:
332	* @str: the xmlChar * array (haystack)
333	* @val: the xmlChar to search (needle)
334	*
335	* a strstr for xmlChar's
336	*
337	* Returns the xmlChar * for the first occurrence or NULL.
338	*/
339
340	const xmlChar *
341	xmlStrstr(const xmlChar str, const* xmlChar *val) {
342	int n;
343
344	if (str == NULL) return(NULL);
345	if (val == NULL) return(NULL);
346	n = xmlStrlen(val);
347
348	if (n == `0`) return(str);
349	while (str != `0`) { /* non input consuming /
350	if (str == val) {
351	if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352	}
353	str++;
354	}
355	return(NULL);
356	}
357
358	/**
359	* xmlStrcasestr:
360	* @str: the xmlChar * array (haystack)
361	* @val: the xmlChar to search (needle)
362	*
363	* a case-ignoring strstr for xmlChar's
364	*
365	* Returns the xmlChar * for the first occurrence or NULL.
366	*/
367
368	const xmlChar *
369	xmlStrcasestr(const xmlChar str, const* xmlChar *val) {
370	int n;
371
372	if (str == NULL) return(NULL);
373	if (val == NULL) return(NULL);
374	n = xmlStrlen(val);
375
376	if (n == `0`) return(str);
377	while (str != `0`) { /* non input consuming /
378	if (casemap[str] == casemap[val])
379	if (!xmlStrncasecmp(str, val, n)) return(str);
380	str++;
381	}
382	return(NULL);
383	}
384
385	/**
386	* xmlStrsub:
387	* @str: the xmlChar * array (haystack)
388	* @start: the index of the first char (zero based)
389	* @len: the length of the substring
390	*
391	* Extract a substring of a given string
392	*
393	* Returns the xmlChar * for the first occurrence or NULL.
394	*/
395
396	xmlChar *
397	xmlStrsub(const xmlChar str, int* start, int len) {
398	int i;
399
400	if (str == NULL) return(NULL);
401	if (start < `0`) return(NULL);
402	if (len < `0`) return(NULL);
403
404	for (i = `0`;i < start;i++) {
405	if (str == `0`) return*(NULL);
406	str++;
407	}
408	if (str == `0`) return*(NULL);
409	return(xmlStrndup(str, len));
410	}
411
412	/**
413	* xmlStrlen:
414	* @str: the xmlChar * array
415	*
416	* length of a xmlChar's string
417	*
418	* Returns the number of xmlChar contained in the ARRAY.
419	*/
420
421	int
422	xmlStrlen(const xmlChar *str) {
423	int len = `0`;
424
425	if (str == NULL) return(`0`);
426	while (str != `0`) { /* non input consuming /
427	str++;
428	len++;
429	}
430	return(len);
431	}
432
433	/**
434	* xmlStrncat:
435	* @cur: the original xmlChar * array
436	* @add: the xmlChar * array added
437	* @len: the length of @add
438	*
439	* a strncat for array of xmlChar's, it will extend @cur with the len
440	* first bytes of @add. Note that if @len < 0 then this is an API error
441	* and NULL will be returned.
442	*
443	* Returns a new xmlChar *, the original @cur is reallocated and should
444	* not be freed.
445	*/
446
447	xmlChar *
448	xmlStrncat(xmlChar cur, const* xmlChar add, int* len) {
449	int size;
450	xmlChar *ret;
451
452	if ((add == NULL) \|\| (len == `0`))
453	return(cur);
454	if (len < `0`)
455	return(NULL);
456	if (cur == NULL)
457	return(xmlStrndup(add, len));
458
459	size = xmlStrlen(cur);
460	if (size < `0`)
461	return(NULL);
462	ret = (xmlChar ) xmlRealloc(cur, (size + len + `1`) sizeof(xmlChar));
463	if (ret == NULL) {
464	xmlErrMemory(NULL, NULL);
465	return(cur);
466	}
467	memcpy(&ret[size], add, len * sizeof(xmlChar));
468	ret[size + len] = `0`;
469	return(ret);
470	}
471
472	/**
473	* xmlStrncatNew:
474	* @str1: first xmlChar string
475	* @str2: second xmlChar string
476	* @len: the len of @str2 or < 0
477	*
478	* same as xmlStrncat, but creates a new string. The original
479	* two strings are not freed. If @len is < 0 then the length
480	* will be calculated automatically.
481	*
482	* Returns a new xmlChar * or NULL
483	*/
484	xmlChar *
485	xmlStrncatNew(const xmlChar str1, const* xmlChar str2, int* len) {
486	int size;
487	xmlChar *ret;
488
489	if (len < `0`) {
490	len = xmlStrlen(str2);
491	if (len < `0`)
492	return(NULL);
493	}
494	if ((str2 == NULL) \|\| (len == `0`))
495	return(xmlStrdup(str1));
496	if (str1 == NULL)
497	return(xmlStrndup(str2, len));
498
499	size = xmlStrlen(str1);
500	if (size < `0`)
501	return(NULL);
502	ret = (xmlChar ) xmlMalloc((size + len + `1`) sizeof(xmlChar));
503	if (ret == NULL) {
504	xmlErrMemory(NULL, NULL);
505	return(xmlStrndup(str1, size));
506	}
507	memcpy(ret, str1, size * sizeof(xmlChar));
508	memcpy(&ret[size], str2, len * sizeof(xmlChar));
509	ret[size + len] = `0`;
510	return(ret);
511	}
512
513	/**
514	* xmlStrcat:
515	* @cur: the original xmlChar * array
516	* @add: the xmlChar * array added
517	*
518	* a strcat for array of xmlChar's. Since they are supposed to be
519	* encoded in UTF-8 or an encoding with 8bit based chars, we assume
520	* a termination mark of '0'.
521	*
522	* Returns a new xmlChar * containing the concatenated string. The original
523	* @cur is reallocated and should not be freed.
524	*/
525	xmlChar *
526	xmlStrcat(xmlChar cur, const* xmlChar *add) {
527	const xmlChar *p = add;
528
529	if (add == NULL) return(cur);
530	if (cur == NULL)
531	return(xmlStrdup(add));
532
533	while (p != `0`) p++; /* non input consuming /
534	return(xmlStrncat(cur, add, p - add));
535	}
536
537	/**
538	* xmlStrPrintf:
539	* @buf: the result buffer.
540	* @len: the result buffer length.
541	* @msg: the message with printf formatting.
542	* @...: extra parameters for the message.
543	*
544	* Formats @msg and places result into @buf.
545	*
546	* Returns the number of characters written to @buf or -1 if an error occurs.
547	*/
548	int XMLCDECL
549	xmlStrPrintf(xmlChar buf, int* len, const char *msg, ...) {
550	va_list args;
551	int ret;
552
553	if((buf == NULL) \|\| (msg == NULL)) {
554	return(-`1`);
555	}
556
557	va_start(args, msg);
558	ret = vsnprintf((char ) buf, len, (const* char *) msg, args);
559	va_end(args);
560	buf[len - `1`] = `0`; / be safe ! /
561
562	return(ret);
563	}
564
565	/**
566	* xmlStrVPrintf:
567	* @buf: the result buffer.
568	* @len: the result buffer length.
569	* @msg: the message with printf formatting.
570	* @ap: extra parameters for the message.
571	*
572	* Formats @msg and places result into @buf.
573	*
574	* Returns the number of characters written to @buf or -1 if an error occurs.
575	*/
576	int
577	xmlStrVPrintf(xmlChar buf, int* len, const char *msg, va_list ap) {
578	int ret;
579
580	if((buf == NULL) \|\| (msg == NULL)) {
581	return(-`1`);
582	}
583
584	ret = vsnprintf((char ) buf, len, (const* char *) msg, ap);
585	buf[len - `1`] = `0`; / be safe ! /
586
587	return(ret);
588	}
589
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F          0xxxxxxx                                *
 * 0000 0080-0000 07FF          110xxxxx 10xxxxxx                       *
 * 0000 0800-0000 FFFF          1110xxxx 10xxxxxx 10xxxxxx              *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (Several routines below nevertheless also accept the 4-byte form     *
 *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.)                               *
 *                                                                      *
 ************************************************************************/
604
605
606	/**
607	* xmlUTF8Size:
608	* @utf: pointer to the UTF8 character
609	*
610	* calculates the internal size of a UTF8 character
611	*
612	* returns the numbers of bytes in the character, -1 on format error
613	*/
614	int
615	xmlUTF8Size(const xmlChar *utf) {
616	xmlChar mask;
617	int len;
618
619	if (utf == NULL)
620	return -`1`;
621	if (*utf < `0x80`)
622	return `1`;
623	/ check valid UTF8 character /
624	if (!(*utf & `0x40`))
625	return -`1`;
626	/ determine number of bytes in char /
627	len = `2`;
628	for (mask=`0x20`; mask != `0`; mask>>=`1`) {
629	if (!(*utf & mask))
630	return len;
631	len++;
632	}
633	return -`1`;
634	}
635
636	/**
637	* xmlUTF8Charcmp:
638	* @utf1: pointer to first UTF8 char
639	* @utf2: pointer to second UTF8 char
640	*
641	* compares the two UCS4 values
642	*
643	* returns result of the compare as with xmlStrncmp
644	*/
645	int
646	xmlUTF8Charcmp(const xmlChar utf1, const* xmlChar *utf2) {
647
648	if (utf1 == NULL ) {
649	if (utf2 == NULL)
650	return `0`;
651	return -`1`;
652	}
653	return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
654	}
655
656	/**
657	* xmlUTF8Strlen:
658	* @utf: a sequence of UTF-8 encoded bytes
659	*
660	* compute the length of an UTF8 string, it doesn't do a full UTF8
661	* checking of the content of the string.
662	*
663	* Returns the number of characters in the string or -1 in case of error
664	*/
665	int
666	xmlUTF8Strlen(const xmlChar *utf) {
667	int ret = `0`;
668
669	if (utf == NULL)
670	return(-`1`);
671
672	while (*utf != `0`) {
673	if (utf[`0`] & `0x80`) {
674	if ((utf[`1`] & `0xc0`) != `0x80`)
675	return(-`1`);
676	if ((utf[`0`] & `0xe0`) == `0xe0`) {
677	if ((utf[`2`] & `0xc0`) != `0x80`)
678	return(-`1`);
679	if ((utf[`0`] & `0xf0`) == `0xf0`) {
680	if ((utf[`0`] & `0xf8`) != `0xf0` \|\| (utf[`3`] & `0xc0`) != `0x80`)
681	return(-`1`);
682	utf += `4`;
683	} else {
684	utf += `3`;
685	}
686	} else {
687	utf += `2`;
688	}
689	} else {
690	utf++;
691	}
692	ret++;
693	}
694	return(ret);
695	}
696
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes
 * are decoded; overlong encodings are not rejected. A continuation
 * byte (10xxxxxx) in lead position is an error.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a continuation byte cannot start a character */
        if ((c & 0xc0) == 0x80)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
766
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: 1 if @utf is valid, 0 otherwise (including when @utf
 * is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     *
     * The || chains below stop at the first bad byte, so a terminator
     * inside a truncated sequence is never read past.
     */
    for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[ix+1] & 0xc0 ) != 0x80)
                return 0;
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80))
                    return 0;
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80) ||
                ((utf[ix+3] & 0xc0) != 0x80))
                    return 0;
            ix += 4;
        } else                          /* unknown encoding */
            return 0;
    }
    return(1);
}
819
820	/**
821	* xmlUTF8Strsize:
822	* @utf: a sequence of UTF-8 encoded bytes
823	* @len: the number of characters in the array
824	*
825	* storage size of an UTF8 string
826	* the behaviour is not guaranteed if the input string is not UTF-8
827	*
828	* Returns the storage size of
829	* the first 'len' characters of ARRAY
830	*/
831
832	int
833	xmlUTF8Strsize(const xmlChar utf, int* len) {
834	const xmlChar *ptr=utf;
835	xmlChar ch;
836
837	if (utf == NULL)
838	return(`0`);
839
840	if (len <= `0`)
841	return(`0`);
842
843	while ( len-- > `0`) {
844	if ( !*ptr )
845	break;
846	if ( (ch = *ptr++) & `0x80`)
847	while ((ch<<=`1`) & `0x80` ) {
848	if (ptr == `0`) break*;
849	ptr++;
850	}
851	}
852	return (ptr - utf);
853	}
854
855
856	/**
857	* xmlUTF8Strndup:
858	* @utf: the input UTF8 *
859	* @len: the len of @utf (in chars)
860	*
861	* a strndup for array of UTF8's
862	*
863	* Returns a new UTF8 * or NULL
864	*/
865	xmlChar *
866	xmlUTF8Strndup(const xmlChar utf, int* len) {
867	xmlChar *ret;
868	int i;
869
870	if ((utf == NULL) \|\| (len < `0`)) return(NULL);
871	i = xmlUTF8Strsize(utf, len);
872	ret = (xmlChar ) xmlMallocAtomic((i + `1`) sizeof(xmlChar));
873	if (ret == NULL) {
874	xmlGenericError(xmlGenericErrorContext,
875	"malloc of %ld byte failed\n",
876	(len + `1`) * (long)sizeof(xmlChar));
877	return(NULL);
878	}
879	memcpy(ret, utf, i * sizeof(xmlChar));
880	ret[i] = `0`;
881	return(ret);
882	}
883
884	/**
885	* xmlUTF8Strpos:
886	* @utf: the input UTF8 *
887	* @pos: the position of the desired UTF8 char (in chars)
888	*
889	* a function to provide the equivalent of fetching a
890	* character from a string array
891	*
892	* Returns a pointer to the UTF8 character or NULL
893	*/
894	const xmlChar *
895	xmlUTF8Strpos(const xmlChar utf, int* pos) {
896	xmlChar ch;
897
898	if (utf == NULL) return(NULL);
899	if (pos < `0`)
900	return(NULL);
901	while (pos--) {
902	if ((ch=utf++) == `0`) return*(NULL);
903	if ( ch & `0x80` ) {
904	/ if not simple ascii, verify proper format /
905	if ( (ch & `0xc0`) != `0xc0` )
906	return(NULL);
907	/ then skip over remaining bytes for this char /
908	while ( (ch <<= `1`) & `0x80` )
909	if ( (*utf++ & `0xc0`) != `0x80` )
910	return(NULL);
911	}
912	}
913	return((xmlChar *)utf);
914	}
915
916	/**
917	* xmlUTF8Strloc:
918	* @utf: the input UTF8 *
919	* @utfchar: the UTF8 character to be found
920	*
921	* a function to provide the relative location of a UTF8 char
922	*
923	* Returns the relative character position of the desired char
924	* or -1 if not found
925	*/
926	int
927	xmlUTF8Strloc(const xmlChar utf, const* xmlChar *utfchar) {
928	int i, size;
929	xmlChar ch;
930
931	if (utf==NULL \|\| utfchar==NULL) return -`1`;
932	size = xmlUTF8Strsize(utfchar, `1`);
933	for(i=`0`; (ch=*utf) != `0`; i++) {
934	if (xmlStrncmp(utf, utfchar, size)==`0`)
935	return(i);
936	utf++;
937	if ( ch & `0x80` ) {
938	/ if not simple ascii, verify proper format /
939	if ( (ch & `0xc0`) != `0xc0` )
940	return(-`1`);
941	/ then skip over remaining bytes for this char /
942	while ( (ch <<= `1`) & `0x80` )
943	if ( (*utf++ & `0xc0`) != `0x80` )
944	return(-`1`);
945	}
946	}
947
948	return(-`1`);
949	}
950	/**
951	* xmlUTF8Strsub:
952	* @utf: a sequence of UTF-8 encoded bytes
953	* @start: relative pos of first char
954	* @len: total number to copy
955	*
956	* Create a substring from a given UTF-8 string
957	* Note: positions are given in units of UTF-8 chars
958	*
959	* Returns a pointer to a newly created string
960	* or NULL if any problem
961	*/
962
963	xmlChar *
964	xmlUTF8Strsub(const xmlChar utf, int* start, int len) {
965	int i;
966	xmlChar ch;
967
968	if (utf == NULL) return(NULL);
969	if (start < `0`) return(NULL);
970	if (len < `0`) return(NULL);
971
972	/*
973	* Skip over any leading chars
974	*/
975	for (i = `0`;i < start;i++) {
976	if ((ch=utf++) == `0`) return*(NULL);
977	if ( ch & `0x80` ) {
978	/ if not simple ascii, verify proper format /
979	if ( (ch & `0xc0`) != `0xc0` )
980	return(NULL);
981	/ then skip over remaining bytes for this char /
982	while ( (ch <<= `1`) & `0x80` )
983	if ( (*utf++ & `0xc0`) != `0x80` )
984	return(NULL);
985	}
986	}
987
988	return(xmlUTF8Strndup(utf, len));
989	}
990
991	/**
992	* xmlEscapeFormatString:
993	* @msg: a pointer to the string in which to escape '%' characters.
994	* Must be a heap-allocated buffer created by libxml2 that may be
995	* returned, or that may be freed and replaced.
996	*
997	* Replaces the string pointed to by 'msg' with an escaped string.
998	* Returns the same string with all '%' characters escaped.
999	*/
1000	xmlChar *
1001	xmlEscapeFormatString(xmlChar **msg)
1002	{
1003	xmlChar *msgPtr = NULL;
1004	xmlChar *result = NULL;
1005	xmlChar *resultPtr = NULL;
1006	size_t count = `0`;
1007	size_t msgLen = `0`;
1008	size_t resultLen = `0`;
1009
1010	if (!msg \|\| !*msg)
1011	return(NULL);
1012
1013	for (msgPtr = msg; msgPtr != `'\0'`; ++msgPtr) {
1014	++msgLen;
1015	if (*msgPtr == `'%'`)
1016	++count;
1017	}
1018
1019	if (count == `0`)
1020	return(*msg);
1021
1022	resultLen = msgLen + count + `1`;
1023	result = (xmlChar ) xmlMallocAtomic(resultLen sizeof(xmlChar));
1024	if (result == NULL) {
1025	/ Clear msg to prevent format string vulnerabilities in
1026	out-of-memory situations. /*
1027	xmlFree(*msg);
1028	*msg = NULL;
1029	xmlErrMemory(NULL, NULL);
1030	return(NULL);
1031	}
1032
1033	for (msgPtr = msg, resultPtr = result; msgPtr != `'\0'`; ++msgPtr, ++resultPtr) {
1034	resultPtr = msgPtr;
1035	if (*msgPtr == `'%'`)
1036	*(++resultPtr) = `'%'`;
1037	}
1038	result[resultLen - `1`] = `'\0'`;
1039
1040	xmlFree(*msg);
1041	*msg = result;
1042
1043	return *msg;
1044	}
1045
1046	#define bottom_xmlstring
1047	#include "elfgcchack.h"
1048

Browse the source code of ClickHouse/contrib/libxml2/xmlstring.c