unistr_cnv.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/unistr_cnv.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*******************************************************************************
5	*
6	* Copyright (C) 1999-2014, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	*******************************************************************************
10	* file name: unistr_cnv.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:2
14	*
15	* created on: 2004aug19
16	* created by: Markus W. Scherer
17	*
18	* Character conversion functions moved here from unistr.cpp
19	*/
20
21	#include "unicode/utypes.h"
22
23	#if !UCONFIG_NO_CONVERSION
24
25	#include "unicode/putil.h"
26	#include "cstring.h"
27	#include "cmemory.h"
28	#include "unicode/ustring.h"
29	#include "unicode/unistr.h"
30	#include "unicode/ucnv.h"
31	#include "ucnv_imp.h"
32	#include "putilimp.h"
33	#include "ustr_cnv.h"
34	#include "ustr_imp.h"
35
36	U_NAMESPACE_BEGIN
37
38	//========================================
39	// Constructors
40	//========================================
41
42	#if !U_CHARSET_IS_UTF8
43
44	UnicodeString::UnicodeString(const char *codepageData) {
45	fUnion.fFields.fLengthAndFlags = kShortString;
46	if(codepageData != `0`) {
47	doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), `0`);
48	}
49	}
50
51	UnicodeString::UnicodeString(const char *codepageData,
52	int32_t dataLength) {
53	fUnion.fFields.fLengthAndFlags = kShortString;
54	if(codepageData != `0`) {
55	doCodepageCreate(codepageData, dataLength, `0`);
56	}
57	}
58
59	// else see unistr.cpp
60	#endif
61
62	UnicodeString::UnicodeString(const char *codepageData,
63	const char *codepage) {
64	fUnion.fFields.fLengthAndFlags = kShortString;
65	if(codepageData != `0`) {
66	doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
67	}
68	}
69
70	UnicodeString::UnicodeString(const char *codepageData,
71	int32_t dataLength,
72	const char *codepage) {
73	fUnion.fFields.fLengthAndFlags = kShortString;
74	if(codepageData != `0`) {
75	doCodepageCreate(codepageData, dataLength, codepage);
76	}
77	}
78
79	UnicodeString::UnicodeString(const char *src, int32_t srcLength,
80	UConverter *cnv,
81	UErrorCode &errorCode) {
82	fUnion.fFields.fLengthAndFlags = kShortString;
83	if(U_SUCCESS(errorCode)) {
84	// check arguments
85	if(src==NULL) {
86	// treat as an empty string, do nothing more
87	} else if(srcLength<-`1`) {
88	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
89	} else {
90	// get input length
91	if(srcLength==-`1`) {
92	srcLength=(int32_t)uprv_strlen(src);
93	}
94	if(srcLength>`0`) {
95	if(cnv!=`0`) {
96	// use the provided converter
97	ucnv_resetToUnicode(cnv);
98	doCodepageCreate(src, srcLength, cnv, errorCode);
99	} else {
100	// use the default converter
101	cnv=u_getDefaultConverter(&errorCode);
102	doCodepageCreate(src, srcLength, cnv, errorCode);
103	u_releaseDefaultConverter(cnv);
104	}
105	}
106	}
107
108	if(U_FAILURE(errorCode)) {
109	setToBogus();
110	}
111	}
112	}
113
114	//========================================
115	// Codeset conversion
116	//========================================
117
118	#if !U_CHARSET_IS_UTF8
119
120	int32_t
121	UnicodeString::extract(int32_t start,
122	int32_t length,
123	char *target,
124	uint32_t dstSize) const {
125	return extract(start, length, target, dstSize, `0`);
126	}
127
128	// else see unistr.cpp
129	#endif
130
131	int32_t
132	UnicodeString::extract(int32_t start,
133	int32_t length,
134	char *target,
135	uint32_t dstSize,
136	const char codepage) const*
137	{
138	// if the arguments are illegal, then do nothing
139	if(/dstSize < 0 \|\| /(dstSize > `0` && target == `0`)) {
140	return `0`;
141	}
142
143	// pin the indices to legal values
144	pinIndices(start, length);
145
146	// We need to cast dstSize to int32_t for all subsequent code.
147	// I don't know why the API was defined with uint32_t but we are stuck with it.
148	// Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
149	// as a limit in some functions, it may wrap around and yield a pointer
150	// that compares less-than target.
151	int32_t capacity;
152	if(dstSize < `0x7fffffff`) {
153	// Assume that the capacity is real and a limit pointer won't wrap around.
154	capacity = (int32_t)dstSize;
155	} else {
156	// Pin the capacity so that a limit pointer does not wrap around.
157	char targetLimit = (char* *)U_MAX_PTR(target);
158	// U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
159	// greater than target and does not wrap around the top of the address space.
160	capacity = (int32_t)(targetLimit - target);
161	}
162
163	// create the converter
164	UConverter *converter;
165	UErrorCode status = U_ZERO_ERROR;
166
167	// just write the NUL if the string length is 0
168	if(length == `0`) {
169	return u_terminateChars(target, capacity, `0`, &status);
170	}
171
172	// if the codepage is the default, use our cache
173	// if it is an empty string, then use the "invariant character" conversion
174	if (codepage == `0`) {
175	const char *defaultName = ucnv_getDefaultName();
176	if(UCNV_FAST_IS_UTF8(defaultName)) {
177	return toUTF8(start, length, target, capacity);
178	}
179	converter = u_getDefaultConverter(&status);
180	} else if (*codepage == `0`) {
181	// use the "invariant characters" conversion
182	int32_t destLength;
183	if(length <= capacity) {
184	destLength = length;
185	} else {
186	destLength = capacity;
187	}
188	u_UCharsToChars(getArrayStart() + start, target, destLength);
189	return u_terminateChars(target, capacity, length, &status);
190	} else {
191	converter = ucnv_open(codepage, &status);
192	}
193
194	length = doExtract(start, length, target, capacity, converter, status);
195
196	// close the converter
197	if (codepage == `0`) {
198	u_releaseDefaultConverter(converter);
199	} else {
200	ucnv_close(converter);
201	}
202
203	return length;
204	}
205
206	int32_t
207	UnicodeString::extract(char *dest, int32_t destCapacity,
208	UConverter *cnv,
209	UErrorCode &errorCode) const
210	{
211	if(U_FAILURE(errorCode)) {
212	return `0`;
213	}
214
215	if(isBogus() \|\| destCapacity<`0` \|\| (destCapacity>`0` && dest==`0`)) {
216	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
217	return `0`;
218	}
219
220	// nothing to do?
221	if(isEmpty()) {
222	return u_terminateChars(dest, destCapacity, `0`, &errorCode);
223	}
224
225	// get the converter
226	UBool isDefaultConverter;
227	if(cnv==`0`) {
228	isDefaultConverter=TRUE;
229	cnv=u_getDefaultConverter(&errorCode);
230	if(U_FAILURE(errorCode)) {
231	return `0`;
232	}
233	} else {
234	isDefaultConverter=FALSE;
235	ucnv_resetFromUnicode(cnv);
236	}
237
238	// convert
239	int32_t len=doExtract(`0`, length(), dest, destCapacity, cnv, errorCode);
240
241	// release the converter
242	if(isDefaultConverter) {
243	u_releaseDefaultConverter(cnv);
244	}
245
246	return len;
247	}
248
249	int32_t
250	UnicodeString::doExtract(int32_t start, int32_t length,
251	char *dest, int32_t destCapacity,
252	UConverter *cnv,
253	UErrorCode &errorCode) const
254	{
255	if(U_FAILURE(errorCode)) {
256	if(destCapacity!=`0`) {
257	*dest=`0`;
258	}
259	return `0`;
260	}
261
262	const UChar src=getArrayStart()+start, srcLimit=src+length;
263	char *originalDest=dest;
264	const char *destLimit;
265
266	if(destCapacity==`0`) {
267	destLimit=dest=`0`;
268	} else if(destCapacity==-`1`) {
269	// Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
270	destLimit=(char*)U_MAX_PTR(dest);
271	// for NUL-termination, translate into highest int32_t
272	destCapacity=`0x7fffffff`;
273	} else {
274	destLimit=dest+destCapacity;
275	}
276
277	// perform the conversion
278	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, `0`, TRUE, &errorCode);
279	length=(int32_t)(dest-originalDest);
280
281	// if an overflow occurs, then get the preflighting length
282	if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
283	char buffer[`1024`];
284
285	destLimit=buffer+sizeof(buffer);
286	do {
287	dest=buffer;
288	errorCode=U_ZERO_ERROR;
289	ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, `0`, TRUE, &errorCode);
290	length+=(int32_t)(dest-buffer);
291	} while(errorCode==U_BUFFER_OVERFLOW_ERROR);
292	}
293
294	return u_terminateChars(originalDest, destCapacity, length, &errorCode);
295	}
296
297	void
298	UnicodeString::doCodepageCreate(const char *codepageData,
299	int32_t dataLength,
300	const char *codepage)
301	{
302	// if there's nothing to convert, do nothing
303	if(codepageData == `0` \|\| dataLength == `0` \|\| dataLength < -`1`) {
304	return;
305	}
306	if(dataLength == -`1`) {
307	dataLength = (int32_t)uprv_strlen(codepageData);
308	}
309
310	UErrorCode status = U_ZERO_ERROR;
311
312	// create the converter
313	// if the codepage is the default, use our cache
314	// if it is an empty string, then use the "invariant character" conversion
315	UConverter *converter;
316	if (codepage == `0`) {
317	const char *defaultName = ucnv_getDefaultName();
318	if(UCNV_FAST_IS_UTF8(defaultName)) {
319	setToUTF8(StringPiece (codepageData, dataLength));
320	return;
321	}
322	converter = u_getDefaultConverter(&status);
323	} else if(*codepage == `0`) {
324	// use the "invariant characters" conversion
325	if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
326	u_charsToUChars(codepageData, getArrayStart(), dataLength);
327	setLength(dataLength);
328	} else {
329	setToBogus();
330	}
331	return;
332	} else {
333	converter = ucnv_open(codepage, &status);
334	}
335
336	// if we failed, set the appropriate flags and return
337	if(U_FAILURE(status)) {
338	setToBogus();
339	return;
340	}
341
342	// perform the conversion
343	doCodepageCreate(codepageData, dataLength, converter, status);
344	if(U_FAILURE(status)) {
345	setToBogus();
346	}
347
348	// close the converter
349	if(codepage == `0`) {
350	u_releaseDefaultConverter(converter);
351	} else {
352	ucnv_close(converter);
353	}
354	}
355
356	void
357	UnicodeString::doCodepageCreate(const char *codepageData,
358	int32_t dataLength,
359	UConverter *converter,
360	UErrorCode &status)
361	{
362	if(U_FAILURE(status)) {
363	return;
364	}
365
366	// set up the conversion parameters
367	const char *mySource = codepageData;
368	const char *mySourceEnd = mySource + dataLength;
369	UChar array, myTarget;
370
371	// estimate the size needed:
372	int32_t arraySize;
373	if(dataLength <= US_STACKBUF_SIZE) {
374	// try to use the stack buffer
375	arraySize = US_STACKBUF_SIZE;
376	} else {
377	// 1.25 UChar's per source byte should cover most cases
378	arraySize = dataLength + (dataLength >> `2`);
379	}
380
381	// we do not care about the current contents
382	UBool doCopyArray = FALSE;
383	for(;;) {
384	if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
385	setToBogus();
386	break;
387	}
388
389	// perform the conversion
390	array = getArrayStart();
391	myTarget = array + length();
392	ucnv_toUnicode(converter, &myTarget, array + getCapacity(),
393	&mySource, mySourceEnd, `0`, TRUE, &status);
394
395	// update the conversion parameters
396	setLength((int32_t)(myTarget - array));
397
398	// allocate more space and copy data, if needed
399	if(status == U_BUFFER_OVERFLOW_ERROR) {
400	// reset the error code
401	status = U_ZERO_ERROR;
402
403	// keep the previous conversion results
404	doCopyArray = TRUE;
405
406	// estimate the new size needed, larger than before
407	// try 2 UChar's per remaining source byte
408	arraySize = (int32_t)(length() + `2` * (mySourceEnd - mySource));
409	} else {
410	break;
411	}
412	}
413	}
414
415	U_NAMESPACE_END
416
417	#endif
418

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/unistr_cnv.cpp