ucmndata.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/ucmndata.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 1999-2011, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************/
10
11
/*------------------------------------------------------------------------------
 *
 *   UCommonData   An abstract interface for dealing with ICU Common Data Files.
 *                 ICU Common Data Files are a grouping of a number of individual
 *                 data items (resources, converters, tables, anything) into a
 *                 single file or dll.  The combined format includes a table of
 *                 contents for locating the individual items by name.
 *
 *                 Two formats for the table of contents are supported, which is
 *                 why there is an abstract interface involved.
 *
 */
24
25	#include "unicode/utypes.h"
26	#include "unicode/udata.h"
27	#include "cstring.h"
28	#include "ucmndata.h"
29	#include "udatamem.h"
30
31	#if defined(UDATA_DEBUG) \|\| defined(UDATA_DEBUG_DUMP)
32	# include <stdio.h>
33	#endif
34
35	U_CFUNC uint16_t
36	udata_getHeaderSize(const DataHeader *udh) {
37	if(udh==NULL) {
38	return `0`;
39	} else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) {
40	/ same endianness /
41	return udh->dataHeader.headerSize;
42	} else {
43	/ opposite endianness /
44	uint16_t x=udh->dataHeader.headerSize;
45	return (uint16_t)((x<<`8`)\|(x>>`8`));
46	}
47	}
48
49	U_CFUNC uint16_t
50	udata_getInfoSize(const UDataInfo *info) {
51	if(info==NULL) {
52	return `0`;
53	} else if(info->isBigEndian==U_IS_BIG_ENDIAN) {
54	/ same endianness /
55	return info->size;
56	} else {
57	/ opposite endianness /
58	uint16_t x=info->size;
59	return (uint16_t)((x<<`8`)\|(x>>`8`));
60	}
61	}
62
/*-----------------------------------------------------------------------------*
 *                                                                             *
 *  Pointer TOCs.   TODO: This form of table-of-contents should be removed     *
 *                  because DLLs must be relocated on loading to correct the   *
 *                  pointer values and this operation makes shared memory      *
 *                  mapping of the data much less likely to work.              *
 *                                                                             *
 *-----------------------------------------------------------------------------*/
71	typedef struct {
72	const char *entryName;
73	const DataHeader *pHeader;
74	} PointerTOCEntry;
75
76
77	typedef struct {
78	uint32_t count;
79	uint32_t reserved;
80	/**
81	* Variable-length array declared with length 1 to disable bounds checkers.
82	* The actual array length is in the count field.
83	*/
84	PointerTOCEntry entry[`1`];
85	} PointerTOC;
86
87
/* definition of OffsetTOC struct types moved to ucmndata.h */
89
/*-----------------------------------------------------------------------------*
 *                                                                             *
 *    entry point lookup implementations                                       *
 *                                                                             *
 *-----------------------------------------------------------------------------*/
95
/*
 * Smaller of two values. Defined only if the platform has not already
 * provided MIN. Each argument may be evaluated twice, so do not pass
 * expressions with side effects.
 */
#ifndef MIN
#define MIN(a,b) (((a)<(b)) ? (a) : (b))
#endif
99
/**
 * Compare strings where we know the shared prefix length,
 * and advance the prefix length as we find that the strings share even more characters.
 *
 * Comparison starts at offset *pPrefixLength in both strings and proceeds
 * bytewise, treating each char as an unsigned 8-bit value.
 *
 * @param s1             first NUL-terminated string
 * @param s2             second NUL-terminated string
 * @param pPrefixLength  in: number of leading bytes already known to be equal;
 *                       out: number of leading bytes found to be equal
 *                       (the terminating NUL is not counted)
 * @return negative, zero or positive if s1 sorts before, equal to or after s2
 */
static int32_t
strcmpAfterPrefix(const char *s1, const char *s2, int32_t *pPrefixLength) {
    int32_t pl=*pPrefixLength;
    int32_t cmp=0;
    s1+=pl;
    s2+=pl;
    for(;;) {
        int32_t c1=(uint8_t)*s1++;
        int32_t c2=(uint8_t)*s2++;
        cmp=c1-c2;
        if(cmp!=0 || c1==0) {  /* different or done */
            break;
        }
        ++pl;  /* increment shared same-prefix length */
    }
    *pPrefixLength=pl;
    return cmp;
}
122
123	static int32_t
124	offsetTOCPrefixBinarySearch(const char s, const* char *names,
125	const UDataOffsetTOCEntry *toc, int32_t count) {
126	int32_t start=`0`;
127	int32_t limit=count;
128	/*
129	* Remember the shared prefix between s, start and limit,
130	* and don't compare that shared prefix again.
131	* The shared prefix should get longer as we narrow the [start, limit[ range.
132	*/
133	int32_t startPrefixLength=`0`;
134	int32_t limitPrefixLength=`0`;
135	if(count==`0`) {
136	return -`1`;
137	}
138	/*
139	* Prime the prefix lengths so that we don't keep prefixLength at 0 until
140	* both the start and limit indexes have moved.
141	* At the same time, we find if s is one of the start and (limit-1) names,
142	* and if not, exclude them from the actual binary search.
143	*/
144	if(`0`==strcmpAfterPrefix(s, names+toc[`0`].nameOffset, &startPrefixLength)) {
145	return `0`;
146	}
147	++start;
148	--limit;
149	if(`0`==strcmpAfterPrefix(s, names+toc[limit].nameOffset, &limitPrefixLength)) {
150	return limit;
151	}
152	while(start<limit) {
153	int32_t i=(start+limit)/`2`;
154	int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
155	int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, &prefixLength);
156	if(cmp<`0`) {
157	limit=i;
158	limitPrefixLength=prefixLength;
159	} else if(cmp==`0`) {
160	return i;
161	} else {
162	start=i+`1`;
163	startPrefixLength=prefixLength;
164	}
165	}
166	return -`1`;
167	}
168
169	static int32_t
170	pointerTOCPrefixBinarySearch(const char s, const* PointerTOCEntry *toc, int32_t count) {
171	int32_t start=`0`;
172	int32_t limit=count;
173	/*
174	* Remember the shared prefix between s, start and limit,
175	* and don't compare that shared prefix again.
176	* The shared prefix should get longer as we narrow the [start, limit[ range.
177	*/
178	int32_t startPrefixLength=`0`;
179	int32_t limitPrefixLength=`0`;
180	if(count==`0`) {
181	return -`1`;
182	}
183	/*
184	* Prime the prefix lengths so that we don't keep prefixLength at 0 until
185	* both the start and limit indexes have moved.
186	* At the same time, we find if s is one of the start and (limit-1) names,
187	* and if not, exclude them from the actual binary search.
188	*/
189	if(`0`==strcmpAfterPrefix(s, toc[`0`].entryName, &startPrefixLength)) {
190	return `0`;
191	}
192	++start;
193	--limit;
194	if(`0`==strcmpAfterPrefix(s, toc[limit].entryName, &limitPrefixLength)) {
195	return limit;
196	}
197	while(start<limit) {
198	int32_t i=(start+limit)/`2`;
199	int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
200	int32_t cmp=strcmpAfterPrefix(s, toc[i].entryName, &prefixLength);
201	if(cmp<`0`) {
202	limit=i;
203	limitPrefixLength=prefixLength;
204	} else if(cmp==`0`) {
205	return i;
206	} else {
207	start=i+`1`;
208	startPrefixLength=prefixLength;
209	}
210	}
211	return -`1`;
212	}
213
214	U_CDECL_BEGIN
215	static uint32_t U_CALLCONV
216	offsetTOCEntryCount(const UDataMemory *pData) {
217	int32_t retVal=`0`;
218	const UDataOffsetTOC toc = (UDataOffsetTOC )pData->toc;
219	if (toc != NULL) {
220	retVal = toc->count;
221	}
222	return retVal;
223	}
224
225	static const DataHeader * U_CALLCONV
226	offsetTOCLookupFn(const UDataMemory *pData,
227	const char *tocEntryName,
228	int32_t *pLength,
229	UErrorCode *pErrorCode) {
230	(void)pErrorCode;
231	const UDataOffsetTOC toc = (UDataOffsetTOC )pData->toc;
232	if(toc!=NULL) {
233	const char base=(const* char *)toc;
234	int32_t number, count=(int32_t)toc->count;
235
236	/ perform a binary search for the data in the common data's table of contents /
237	#if defined (UDATA_DEBUG_DUMP)
238	/ list the contents of the TOC each time .. not recommended /
239	for(number=`0`; number<count; ++number) {
240	fprintf(stderr, "\tx%d: %s\n", number, &base[toc->entry[number].nameOffset]);
241	}
242	#endif
243	number=offsetTOCPrefixBinarySearch(tocEntryName, base, toc->entry, count);
244	if(number>=`0`) {
245	/ found it /
246	const UDataOffsetTOCEntry *entry=toc->entry+number;
247	#ifdef UDATA_DEBUG
248	fprintf(stderr, "%s: Found.\n", tocEntryName);
249	#endif
250	if((number+`1`) < count) {
251	*pLength = (int32_t)(entry[`1`].dataOffset - entry->dataOffset);
252	} else {
253	*pLength = -`1`;
254	}
255	return (const DataHeader *)(base+entry->dataOffset);
256	} else {
257	#ifdef UDATA_DEBUG
258	fprintf(stderr, "%s: Not found.\n", tocEntryName);
259	#endif
260	return NULL;
261	}
262	} else {
263	#ifdef UDATA_DEBUG
264	fprintf(stderr, "returning header\n");
265	#endif
266
267	return pData->pHeader;
268	}
269	}
270
271
272	static uint32_t U_CALLCONV pointerTOCEntryCount(const UDataMemory *pData) {
273	const PointerTOC toc = (PointerTOC )pData->toc;
274	return (uint32_t)((toc != NULL) ? (toc->count) : `0`);
275	}
276
277	static const DataHeader * U_CALLCONV pointerTOCLookupFn(const UDataMemory *pData,
278	const char *name,
279	int32_t *pLength,
280	UErrorCode *pErrorCode) {
281	(void)pErrorCode;
282	if(pData->toc!=NULL) {
283	const PointerTOC toc = (PointerTOC )pData->toc;
284	int32_t number, count=(int32_t)toc->count;
285
286	#if defined (UDATA_DEBUG_DUMP)
287	/ list the contents of the TOC each time .. not recommended /
288	for(number=`0`; number<count; ++number) {
289	fprintf(stderr, "\tx%d: %s\n", number, toc->entry[number].entryName);
290	}
291	#endif
292	number=pointerTOCPrefixBinarySearch(name, toc->entry, count);
293	if(number>=`0`) {
294	/ found it /
295	#ifdef UDATA_DEBUG
296	fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName);
297	#endif
298	*pLength=-`1`;
299	return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader);
300	} else {
301	#ifdef UDATA_DEBUG
302	fprintf(stderr, "%s: Not found.\n", name);
303	#endif
304	return NULL;
305	}
306	} else {
307	return pData->pHeader;
308	}
309	}
310	U_CDECL_END
311
312
313	static const commonDataFuncs CmnDFuncs = {offsetTOCLookupFn, offsetTOCEntryCount};
314	static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCount};
315
316
317
318	/----------------------------------------------------------------------
319	* *
320	* checkCommonData Validate the format of a common data file. *
321	* Fill in the virtual function ptr based on TOC type *
322	* If the data is invalid, close the UDataMemory *
323	* and set the appropriate error code. *
324	* *
325	----------------------------------------------------------------------/
326	U_CFUNC void udata_checkCommonData(UDataMemory udm, UErrorCode err) {
327	if (U_FAILURE(*err)) {
328	return;
329	}
330
331	if(udm==NULL \|\| udm->pHeader==NULL) {
332	*err=U_INVALID_FORMAT_ERROR;
333	} else if(!(udm->pHeader->dataHeader.magic1==`0xda` &&
334	udm->pHeader->dataHeader.magic2==`0x27` &&
335	udm->pHeader->info.isBigEndian==U_IS_BIG_ENDIAN &&
336	udm->pHeader->info.charsetFamily==U_CHARSET_FAMILY)
337	) {
338	/ header not valid /
339	*err=U_INVALID_FORMAT_ERROR;
340	}
341	else if (udm->pHeader->info.dataFormat[`0`]==`0x43` &&
342	udm->pHeader->info.dataFormat[`1`]==`0x6d` &&
343	udm->pHeader->info.dataFormat[`2`]==`0x6e` &&
344	udm->pHeader->info.dataFormat[`3`]==`0x44` &&
345	udm->pHeader->info.formatVersion[`0`]==`1`
346	) {
347	/ dataFormat="CmnD" /
348	udm->vFuncs = &CmnDFuncs;
349	udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
350	}
351	else if(udm->pHeader->info.dataFormat[`0`]==`0x54` &&
352	udm->pHeader->info.dataFormat[`1`]==`0x6f` &&
353	udm->pHeader->info.dataFormat[`2`]==`0x43` &&
354	udm->pHeader->info.dataFormat[`3`]==`0x50` &&
355	udm->pHeader->info.formatVersion[`0`]==`1`
356	) {
357	/ dataFormat="ToCP" /
358	udm->vFuncs = &ToCPFuncs;
359	udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader);
360	}
361	else {
362	/ dataFormat not recognized /
363	*err=U_INVALID_FORMAT_ERROR;
364	}
365
366	if (U_FAILURE(*err)) {
367	/ If the data is no good and we memory-mapped it ourselves,*
368	* close the memory mapping so it doesn't leak. Note that this has
369	* no effect on non-memory mapped data, other than clearing fields in udm.
370	*/
371	udata_close(udm);
372	}
373	}
374
375	/*
376	* TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package
377	* header but not its sub-items.
378	* This function will be needed for automatic runtime swapping.
379	* Sub-items should not be swapped to limit the swapping to the parts of the
380	* package that are actually used.
381	*
382	* Since lengths of items are implicit in the order and offsets of their
383	* ToC entries, and since offsets are relative to the start of the ToC,
384	* a swapped version may need to generate a different data structure
385	* with pointers to the original data items and with their lengths
386	* (-1 for the last one if it is not known), and maybe even pointers to the
387	* swapped versions of the items.
388	* These pointers to swapped versions would establish a cache;
389	* instead, each open data item could simply own the storage for its swapped
390	* data. This fits better with the current design.
391	*
392	* markus 2003sep18 Jitterbug 2235
393	*/
394

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/ucmndata.cpp