cpdtrans.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/cpdtrans.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2011, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/17/99 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/unifilt.h"
18	#include "unicode/uniset.h"
19	#include "cpdtrans.h"
20	#include "uvector.h"
21	#include "tridpars.h"
22	#include "cmemory.h"
23
24	// keep in sync with Transliterator
25	//static const UChar ID_SEP = 0x002D; /-/
26	static const UChar ID_DELIM = `0x003B`; /;/
27	static const UChar NEWLINE = `10`;
28
29	static const UChar COLON_COLON[] = {`0x3A`, `0x3A`, `0`}; //"::"
30
31	U_NAMESPACE_BEGIN
32
33	const UChar CompoundTransliterator::PASS_STRING[] = { `0x0025`, `0x0050`, `0x0061`, `0x0073`, `0x0073`, `0` }; // "%Pass"
34
35	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompoundTransliterator)
36
37	/**
38	* Constructs a new compound transliterator given an array of
39	* transliterators. The array of transliterators may be of any
40	* length, including zero or one, however, useful compound
41	* transliterators have at least two components.
42	* @param transliterators array of <code>Transliterator</code>
43	* objects
44	* @param transliteratorCount The number of
45	* <code>Transliterator</code> objects in transliterators.
46	* @param filter the filter. Any character for which
47	* <tt>filter.contains()</tt> returns <tt>false</tt> will not be
48	* altered by this transliterator. If <tt>filter</tt> is
49	* <tt>null</tt> then no filtering is applied.
50	*/
51	CompoundTransliterator::CompoundTransliterator(
52	Transliterator* const transliterators[],
53	int32_t transliteratorCount,
54	UnicodeFilter* adoptedFilter) :
55	Transliterator (joinIDs(transliterators, transliteratorCount), adoptedFilter),
56	trans(`0`), count(`0`), numAnonymousRBTs(`0`) {
57	setTransliterators(transliterators, transliteratorCount);
58	}
59
60	/**
61	* Splits an ID of the form "ID;ID;..." into a compound using each
62	* of the IDs.
63	* @param id of above form
64	* @param forward if false, does the list in reverse order, and
65	* takes the inverse of each ID.
66	*/
67	CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
68	UTransDirection direction,
69	UnicodeFilter* adoptedFilter,
70	UParseError& /parseError/,
71	UErrorCode& status) :
72	Transliterator (id, adoptedFilter),
73	trans(`0`), numAnonymousRBTs(`0`) {
74	// TODO add code for parseError...currently unused, but
75	// later may be used by parsing code...
76	init(id, direction, TRUE, status);
77	}
78
79	CompoundTransliterator::CompoundTransliterator(const UnicodeString& id,
80	UParseError& /parseError/,
81	UErrorCode& status) :
82	Transliterator (id, `0`), // set filter to 0 here!
83	trans(`0`), numAnonymousRBTs(`0`) {
84	// TODO add code for parseError...currently unused, but
85	// later may be used by parsing code...
86	init(id, UTRANS_FORWARD, TRUE, status);
87	}
88
89
90	/**
91	* Private constructor for use of TransliteratorAlias
92	*/
93	CompoundTransliterator::CompoundTransliterator(const UnicodeString& newID,
94	UVector& list,
95	UnicodeFilter* adoptedFilter,
96	int32_t anonymousRBTs,
97	UParseError& /parseError/,
98	UErrorCode& status) :
99	Transliterator (newID, adoptedFilter),
100	trans(`0`), numAnonymousRBTs(anonymousRBTs)
101	{
102	init(list, UTRANS_FORWARD, FALSE, status);
103	}
104
105	/**
106	* Private constructor for Transliterator from a vector of
107	* transliterators. The caller is responsible for fixing up the
108	* ID.
109	*/
110	CompoundTransliterator::CompoundTransliterator(UVector& list,
111	UParseError& /parseError/,
112	UErrorCode& status) :
113	Transliterator (UnicodeString (), NULL),
114	trans(`0`), numAnonymousRBTs(`0`)
115	{
116	// TODO add code for parseError...currently unused, but
117	// later may be used by parsing code...
118	init(list, UTRANS_FORWARD, FALSE, status);
119	// assume caller will fixup ID
120	}
121
122	CompoundTransliterator::CompoundTransliterator(UVector& list,
123	int32_t anonymousRBTs,
124	UParseError& /parseError/,
125	UErrorCode& status) :
126	Transliterator (UnicodeString (), NULL),
127	trans(`0`), numAnonymousRBTs(anonymousRBTs)
128	{
129	init(list, UTRANS_FORWARD, FALSE, status);
130	}
131
132	/**
133	* Finish constructing a transliterator: only to be called by
134	* constructors. Before calling init(), set trans and filter to NULL.
135	* @param id the id containing ';'-separated entries
136	* @param direction either FORWARD or REVERSE
137	* @param idSplitPoint the index into id at which the
138	* adoptedSplitTransliterator should be inserted, if there is one, or
139	* -1 if there is none.
140	* @param adoptedSplitTransliterator a transliterator to be inserted
141	* before the entry at offset idSplitPoint in the id string. May be
142	* NULL to insert no entry.
143	* @param fixReverseID if TRUE, then reconstruct the ID of reverse
144	* entries by calling getID() of component entries. Some constructors
145	* do not require this because they apply a facade ID anyway.
146	* @param status the error code indicating success or failure
147	*/
148	void CompoundTransliterator::init(const UnicodeString& id,
149	UTransDirection direction,
150	UBool fixReverseID,
151	UErrorCode& status) {
152	// assert(trans == 0);
153
154	if (U_FAILURE(status)) {
155	return;
156	}
157
158	UVector list(status);
159	UnicodeSet* compoundFilter = NULL;
160	UnicodeString regenID;
161	if (!TransliteratorIDParser::parseCompoundID(id, direction,
162	regenID, list, compoundFilter)) {
163	status = U_INVALID_ID;
164	delete compoundFilter;
165	return;
166	}
167
168	TransliteratorIDParser::instantiateList(list, status);
169
170	init(list, direction, fixReverseID, status);
171
172	if (compoundFilter != NULL) {
173	adoptFilter(compoundFilter);
174	}
175	}
176
177	/**
178	* Finish constructing a transliterator: only to be called by
179	* constructors. Before calling init(), set trans and filter to NULL.
180	* @param list a vector of transliterator objects to be adopted. It
181	* should NOT be empty. The list should be in declared order. That
182	* is, it should be in the FORWARD order; if direction is REVERSE then
183	* the list order will be reversed.
184	* @param direction either FORWARD or REVERSE
185	* @param fixReverseID if TRUE, then reconstruct the ID of reverse
186	* entries by calling getID() of component entries. Some constructors
187	* do not require this because they apply a facade ID anyway.
188	* @param status the error code indicating success or failure
189	*/
190	void CompoundTransliterator::init(UVector& list,
191	UTransDirection direction,
192	UBool fixReverseID,
193	UErrorCode& status) {
194	// assert(trans == 0);
195
196	// Allocate array
197	if (U_SUCCESS(status)) {
198	count = list.size();
199	trans = (Transliterator *)uprv_malloc(count sizeof(Transliterator *));
200	/ test for NULL /
201	if (trans == `0`) {
202	status = U_MEMORY_ALLOCATION_ERROR;
203	return;
204	}
205	}
206
207	if (U_FAILURE(status) \|\| trans == `0`) {
208	// assert(trans == 0);
209	return;
210	}
211
212	// Move the transliterators from the vector into an array.
213	// Reverse the order if necessary.
214	int32_t i;
215	for (i=`0`; i<count; ++i) {
216	int32_t j = (direction == UTRANS_FORWARD) ? i : count - `1` - i;
217	trans[i] = (Transliterator*) list.elementAt(j);
218	}
219
220	// If the direction is UTRANS_REVERSE then we may need to fix the
221	// ID.
222	if (direction == UTRANS_REVERSE && fixReverseID) {
223	UnicodeString newID;
224	for (i=`0`; i<count; ++i) {
225	if (i > `0`) {
226	newID.append(ID_DELIM);
227	}
228	newID.append(trans[i]->getID());
229	}
230	setID(newID);
231	}
232
233	computeMaximumContextLength();
234	}
235
236	/**
237	* Return the IDs of the given list of transliterators, concatenated
238	* with ID_DELIM delimiting them. Equivalent to the perlish expression
239	* join(ID_DELIM, map($_.getID(), transliterators).
240	*/
241	UnicodeString CompoundTransliterator::joinIDs(Transliterator* const transliterators[],
242	int32_t transCount) {
243	UnicodeString id;
244	for (int32_t i=`0`; i<transCount; ++i) {
245	if (i > `0`) {
246	id.append(ID_DELIM);
247	}
248	id.append(transliterators[i]->getID());
249	}
250	return id; // Return temporary
251	}
252
253	/**
254	* Copy constructor.
255	*/
256	CompoundTransliterator::CompoundTransliterator(const CompoundTransliterator& t) :
257	Transliterator (t), trans(`0`), count(`0`), numAnonymousRBTs(-`1`) {
258	*this = t;
259	}
260
261	/**
262	* Destructor
263	*/
264	CompoundTransliterator::~CompoundTransliterator() {
265	freeTransliterators();
266	}
267
268	void CompoundTransliterator::freeTransliterators(void) {
269	if (trans != `0`) {
270	for (int32_t i=`0`; i<count; ++i) {
271	delete trans[i];
272	}
273	uprv_free(trans);
274	}
275	trans = `0`;
276	count = `0`;
277	}
278
279	/**
280	* Assignment operator.
281	*/
282	CompoundTransliterator& CompoundTransliterator::operator=(
283	const CompoundTransliterator& t)
284	{
285	Transliterator::operator=(t);
286	int32_t i = `0`;
287	UBool failed = FALSE;
288	if (trans != NULL) {
289	for (i=`0`; i<count; ++i) {
290	delete trans[i];
291	trans[i] = `0`;
292	}
293	}
294	if (t.count > count) {
295	if (trans != NULL) {
296	uprv_free(trans);
297	}
298	trans = (Transliterator *)uprv_malloc(t.count sizeof(Transliterator *));
299	}
300	count = t.count;
301	if (trans != NULL) {
302	for (i=`0`; i<count; ++i) {
303	trans[i] = t.trans[i]->clone();
304	if (trans[i] == NULL) {
305	failed = TRUE;
306	break;
307	}
308	}
309	}
310
311	// if memory allocation failed delete backwards trans array
312	if (failed && i > `0`) {
313	int32_t n;
314	for (n = i-`1`; n >= `0`; n--) {
315	uprv_free(trans[n]);
316	trans[n] = NULL;
317	}
318	}
319	numAnonymousRBTs = t.numAnonymousRBTs;
320	return *this;
321	}
322
323	/**
324	* Transliterator API.
325	*/
326	CompoundTransliterator* CompoundTransliterator::clone() const {
327	return new CompoundTransliterator (*this);
328	}
329
330	/**
331	* Returns the number of transliterators in this chain.
332	* @return number of transliterators in this chain.
333	*/
334	int32_t CompoundTransliterator::getCount(void) const {
335	return count;
336	}
337
338	/**
339	* Returns the transliterator at the given index in this chain.
340	* @param index index into chain, from 0 to <code>getCount() - 1</code>
341	* @return transliterator at the given index
342	*/
343	const Transliterator& CompoundTransliterator::getTransliterator(int32_t index) const {
344	return *trans[index];
345	}
346
347	void CompoundTransliterator::setTransliterators(Transliterator* const transliterators[],
348	int32_t transCount) {
349	Transliterator a = (Transliterator )uprv_malloc(transCount * sizeof(Transliterator *));
350	if (a == NULL) {
351	return;
352	}
353	int32_t i = `0`;
354	UBool failed = FALSE;
355	for (i=`0`; i<transCount; ++i) {
356	a[i] = transliterators[i]->clone();
357	if (a[i] == NULL) {
358	failed = TRUE;
359	break;
360	}
361	}
362	if (failed && i > `0`) {
363	int32_t n;
364	for (n = i-`1`; n >= `0`; n--) {
365	uprv_free(a[n]);
366	a[n] = NULL;
367	}
368	return;
369	}
370	adoptTransliterators(a, transCount);
371	}
372
373	void CompoundTransliterator::adoptTransliterators(Transliterator* adoptedTransliterators[],
374	int32_t transCount) {
375	// First free trans[] and set count to zero. Once this is done,
376	// orphan the filter. Set up the new trans[].
377	freeTransliterators();
378	trans = adoptedTransliterators;
379	count = transCount;
380	computeMaximumContextLength();
381	setID(joinIDs(trans, count));
382	}
383
384	/**
385	* Append c to buf, unless buf is empty or buf already ends in c.
386	*/
387	static void _smartAppend(UnicodeString& buf, UChar c) {
388	if (buf.length() != `0` &&
389	buf.charAt(buf.length() - `1`) != c) {
390	buf.append(c);
391	}
392	}
393
394	UnicodeString& CompoundTransliterator::toRules(UnicodeString& rulesSource,
395	UBool escapeUnprintable) const {
396	// We do NOT call toRules() on our component transliterators, in
397	// general. If we have several rule-based transliterators, this
398	// yields a concatenation of the rules -- not what we want. We do
399	// handle compound RBT transliterators specially -- those for which
400	// compoundRBTIndex >= 0. For the transliterator at compoundRBTIndex,
401	// we do call toRules() recursively.
402	rulesSource.truncate(`0`);
403	if (numAnonymousRBTs >= `1` && getFilter() != NULL) {
404	// If we are a compound RBT and if we have a global
405	// filter, then emit it at the top.
406	UnicodeString pat;
407	rulesSource.append(COLON_COLON, `2`).append(getFilter()->toPattern(pat, escapeUnprintable)).append(ID_DELIM);
408	}
409	for (int32_t i=`0`; i<count; ++i) {
410	UnicodeString rule;
411
412	// Anonymous RuleBasedTransliterators (inline rules and
413	// ::BEGIN/::END blocks) are given IDs that begin with
414	// "%Pass": use toRules() to write all the rules to the output
415	// (and insert "::Null;" if we have two in a row)
416	if (trans[i]->getID().startsWith(PASS_STRING, `5`)) {
417	trans[i]->toRules(rule, escapeUnprintable);
418	if (numAnonymousRBTs > `1` && i > `0` && trans[i - `1`]->getID().startsWith(PASS_STRING, `5`))
419	rule = UNICODE_STRING_SIMPLE("::Null;") + rule;
420
421	// we also use toRules() on CompoundTransliterators (which we
422	// check for by looking for a semicolon in the ID)-- this gets
423	// the list of their child transliterators output in the right
424	// format
425	} else if (trans[i]->getID().indexOf(ID_DELIM) >= `0`) {
426	trans[i]->toRules(rule, escapeUnprintable);
427
428	// for everything else, use Transliterator::toRules()
429	} else {
430	trans[i]->Transliterator::toRules(rule, escapeUnprintable);
431	}
432	_smartAppend(rulesSource, NEWLINE);
433	rulesSource.append(rule);
434	_smartAppend(rulesSource, ID_DELIM);
435	}
436	return rulesSource;
437	}
438
439	/**
440	* Implement Transliterator framework
441	*/
442	void CompoundTransliterator::handleGetSourceSet(UnicodeSet& result) const {
443	UnicodeSet set;
444	result.clear();
445	for (int32_t i=`0`; i<count; ++i) {
446	result.addAll(trans[i]->getSourceSet(set));
447	// Take the example of Hiragana-Latin. This is really
448	// Hiragana-Katakana; Katakana-Latin. The source set of
449	// these two is roughly [:Hiragana:] and [:Katakana:].
450	// But the source set for the entire transliterator is
451	// actually [:Hiragana:] ONLY -- that is, the first
452	// non-empty source set.
453
454	// This is a heuristic, and not 100% reliable.
455	if (!result.isEmpty()) {
456	break;
457	}
458	}
459	}
460
461	/**
462	* Override Transliterator framework
463	*/
464	UnicodeSet& CompoundTransliterator::getTargetSet(UnicodeSet& result) const {
465	UnicodeSet set;
466	result.clear();
467	for (int32_t i=`0`; i<count; ++i) {
468	// This is a heuristic, and not 100% reliable.
469	result.addAll(trans[i]->getTargetSet(set));
470	}
471	return result;
472	}
473
474	/**
475	* Implements {@link Transliterator#handleTransliterate}.
476	*/
477	void CompoundTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
478	UBool incremental) const {
479	/ Call each transliterator with the same contextStart and*
480	* start, but with the limit as modified
481	* by preceding transliterators. The start index must be
482	* reset for each transliterator to give each a chance to
483	* transliterate the text. The initial contextStart index is known
484	* to still point to the same place after each transliterator
485	* is called because each transliterator will not change the
486	* text between contextStart and the initial start index.
487	*
488	* IMPORTANT: After the first transliterator, each subsequent
489	* transliterator only gets to transliterate text committed by
490	* preceding transliterators; that is, the start (output
491	* value) of transliterator i becomes the limit (input value)
492	* of transliterator i+1. Finally, the overall limit is fixed
493	* up before we return.
494	*
495	* Assumptions we make here:
496	* (1) contextStart <= start <= limit <= contextLimit <= text.length()
497	* (2) start <= start' <= limit' ;cursor doesn't move back
498	* (3) start <= limit' ;text before cursor unchanged
499	* - start' is the value of start after calling handleKT
500	* - limit' is the value of limit after calling handleKT
501	*/
502
503	/**
504	* Example: 3 transliterators. This example illustrates the
505	* mechanics we need to implement. C, S, and L are the contextStart,
506	* start, and limit. gl is the globalLimit. contextLimit is
507	* equal to limit throughout.
508	*
509	* 1. h-u, changes hex to Unicode
510	*
511	* 4 7 a d 0 4 7 a
512	* abc/u0061/u => abca/u
513	* C S L C S L gl=f->a
514	*
515	* 2. upup, changes "x" to "XX"
516	*
517	* 4 7 a 4 7 a
518	* abca/u => abcAA/u
519	* C SL C S
520	* L gl=a->b
521	* 3. u-h, changes Unicode to hex
522	*
523	* 4 7 a 4 7 a d 0 3
524	* abcAA/u => abc/u0041/u0041/u
525	* C S L C S
526	* L gl=b->15
527	* 4. return
528	*
529	* 4 7 a d 0 3
530	* abc/u0041/u0041/u
531	* C S L
532	*/
533
534	if (count < `1`) {
535	index.start = index.limit;
536	return; // Short circuit for empty compound transliterators
537	}
538
539	// compoundLimit is the limit value for the entire compound
540	// operation. We overwrite index.limit with the previous
541	// index.start. After each transliteration, we update
542	// compoundLimit for insertions or deletions that have happened.
543	int32_t compoundLimit = index.limit;
544
545	// compoundStart is the start for the entire compound
546	// operation.
547	int32_t compoundStart = index.start;
548
549	int32_t delta = `0`; // delta in length
550
551	// Give each transliterator a crack at the run of characters.
552	// See comments at the top of the method for more detail.
553	for (int32_t i=`0`; i<count; ++i) {
554	index.start = compoundStart; // Reset start
555	int32_t limit = index.limit;
556
557	if (index.start == index.limit) {
558	// Short circuit for empty range
559	break;
560	}
561
562	trans[i]->filteredTransliterate(text, index, incremental);
563
564	// In a properly written transliterator, start == limit after
565	// handleTransliterate() returns when incremental is false.
566	// Catch cases where the subclass doesn't do this, and throw
567	// an exception. (Just pinning start to limit is a bad idea,
568	// because what's probably happening is that the subclass
569	// isn't transliterating all the way to the end, and it should
570	// in non-incremental mode.)
571	if (!incremental && index.start != index.limit) {
572	// We can't throw an exception, so just fudge things
573	index.start = index.limit;
574	}
575
576	// Cumulative delta for insertions/deletions
577	delta += index.limit - limit;
578
579	if (incremental) {
580	// In the incremental case, only allow subsequent
581	// transliterators to modify what has already been
582	// completely processed by prior transliterators. In the
583	// non-incrmental case, allow each transliterator to
584	// process the entire text.
585	index.limit = index.start;
586	}
587	}
588
589	compoundLimit += delta;
590
591	// Start is good where it is -- where the last transliterator left
592	// it. Limit needs to be put back where it was, modulo
593	// adjustments for deletions/insertions.
594	index.limit = compoundLimit;
595	}
596
597	/**
598	* Sets the length of the longest context required by this transliterator.
599	* This is <em>preceding</em> context.
600	*/
601	void CompoundTransliterator::computeMaximumContextLength(void) {
602	int32_t max = `0`;
603	for (int32_t i=`0`; i<count; ++i) {
604	int32_t len = trans[i]->getMaximumContextLength();
605	if (len > max) {
606	max = len;
607	}
608	}
609	setMaximumContextLength(max);
610	}
611
612	U_NAMESPACE_END
613
614	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
615
/* eof */
617

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/cpdtrans.cpp