translit.h source code [ClickHouse/contrib/icu/icu4c/source/i18n/unicode/translit.h]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2014, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/17/99 aliu Creation.
10	**********************************************************************
11	*/
12	#ifndef TRANSLIT_H
13	#define TRANSLIT_H
14
15	#include "unicode/utypes.h"
16
17	#if U_SHOW_CPLUSPLUS_API
18
19	/**
20	* \file
21	* \brief C++ API: Transforms text from one format to another.
22	*/
23
24	#if !UCONFIG_NO_TRANSLITERATION
25
26	#include "unicode/uobject.h"
27	#include "unicode/unistr.h"
28	#include "unicode/parseerr.h"
29	#include "unicode/utrans.h" // UTransPosition, UTransDirection
30	#include "unicode/strenum.h"
31
32	U_NAMESPACE_BEGIN
33
34	class UnicodeFilter;
35	class UnicodeSet;
36	class TransliteratorParser;
37	class NormalizationTransliterator;
38	class TransliteratorIDParser;
39
40	/**
41	*
42	* <code>Transliterator</code> is an abstract class that
43	* transliterates text from one format to another. The most common
44	* kind of transliterator is a script, or alphabet, transliterator.
45	* For example, a Russian to Latin transliterator changes Russian text
46	* written in Cyrillic characters to phonetically equivalent Latin
47	* characters. It does not <em>translate</em> Russian to English!
48	* Transliteration, unlike translation, operates on characters, without
49	* reference to the meanings of words and sentences.
50	*
51	* <p>Although script conversion is its most common use, a
52	* transliterator can actually perform a more general class of tasks.
53	* In fact, <code>Transliterator</code> defines a very general API
54	* which specifies only that a segment of the input text is replaced
55	* by new text. The particulars of this conversion are determined
56	* entirely by subclasses of <code>Transliterator</code>.
57	*
58	* <p><b>Transliterators are stateless</b>
59	*
60	* <p><code>Transliterator</code> objects are <em>stateless</em>; they
61	* retain no information between calls to
62	* <code>transliterate()</code>. (However, this does <em>not</em>
63	* mean that threads may share transliterators without synchronizing
64	* them. Transliterators are not immutable, so they must be
65	* synchronized when shared between threads.) This might seem to
66	* limit the complexity of the transliteration operation. In
67	* practice, subclasses perform complex transliterations by delaying
68	* the replacement of text until it is known that no other
69	* replacements are possible. In other words, although the
70	* <code>Transliterator</code> objects are stateless, the source text
71	* itself embodies all the needed information, and delayed operation
72	* allows arbitrary complexity.
73	*
74	* <p><b>Batch transliteration</b>
75	*
76	* <p>The simplest way to perform transliteration is all at once, on a
77	* string of existing text. This is referred to as <em>batch</em>
78	* transliteration. For example, given a string <code>input</code>
79	* and a transliterator <code>t</code>, the call
80	*
81	* String result = t.transliterate(input);
82	*
83	* will transliterate it and return the result. Other methods allow
84	* the client to specify a substring to be transliterated and to use
85	* {@link Replaceable } objects instead of strings, in order to
86	* preserve out-of-band information (such as text styles).
87	*
88	* <p><b>Keyboard transliteration</b>
89	*
90	* <p>Somewhat more involved is <em>keyboard</em>, or incremental
91	* transliteration. This is the transliteration of text that is
92	* arriving from some source (typically the user's keyboard) one
93	* character at a time, or in some other piecemeal fashion.
94	*
95	* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
96	* stores the text. As text is inserted, as much as possible is
97	* transliterated on the fly. This means a GUI that displays the
98	* contents of the buffer may show text being modified as each new
99	* character arrives.
100	*
101	* <p>Consider the simple rule-based Transliterator:
102	* <pre>
103	* th>{theta}
104	* t>{tau}
105	* </pre>
106	*
107	* When the user types 't', nothing will happen, since the
108	* transliterator is waiting to see if the next character is 'h'. To
109	* remedy this, we introduce the notion of a cursor, marked by a '\|'
110	* in the output string:
111	* <pre>
112	* t>\|{tau}
113	* {tau}h>{theta}
114	* </pre>
115	*
116	* Now when the user types 't', tau appears, and if the next character
117	* is 'h', the tau changes to a theta. This is accomplished by
118	* maintaining a cursor position (independent of the insertion point,
119	* and invisible in the GUI) across calls to
120	* <code>transliterate()</code>. Typically, the cursor will
121	* be coincident with the insertion point, but in a case like the one
122	* above, it will precede the insertion point.
123	*
124	* <p>Keyboard transliteration methods maintain a set of three indices
125	* that are updated with each call to
126	* <code>transliterate()</code>, including the cursor, start,
127	* and limit. Since these indices are changed by the method, they are
128	* passed in an <code>int[]</code> array. The <code>START</code> index
129	* marks the beginning of the substring that the transliterator will
130	* look at. It is advanced as text becomes committed (but it is not
131	* the committed index; that's the <code>CURSOR</code>). The
132	* <code>CURSOR</code> index, described above, marks the point at
133	* which the transliterator last stopped, either because it reached
134	* the end, or because it required more characters to disambiguate
135	* between possible inputs. The <code>CURSOR</code> can also be
136	* explicitly set by rules in a rule-based Transliterator.
137	* Any characters before the <code>CURSOR</code> index are frozen;
138	* future keyboard transliteration calls within this input sequence
139	* will not change them. New text is inserted at the
140	* <code>LIMIT</code> index, which marks the end of the substring that
141	* the transliterator looks at.
142	*
143	* <p>Because keyboard transliteration assumes that more characters
144	* are to arrive, it is conservative in its operation. It only
145	* transliterates when it can do so unambiguously. Otherwise it waits
146	* for more characters to arrive. When the client code knows that no
147	* more characters are forthcoming, perhaps because the user has
148	* performed some input termination operation, then it should call
149	* <code>finishTransliteration()</code> to complete any
150	* pending transliterations.
151	*
152	* <p><b>Inverses</b>
153	*
154	* <p>Pairs of transliterators may be inverses of one another. For
155	* example, if transliterator <b>A</b> transliterates characters by
156	* incrementing their Unicode value (so "abc" -> "def"), and
157	* transliterator <b>B</b> decrements character values, then <b>A</b>
158	* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
159	* with <b>B</b> in a compound transliterator, the result is the
160	* identity transliterator, that is, a transliterator that does not
161	* change its input text.
162	*
163	* The <code>Transliterator</code> method <code>getInverse()</code>
164	* returns a transliterator's inverse, if one exists, or
165	* <code>null</code> otherwise. However, the result of
166	* <code>getInverse()</code> usually will <em>not</em> be a true
167	* mathematical inverse. This is because true inverse transliterators
168	* are difficult to formulate. For example, consider two
169	* transliterators: <b>AB</b>, which transliterates the character 'A'
170	* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
171	* seem that these are exact inverses, since
172	*
173	* \htmlonly<blockquote>\endhtmlonly"A" x <b>AB</b> -> "B"<br>
174	* "B" x <b>BA</b> -> "A"\htmlonly</blockquote>\endhtmlonly
175	*
176	* where 'x' represents transliteration. However,
177	*
178	* \htmlonly<blockquote>\endhtmlonly"ABCD" x <b>AB</b> -> "BBCD"<br>
179	* "BBCD" x <b>BA</b> -> "AACD"\htmlonly</blockquote>\endhtmlonly
180	*
181	* so <b>AB</b> composed with <b>BA</b> is not the
182	* identity. Nonetheless, <b>BA</b> may be usefully considered to be
183	* <b>AB</b>'s inverse, and it is on this basis that
184	* <b>AB</b><code>.getInverse()</code> could legitimately return
185	* <b>BA</b>.
186	*
187	* <p><b>IDs and display names</b>
188	*
189	* <p>A transliterator is designated by a short identifier string or
190	* <em>ID</em>. IDs follow the format <em>source-destination</em>,
191	* where <em>source</em> describes the entity being replaced, and
192	* <em>destination</em> describes the entity replacing
193	* <em>source</em>. The entities may be the names of scripts,
194	* particular sequences of characters, or whatever else it is that the
195	* transliterator converts to or from. For example, a transliterator
196	* from Russian to Latin might be named "Russian-Latin". A
197	* transliterator from keyboard escape sequences to Latin-1 characters
198	* might be named "KeyboardEscape-Latin1". By convention, system
199	* entity names are in English, with the initial letters of words
200	* capitalized; user entity names may follow any format so long as
201	* they do not contain dashes.
202	*
203	* <p>In addition to programmatic IDs, transliterator objects have
204	* display names for presentation in user interfaces, returned by
205	* {@link #getDisplayName }.
206	*
207	* <p><b>Factory methods and registration</b>
208	*
209	* <p>In general, client code should use the factory method
210	* {@link #createInstance } to obtain an instance of a
211	* transliterator given its ID. Valid IDs may be enumerated using
212	* <code>getAvailableIDs()</code>. Since transliterators are mutable,
213	* multiple calls to {@link #createInstance } with the same ID will
214	* return distinct objects.
215	*
216	* <p>In addition to the system transliterators registered at startup,
217	* user transliterators may be registered by calling
218	* <code>registerInstance()</code> at run time. A registered instance
219	* acts as a template; future calls to {@link #createInstance } with the ID
220	* of the registered object return clones of that object. Thus any
221	* object passed to <tt>registerInstance()</tt> must implement
222	* <tt>clone()</tt> properly. To register a transliterator subclass
223	* without instantiating it (until it is needed), users may call
224	* {@link #registerFactory }. In this case, the objects are
225	* instantiated by invoking the zero-argument public constructor of
226	* the class.
227	*
228	* <p><b>Subclassing</b>
229	*
230	* Subclasses must implement the abstract method
231	* <code>handleTransliterate()</code>. <p>Subclasses should override
232	* the <code>transliterate()</code> method taking a
233	* <code>Replaceable</code> and the <code>transliterate()</code>
234	* method taking a <code>String</code> and <code>StringBuffer</code>
235	* if the performance of these methods can be improved over the
236	* performance obtained by the default implementations in this class.
237	*
238	* <p><b>Rule syntax</b>
239	*
240	* <p>A set of rules determines how to perform translations.
241	* Rules within a rule set are separated by semicolons (';').
242	* To include a literal semicolon, prefix it with a backslash ('\').
243	* Unicode Pattern_White_Space is ignored.
244	* If the first non-blank character on a line is '#',
245	* the entire line is ignored as a comment.
246	*
247	* <p>Each set of rules consists of two groups, one forward, and one
248	* reverse. This is a convention that is not enforced; rules for one
249	* direction may be omitted, with the result that translations in
250	* that direction will not modify the source text. In addition,
251	* bidirectional forward-reverse rules may be specified for
252	* symmetrical transformations.
253	*
254	* <p>Note: Another description of the Transliterator rule syntax is available in
255	* <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section
256	* Transform Rules Syntax of UTS #35: Unicode LDML</a>.
257	* The rules are shown there using arrow symbols ← and → and ↔.
258	* ICU supports both those and the equivalent ASCII symbols < and > and <>.
259	*
260	* <p>Rule statements take one of the following forms:
261	*
262	* <dl>
263	* <dt><code>$alefmadda=\\u0622;</code></dt>
264	* <dd><strong>Variable definition.</strong> The name on the
265	* left is assigned the text on the right. In this example,
266	* after this statement, instances of the left hand name,
267	* "<code>$alefmadda</code>", will be replaced by
268	* the Unicode character U+0622. Variable names must begin
269	* with a letter and consist only of letters, digits, and
270	* underscores. Case is significant. Duplicate names cause
271	* an exception to be thrown, that is, variables cannot be
272	* redefined. The right hand side may contain well-formed
273	* text of any length, including no text at all ("<code>$empty=;</code>").
274	* The right hand side may contain embedded <code>UnicodeSet</code>
275	* patterns, for example, "<code>$softvowel=[eiyEIY]</code>".</dd>
276	* <dt><code>ai>$alefmadda;</code></dt>
277	* <dd><strong>Forward translation rule.</strong> This rule
278	* states that the string on the left will be changed to the
279	* string on the right when performing forward
280	* transliteration.</dd>
281	* <dt><code>ai<$alefmadda;</code></dt>
282	* <dd><strong>Reverse translation rule.</strong> This rule
283	* states that the string on the right will be changed to
284	* the string on the left when performing reverse
285	* transliteration.</dd>
286	* </dl>
287	*
288	* <dl>
289	* <dt><code>ai<>$alefmadda;</code></dt>
290	* <dd><strong>Bidirectional translation rule.</strong> This
291	* rule states that the string on the left will be changed
292	* to the string on the right when performing forward
293	* transliteration, and vice versa when performing reverse
294	* transliteration.</dd>
295	* </dl>
296	*
297	* <p>Translation rules consist of a <em>match pattern</em> and an <em>output
298	* string</em>. The match pattern consists of literal characters,
299	* optionally preceded by context, and optionally followed by
300	* context. Context characters, like literal pattern characters,
301	* must be matched in the text being transliterated. However, unlike
302	* literal pattern characters, they are not replaced by the output
303	* text. For example, the pattern "<code>abc{def}</code>"
304	* indicates the characters "<code>def</code>" must be
305	* preceded by "<code>abc</code>" for a successful match.
306	* If there is a successful match, "<code>def</code>" will
307	* be replaced, but not "<code>abc</code>". The final '<code>}</code>'
308	* is optional, so "<code>abc{def</code>" is equivalent to
309	* "<code>abc{def}</code>". Another example is "<code>{123}456</code>"
310	* (or "<code>123}456</code>") in which the literal
311	* pattern "<code>123</code>" must be followed by "<code>456</code>".
312	*
313	* <p>The output string of a forward or reverse rule consists of
314	* characters to replace the literal pattern characters. If the
315	* output string contains the character '<code>\|</code>', this is
316	* taken to indicate the location of the <em>cursor</em> after
317	* replacement. The cursor is the point in the text at which the
318	* next replacement, if any, will be applied. The cursor is usually
319	* placed within the replacement text; however, it can actually be
320	* placed into the preceding or following context by using the
321	* special character '@'. Examples:
322	*
323	* <pre>
324	* a {foo} z > \| @ bar; # foo -> bar, move cursor before a
325	* {foo} xyz > bar @@\|; # foo -> bar, cursor between y and z
326	* </pre>
327	*
328	* <p><b>UnicodeSet</b>
329	*
330	* <p><code>UnicodeSet</code> patterns may appear anywhere that
331	* makes sense. They may appear in variable definitions.
332	* Contrariwise, <code>UnicodeSet</code> patterns may themselves
333	* contain variable references, such as "<code>$a=[a-z];$not_a=[^$a]</code>",
334	* or "<code>$range=a-z;$ll=[$range]</code>".
335	*
336	* <p><code>UnicodeSet</code> patterns may also be embedded directly
337	* into rule strings. Thus, the following two rules are equivalent:
338	*
339	* <pre>
340	* $vowel=[aeiou]; $vowel>'*'; # One way to do this
341	* [aeiou]>'*'; # Another way
342	* </pre>
343	*
344	* <p>See {@link UnicodeSet} for more documentation and examples.
345	*
346	* <p><b>Segments</b>
347	*
348	* <p>Segments of the input string can be matched and copied to the
349	* output string. This makes certain sets of rules simpler and more
350	* general, and makes reordering possible. For example:
351	*
352	* <pre>
353	* ([a-z]) > $1 $1; # double lowercase letters
354	* ([:Lu:]) ([:Ll:]) > $2 $1; # reverse order of Lu-Ll pairs
355	* </pre>
356	*
357	* <p>The segment of the input string to be copied is delimited by
358	* "<code>(</code>" and "<code>)</code>". Up to
359	* nine segments may be defined. Segments may not overlap. In the
360	* output string, "<code>$1</code>" through "<code>$9</code>"
361	* represent the input string segments, in left-to-right order of
362	* definition.
363	*
364	* <p><b>Anchors</b>
365	*
366	* <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
367	* special characters '<code>^</code>' and '<code>$</code>'. For example:
368	*
369	* <pre>
370	* ^ a   > 'BEG_A';   # match 'a' at start of text
371	*   a   > 'A'; # match other instances of 'a'
372	*   z $ > 'END_Z';   # match 'z' at end of text
373	*   z   > 'Z';       # match other instances of 'z'
374	* </pre>
375	*
376	* <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
377	* This is done by including a virtual anchor character '<code>$</code>' at the end of the
378	* set pattern. Although this is usually the match character for the end anchor, the set will
379	* match either the beginning or the end of the text, depending on its placement. For
380	* example:
381	*
382	* <pre>
383	* $x = [a-z$];   # match 'a' through 'z' OR anchor
384	* $x 1    > 2;   # match '1' after a-z or at the start
385	*    3 $x > 4;   # match '3' before a-z or at the end
386	* </pre>
387	*
388	* <p><b>Example</b>
389	*
390	* <p>The following example rules illustrate many of the features of
391	* the rule language.
392	*
393	* <table border="0" cellpadding="4">
394	* <tr>
395	* <td style="vertical-align: top;">Rule 1.</td>
396	* <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}>x\|y</code></td>
397	* </tr>
398	* <tr>
399	* <td style="vertical-align: top;">Rule 2.</td>
400	* <td style="vertical-align: top; white-space: nowrap;"><code>xyz>r</code></td>
401	* </tr>
402	* <tr>
403	* <td style="vertical-align: top;">Rule 3.</td>
404	* <td style="vertical-align: top; white-space: nowrap;"><code>yz>q</code></td>
405	* </tr>
406	* </table>
407	*
408	* <p>Applying these rules to the string "<code>adefabcdefz</code>"
409	* yields the following results:
410	*
411	* <table border="0" cellpadding="4">
412	* <tr>
413	* <td style="vertical-align: top; white-space: nowrap;"><code>\|adefabcdefz</code></td>
414	* <td style="vertical-align: top;">Initial state, no rules match. Advance
415	* cursor.</td>
416	* </tr>
417	* <tr>
418	* <td style="vertical-align: top; white-space: nowrap;"><code>a\|defabcdefz</code></td>
419	* <td style="vertical-align: top;">Still no match. Rule 1 does not match
420	* because the preceding context is not present.</td>
421	* </tr>
422	* <tr>
423	* <td style="vertical-align: top; white-space: nowrap;"><code>ad\|efabcdefz</code></td>
424	* <td style="vertical-align: top;">Still no match. Keep advancing until
425	* there is a match...</td>
426	* </tr>
427	* <tr>
428	* <td style="vertical-align: top; white-space: nowrap;"><code>ade\|fabcdefz</code></td>
429	* <td style="vertical-align: top;">...</td>
430	* </tr>
431	* <tr>
432	* <td style="vertical-align: top; white-space: nowrap;"><code>adef\|abcdefz</code></td>
433	* <td style="vertical-align: top;">...</td>
434	* </tr>
435	* <tr>
436	* <td style="vertical-align: top; white-space: nowrap;"><code>adefa\|bcdefz</code></td>
437	* <td style="vertical-align: top;">...</td>
438	* </tr>
439	* <tr>
440	* <td style="vertical-align: top; white-space: nowrap;"><code>adefab\|cdefz</code></td>
441	* <td style="vertical-align: top;">...</td>
442	* </tr>
443	* <tr>
444	* <td style="vertical-align: top; white-space: nowrap;"><code>adefabc\|defz</code></td>
445	* <td style="vertical-align: top;">Rule 1 matches; replace "<code>def</code>"
446	* with "<code>xy</code>" and back up the cursor
447	* to before the '<code>y</code>'.</td>
448	* </tr>
449	* <tr>
450	* <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx\|yz</code></td>
451	* <td style="vertical-align: top;">Although "<code>xyz</code>" is
452	* present, rule 2 does not match because the cursor is
453	* before the '<code>y</code>', not before the '<code>x</code>'.
454	* Rule 3 does match. Replace "<code>yz</code>"
455	* with "<code>q</code>".</td>
456	* </tr>
457	* <tr>
458	* <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq\|</code></td>
459	* <td style="vertical-align: top;">The cursor is at the end;
460	* transliteration is complete.</td>
461	* </tr>
462	* </table>
463	*
464	* <p>The order of rules is significant. If multiple rules may match
465	* at some point, the first matching rule is applied.
466	*
467	* <p>Forward and reverse rules may have an empty output string.
468	* Otherwise, an empty left or right hand side of any statement is a
469	* syntax error.
470	*
471	* <p>Single quotes are used to quote any character other than a
472	* digit or letter. To specify a single quote itself, inside or
473	* outside of quotes, use two single quotes in a row. For example,
474	* the rule "<code>'>'>o''clock</code>" changes the
475	* string "<code>></code>" to the string "<code>o'clock</code>".
476	*
477	* <p><b>Notes</b>
478	*
479	* <p>While a Transliterator is being built from rules, it checks that
480	* the rules are added in proper order. For example, if the rule
481	* "a>x" is followed by the rule "ab>y",
482	* then the second rule will throw an exception. The reason is that
483	* the second rule can never be triggered, since the first rule
484	* always matches anything it matches. In other words, the first
485	* rule <em>masks</em> the second rule.
486	*
487	* @author Alan Liu
488	* @stable ICU 2.0
489	*/
490	class U_I18N_API Transliterator : public UObject {
491
492	private:
493
494	/**
495	* Programmatic name (the transliterator's ID), e.g., "Latin-Arabic".
496	*/
497	UnicodeString ID;
498
499	/**
500	* This transliterator's filter. Any character for which
501	* <tt>filter.contains()</tt> returns <tt>false</tt> will not be
502	* altered by this transliterator. If <tt>filter</tt> is
503	* <tt>null</tt>, then no filtering is applied.
504	*/
505	UnicodeFilter* filter;
506
507	int32_t maximumContextLength; // NOTE(review): undocumented here; presumably the longest context a rule may need - confirm
508
509	public:
510
511	/**
512	* Context value handed to a factory function: an integer or a pointer
513	* sharing the same storage (only one is meaningful at a time), passed by value.
514	* @stable ICU 2.4
515	*/
516	union Token {
517	/**
518	* This token, interpreted as a 32-bit integer.
519	* @stable ICU 2.4
520	*/
521	int32_t integer;
522	/**
523	* This token, interpreted as a native (untyped) pointer.
524	* @stable ICU 2.4
525	*/
526	void* pointer;
527	};
528
529	#ifndef U_HIDE_INTERNAL_API
530	/**
531	* Return a token containing an integer. Static helper taking one unnamed <code>int32_t</code> argument.
532	* @return a token containing an integer.
533	* @internal
534	*/
535	inline static Token integerToken(int32_t);
536
537	/**
538	* Return a token containing a pointer. Static helper taking one unnamed <code>void*</code> argument.
539	* @return a token containing a pointer.
540	* @internal
541	*/
542	inline static Token pointerToken(void*);
543	#endif /* U_HIDE_INTERNAL_API */
544
545	/**
546	* Pointer to a function that creates and returns a Transliterator. When
547	* invoked, it will be passed the ID string that is being
548	* instantiated, together with the context token that was passed
549	* in when the factory function was first registered. Many
550	* factory functions will ignore both parameters; however,
551	* functions that are registered to more than one ID may use the
552	* ID or the context parameter to parameterize the transliterator
553	* they create.
554	* @param ID the string identifier for this transliterator
555	* @param context a context token (integer or pointer) that will be stored and
556	* later passed to the factory function when an ID matching
557	* the registration ID is being instantiated with this factory.
558	* @stable ICU 2.4
559	*/
560	typedef Transliterator* (U_EXPORT2 *Factory)(const UnicodeString& ID, Token context);
561
562	protected:
563
564	/**
565	* Constructs a transliterator from an ID and an optional filter.
566	* @param ID the string identifier for this transliterator
567	* @param adoptedFilter the filter. Any character for which
568	* <tt>filter.contains()</tt> returns <tt>false</tt> will not be
569	* altered by this transliterator. If <tt>adoptedFilter</tt> is
570	* <tt>null</tt>, then no filtering is applied.
571	* @stable ICU 2.4
572	*/
573	Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
574
575	/**
576	* Copy constructor. Declared in the protected section, so outside code cannot call it directly.
577	* @stable ICU 2.4
578	*/
579	Transliterator(const Transliterator&);
580
581	/**
582	* Assignment operator. Declared in the protected section, so outside code cannot call it directly.
583	* @stable ICU 2.4
584	*/
585	Transliterator& operator=(const Transliterator&);
586
587	/**
588	* Create a transliterator from a basic ID. This is an ID
589	* containing only the forward-direction source, target, and
590	* variant.
591	* @param id a basic ID of the form S-T or S-T/V (source-target, optionally followed by /variant).
592	* @param canon canonical ID to assign to the object, or
593	* NULL to leave the ID unchanged
594	* @return a newly created Transliterator, or null if the ID is
595	* invalid.
596	* @stable ICU 2.4
597	*/
598	static Transliterator* createBasicInstance(const UnicodeString& id,
599	const UnicodeString* canon);
600
601	friend class TransliteratorParser; // for parseID()
602	friend class TransliteratorIDParser; // for createBasicInstance()
603	friend class TransliteratorAlias; // for setID()
604
605	public:
606
607	/**
608	* Destructor. Virtual, so deleting through a Transliterator pointer runs the subclass destructor.
609	* @stable ICU 2.0
610	*/
611	virtual ~Transliterator();
612
613	/**
614	* Creates a copy of this transliterator.
615	* All subclasses are encouraged to implement this method if it is
616	* possible and reasonable to do so. Subclasses that are to be
617	* registered with the system using <tt>registerInstance()</tt>
618	* are required to implement this method. If a subclass does not
619	* implement clone() properly and is registered with the system
620	* using registerInstance(), then the default clone() implementation
621	* will return null, and calls to createInstance() will fail.
622	*
623	* @return a copy of the object (null from the default implementation, as noted above).
624	* @see #registerInstance
625	* @stable ICU 2.0
626	*/
627	virtual Transliterator* clone() const;
628
629	/**
630	* Transliterates a segment of a string in place, with optional filtering.
631	*
632	* @param text the string to be transliterated; it receives the result
633	* @param start the beginning index, inclusive; <code>0 <= start
634	* <= limit</code>.
635	* @param limit the ending index, exclusive; <code>start <= limit
636	* <= text.length()</code>.
637	* @return The new limit index. The text previously occupying <code>[start,
638	* limit)</code> has been transliterated, possibly to a string of a different
639	* length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
640	* <em>new-limit</em> is the return value. If the input offsets are out of bounds,
641	* the returned value is -1 and the input string remains unchanged.
642	* @stable ICU 2.0
643	*/
644	virtual int32_t transliterate(Replaceable& text,
645	int32_t start, int32_t limit) const;
646
647	/**
648	* Transliterates an entire string in place. Convenience method that takes no start/limit range.
649	* @param text the string to be transliterated; it receives the result
650	* @stable ICU 2.0
651	*/
652	virtual void transliterate(Replaceable& text) const;
653
654	/**
655	* Transliterates the portion of the text buffer that can be
656	* transliterated unambiguously after new text has been inserted,
657	* typically as a result of a keyboard event. The new text in
658	* <code>insertion</code> will be inserted into <code>text</code>
659	* at <code>index.limit</code>, advancing
660	* <code>index.limit</code> by <code>insertion.length()</code>.
661	* Then the transliterator will try to transliterate characters of
662	* <code>text</code> between <code>index.cursor</code> and
663	* <code>index.limit</code>. Characters before
664	* <code>index.cursor</code> will not be changed.
665	*
666	* <p>Upon return, values in <code>index</code> will be updated.
667	* <code>index.start</code> will be advanced to the first
668	* character that future calls to this method will read.
669	* <code>index.cursor</code> and <code>index.limit</code> will
670	* be adjusted to delimit the range of text that future calls to
671	* this method may change.
672	*
673	* <p>Typical usage of this method begins with an initial call
674	* with <code>index.start</code> and <code>index.limit</code>
675	* set to indicate the portion of <code>text</code> to be
676	* transliterated, and <code>index.cursor == index.start</code>.
677	* Thereafter, <code>index</code> can be used without
678	* modification in future calls, provided that all changes to
679	* <code>text</code> are made via this method.
680	*
681	* <p>This method assumes that future calls may be made that will
682	* insert new text into the buffer. As a result, it only performs
683	* unambiguous transliterations. After the last call to this
684	* method, there may be untransliterated text that is waiting for
685	* more input to resolve an ambiguity. In order to perform these
686	* pending transliterations, clients should call {@link
687	* #finishTransliteration } after the last call to this
688	* method has been made.
689	*
690	* @param text the buffer holding transliterated and untransliterated text
691	* @param index a UTransPosition holding the three indices described below.
692	*
693	* <ul><li><code>index.start</code>: the beginning index,
694	* inclusive; <code>0 <= index.start <= index.limit</code>.
695	*
696	* <li><code>index.limit</code>: the ending index, exclusive;
697	* <code>index.start <= index.limit <= text.length()</code>.
698	* <code>insertion</code> is inserted at
699	* <code>index.limit</code>.
700	*
701	* <li><code>index.cursor</code>: the next character to be
702	* considered for transliteration; <code>index.start <=
703	* index.cursor <= index.limit</code>. Characters before
704	* <code>index.cursor</code> will not be changed by future calls
705	* to this method.</ul>
706	*
707	* @param insertion text to be inserted and possibly
708	* transliterated into the translation buffer at
709	* <code>index.limit</code>. If <code>null</code> then no text
710	* is inserted.
711	* @param status Output param to be filled in with a success or an error.
712	* @see #handleTransliterate
713	* @exception IllegalArgumentException if <code>index</code>
714	* is invalid
715	* @see UTransPosition
716	* @stable ICU 2.0
717	*/
718	virtual void transliterate(Replaceable& text, UTransPosition& index,
719	const UnicodeString& insertion,
720	UErrorCode& status) const;
721
722	/**
723	* Transliterates the portion of the text buffer that can be
724	* transliterated unambiguously after a new character has been
725	* inserted, typically as a result of a keyboard event. This is a
726	* convenience method.
727	* @param text the buffer holding transliterated and
728	* untransliterated text
729	* @param index the UTransPosition holding the current indices.
730	* @param insertion the character to be inserted and possibly
731	* transliterated into the translation buffer at
732	* <code>index.limit</code>.
733	* @param status Output param to be filled in with a success or an error.
734	* @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const
735	* @stable ICU 2.0
736	*/
737	virtual void transliterate(Replaceable& text, UTransPosition& index,
738	UChar32 insertion,
739	UErrorCode& status) const;
740
741	/**
742	* Transliterates the portion of the text buffer that can be
743	* transliterated unambiguously. This is a convenience method; see
744	* {@link
745	* #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }
746	* for details.
747	* @param text the buffer holding transliterated and
748	* untransliterated text
749	* @param index an array of three integers.
750	* @param status Output param to be filled in with a success or an error code.
751	* @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode &) const
752	* @stable ICU 2.0
753	*/
754	virtual void transliterate(Replaceable& text, UTransPosition& index,
755	UErrorCode& status) const;
756
757	/**
758	* Finishes any pending transliterations that were waiting for
759	* more characters. Clients should call this method as the last
760	* call after a sequence of one or more calls to
761	* <code>transliterate()</code>.
762	* @param text the buffer holding transliterated and
763	* untransliterated text.
764	* @param index the array of indices previously passed to {@link
765	* #transliterate }
766	* @stable ICU 2.0
767	*/
768	virtual void finishTransliteration(Replaceable& text,
769	UTransPosition& index) const;
770
771	private:
772
773	/**
774	* This internal method does incremental transliteration. If the
775	* 'insertion' is non-null then we append it to 'text' before
776	* proceeding. This method calls through to the pure virtual
777	* framework method handleTransliterate() to do the actual
778	* work.
779	* @param text the buffer holding transliterated and
780	* untransliterated text
781	* @param index an array of three integers. See {@link
782	* #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }.
783	* @param insertion text to be inserted and possibly
784	* transliterated into the translation buffer at
785	* <code>index.limit</code>.
786	* @param status Output param to be filled in with a success or an error code.
787	*/
788	void _transliterate(Replaceable& text,
789	UTransPosition& index,
790	const UnicodeString* insertion,
791	UErrorCode &status) const;
792
793	protected:
794
795	/**
796	* Abstract method that concrete subclasses define to implement
797	* their transliteration algorithm. This method handles both
798	* incremental and non-incremental transliteration. Let
799	* <code>originalStart</code> refer to the value of
800	* <code>pos.start</code> upon entry.
801	*
802	* <ul>
803	* <li>If <code>incremental</code> is false, then this method
804	* should transliterate all characters between
805	* <code>pos.start</code> and <code>pos.limit</code>. Upon return
806	* <code>pos.start</code> must == <code> pos.limit</code>.</li>
807	*
808	* <li>If <code>incremental</code> is true, then this method
809	* should transliterate all characters between
810	* <code>pos.start</code> and <code>pos.limit</code> that can be
811	* unambiguously transliterated, regardless of future insertions
812	* of text at <code>pos.limit</code>. Upon return,
813	* <code>pos.start</code> should be in the range
814	* [<code>originalStart</code>, <code>pos.limit</code>).
815	* <code>pos.start</code> should be positioned such that
816	* characters [<code>originalStart</code>, <code>
817	* pos.start</code>) will not be changed in the future by this
818	* transliterator and characters [<code>pos.start</code>,
819	* <code>pos.limit</code>) are unchanged.</li>
820	* </ul>
821	*
822	* <p>Implementations of this method should also obey the
823	* following invariants:</p>
824	*
825	* <ul>
826	* <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
827	* should be updated to reflect changes in length of the text
828	* between <code>pos.start</code> and <code>pos.limit</code>. The
829	* difference <code> pos.contextLimit - pos.limit</code> should
830	* not change.</li>
831	*
832	* <li><code>pos.contextStart</code> should not change.</li>
833	*
834	* <li>Upon return, neither <code>pos.start</code> nor
835	* <code>pos.limit</code> should be less than
836	* <code>originalStart</code>.</li>
837	*
838	* <li>Text before <code>originalStart</code> and text after
839	* <code>pos.limit</code> should not change.</li>
840	*
841	* <li>Text before <code>pos.contextStart</code> and text after
842	* <code> pos.contextLimit</code> should be ignored.</li>
843	* </ul>
844	*
845	* <p>Subclasses may safely assume that all characters in
846	* [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
847	* In other words, the filter has already been applied by the time
848	* this method is called. See
849	* <code>filteredTransliterate()</code>.
850	*
851	* <p>This method is <b>not</b> for public consumption. Calling
852	* this method directly will transliterate
853	* [<code>pos.start</code>, <code>pos.limit</code>) without
854	* applying the filter. End user code should call <code>
855	* transliterate()</code> instead of this method. Subclass code
856	* and wrapping transliterators should call
857	* <code>filteredTransliterate()</code> instead of this method.<p>
858	*
859	* @param text the buffer holding transliterated and
860	* untransliterated text
861	*
862	* @param pos the indices indicating the start, limit, context
863	* start, and context limit of the text.
864	*
865	* @param incremental if true, assume more text may be inserted at
866	* <code>pos.limit</code> and act accordingly. Otherwise,
867	* transliterate all text between <code>pos.start</code> and
868	* <code>pos.limit</code> and move <code>pos.start</code> up to
869	* <code>pos.limit</code>.
870	*
871	* @see #transliterate
872	* @stable ICU 2.4
873	*/
874	virtual void handleTransliterate(Replaceable& text,
875	UTransPosition& pos,
876	UBool incremental) const = `0`;
877
878	public:
879	/**
880	* Transliterate a substring of text, as specified by index, taking filters
881	* into account. This method is for subclasses that need to delegate to
882	* another transliterator.
883	* @param text the text to be transliterated
884	* @param index the position indices
885	* @param incremental if TRUE, then assume more characters may be inserted
886	* at index.limit, and postpone processing to accommodate future incoming
887	* characters
888	* @stable ICU 2.4
889	*/
890	virtual void filteredTransliterate(Replaceable& text,
891	UTransPosition& index,
892	UBool incremental) const;
893
894	private:
895
896	/**
897	* Top-level transliteration method, handling filtering, incremental and
898	* non-incremental transliteration, and rollback. All transliteration
899	* public API methods eventually call this method with a rollback argument
900	* of TRUE. Other entities may call this method but rollback should be
901	* FALSE.
902	*
903	* <p>If this transliterator has a filter, break up the input text into runs
904	* of unfiltered characters. Pass each run to
905	* subclass.handleTransliterate().
906	*
907	* <p>In incremental mode, if rollback is TRUE, perform a special
908	* incremental procedure in which several passes are made over the input
909	* text, adding one character at a time, and committing successful
910	* transliterations as they occur. Unsuccessful transliterations are rolled
911	* back and retried with additional characters to give correct results.
912	*
913	* @param text the text to be transliterated
914	* @param index the position indices
915	* @param incremental if TRUE, then assume more characters may be inserted
916	* at index.limit, and postpone processing to accommodate future incoming
917	* characters
918	* @param rollback if TRUE and if incremental is TRUE, then perform special
919	* incremental processing, as described above, and undo partial
920	* transliterations where necessary. If incremental is FALSE then this
921	* parameter is ignored.
922	*/
923	virtual void filteredTransliterate(Replaceable& text,
924	UTransPosition& index,
925	UBool incremental,
926	UBool rollback) const;
927
928	public:
929
930	/**
931	* Returns the length of the longest context required by this transliterator.
932	* This is <em>preceding</em> context. The default implementation supplied
933	* by <code>Transliterator</code> returns zero; subclasses
934	* that use preceding context should override this method to return the
935	* correct value. For example, if a transliterator translates "ddd" (where
936	* d is any digit) to "555" when preceded by "(ddd)", then the preceding
937	* context length is 5, the length of "(ddd)".
938	*
939	* @return The maximum number of preceding context characters this
940	* transliterator needs to examine
941	* @stable ICU 2.0
942	*/
943	int32_t getMaximumContextLength(void) const;
944
945	protected:
946
947	/**
948	* Method for subclasses to use to set the maximum context length.
949	* @param maxContextLength the new value to be set.
950	* @see #getMaximumContextLength
951	* @stable ICU 2.4
952	*/
953	void setMaximumContextLength(int32_t maxContextLength);
954
955	public:
956
957	/**
958	* Returns a programmatic identifier for this transliterator.
959	* If this identifier is passed to <code>createInstance()</code>, it
960	* will return this object, if it has been registered.
961	* @return a programmatic identifier for this transliterator.
962	* @see #registerInstance
963	* @see #registerFactory
964	* @see #getAvailableIDs
965	* @stable ICU 2.0
966	*/
967	virtual const UnicodeString& getID(void) const;
968
969	/**
970	* Returns a name for this transliterator that is appropriate for
971	* display to the user in the default locale. See {@link
972	* #getDisplayName } for details.
973	* @param ID the string identifier for this transliterator
974	* @param result Output param to receive the display name
975	* @return A reference to 'result'.
976	* @stable ICU 2.0
977	*/
978	static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
979	UnicodeString& result);
980
981	/**
982	* Returns a name for this transliterator that is appropriate for
983	* display to the user in the given locale. This name is taken
984	* from the locale resource data in the standard manner of the
985	* <code>java.text</code> package.
986	*
987	* <p>If no localized names exist in the system resource bundles,
988	* a name is synthesized using a localized
989	* <code>MessageFormat</code> pattern from the resource data. The
990	* arguments to this pattern are an integer followed by one or two
991	* strings. The integer is the number of strings, either 1 or 2.
992	* The strings are formed by splitting the ID for this
993	* transliterator at the first '-'. If there is no '-', then the
994	* entire ID forms the only string.
995	* @param ID the string identifier for this transliterator
996	* @param inLocale the Locale in which the display name should be
997	* localized.
998	* @param result Output param to receive the display name
999	* @return A reference to 'result'.
1000	* @stable ICU 2.0
1001	*/
1002	static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
1003	const Locale& inLocale,
1004	UnicodeString& result);
1005
1006	/**
1007	* Returns the filter used by this transliterator, or <tt>NULL</tt>
1008	* if this transliterator uses no filter.
1009	* @return the filter used by this transliterator, or <tt>NULL</tt>
1010	* if this transliterator uses no filter.
1011	* @stable ICU 2.0
1012	*/
1013	const UnicodeFilter* getFilter(void) const;
1014
1015	/**
1016	* Returns the filter used by this transliterator, or <tt>NULL</tt> if this
1017	* transliterator uses no filter. The caller must eventually delete the
1018	* result. After this call, this transliterator's filter is set to
1019	* <tt>NULL</tt>.
1020	* @return the filter used by this transliterator, or <tt>NULL</tt> if this
1021	* transliterator uses no filter.
1022	* @stable ICU 2.4
1023	*/
1024	UnicodeFilter* orphanFilter(void);
1025
1026	/**
1027	* Changes the filter used by this transliterator. If the filter
1028	* is set to <tt>null</tt> then no filtering will occur.
1029	*
1030	* <p>Callers must take care if a transliterator is in use by
1031	* multiple threads. The filter should not be changed by one
1032	* thread while another thread may be transliterating.
1033	* @param adoptedFilter the new filter to be adopted.
1034	* @stable ICU 2.0
1035	*/
1036	void adoptFilter(UnicodeFilter* adoptedFilter);
1037
1038	/**
1039	* Returns this transliterator's inverse. See the class
1040	* documentation for details. This implementation simply inverts
1041	* the two entities in the ID and attempts to retrieve the
1042	* resulting transliterator. That is, if <code>getID()</code>
1043	* returns "A-B", then this method will return the result of
1044	* <code>createInstance("B-A")</code>, or <code>null</code> if that
1045	* call fails.
1046	*
1047	* <p>Subclasses with knowledge of their inverse may wish to
1048	* override this method.
1049	*
1050	* @param status Output param to be filled in with a success or an error code.
1051	* @return a transliterator that is an inverse, not necessarily
1052	* exact, of this transliterator, or <code>null</code> if no such
1053	* transliterator is registered.
1054	* @see #registerInstance
1055	* @stable ICU 2.0
1056	*/
1057	Transliterator* createInverse(UErrorCode& status) const;
1058
1059	/**
1060	* Returns a <code>Transliterator</code> object given its ID.
1061	* The ID must be either a system transliterator ID or an ID registered
1062	* using <code>registerInstance()</code>.
1063	*
1064	* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1065	* @param dir either FORWARD or REVERSE.
1066	* @param parseError Struct to receive information on position
1067	* of error if an error is encountered
1068	* @param status Output param to be filled in with a success or an error code.
1069	* @return A <code>Transliterator</code> object with the given ID
1070	* @see #registerInstance
1071	* @see #getAvailableIDs
1072	* @see #getID
1073	* @stable ICU 2.0
1074	*/
1075	static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
1076	UTransDirection dir,
1077	UParseError& parseError,
1078	UErrorCode& status);
1079
1080	/**
1081	* Returns a <code>Transliterator</code> object given its ID.
1082	* The ID must be either a system transliterator ID or an ID registered
1083	* using <code>registerInstance()</code>.
1084	* @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
1085	* @param dir either FORWARD or REVERSE.
1086	* @param status Output param to be filled in with a success or an error code.
1087	* @return A <code>Transliterator</code> object with the given ID
1088	* @stable ICU 2.0
1089	*/
1090	static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
1091	UTransDirection dir,
1092	UErrorCode& status);
1093
1094	/**
1095	* Returns a <code>Transliterator</code> object constructed from
1096	* the given rule string. This will be a rule-based Transliterator,
1097	* if the rule string contains only rules, or a
1098	* compound Transliterator, if it contains ID blocks, or a
1099	* null Transliterator, if it contains ID blocks which parse as
1100	* empty for the given direction.
1101	*
1102	* @param ID the id for the transliterator.
1103	* @param rules rules, separated by ';'
1104	* @param dir either FORWARD or REVERSE.
1105	* @param parseError Struct to receive information on position
1106	* of error if an error is encountered
1107	* @param status Output param set to success/failure code.
1108	* @return a newly created Transliterator
1109	* @stable ICU 2.0
1110	*/
1111	static Transliterator* U_EXPORT2 createFromRules(const UnicodeString& ID,
1112	const UnicodeString& rules,
1113	UTransDirection dir,
1114	UParseError& parseError,
1115	UErrorCode& status);
1116
1117	/**
1118	* Create a rule string that can be passed to createFromRules()
1119	* to recreate this transliterator.
1120	* @param result the string to receive the rules. Previous
1121	* contents will be deleted.
1122	* @param escapeUnprintable if TRUE then convert unprintable
1123	* character to their hex escape representations, \\uxxxx or
1124	* \\Uxxxxxxxx. Unprintable characters are those other than
1125	* U+000A, U+0020..U+007E.
1126	* @stable ICU 2.0
1127	*/
1128	virtual UnicodeString& toRules(UnicodeString& result,
1129	UBool escapeUnprintable) const;
1130
1131	/**
1132	* Return the number of elements that make up this transliterator.
1133	* For example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1134	* were created, the return value of this method would be 3.
1135	*
1136	* <p>If this transliterator is not composed of other
1137	* transliterators, then this method returns 1.
1138	* @return the number of transliterators that compose this
1139	* transliterator, or 1 if this transliterator is not composed of
1140	* multiple transliterators
1141	* @stable ICU 3.0
1142	*/
1143	int32_t countElements() const;
1144
1145	/**
1146	* Return an element that makes up this transliterator. For
1147	* example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
1148	* were created, the return value of this method would be one
1149	* of the three transliterator objects that make up that
1150	* transliterator: [NFD, Jamo-Latin, Latin-Greek].
1151	*
1152	* <p>If this transliterator is not composed of other
1153	* transliterators, then this method will return a reference to
1154	* this transliterator when given the index 0.
1155	* @param index a value from 0..countElements()-1 indicating the
1156	* transliterator to return
1157	* @param ec input-output error code
1158	* @return one of the transliterators that makes up this
1159	* transliterator, if this transliterator is made up of multiple
1160	* transliterators, otherwise a reference to this object if given
1161	* an index of 0
1162	* @stable ICU 3.0
1163	*/
1164	const Transliterator& getElement(int32_t index, UErrorCode& ec) const;
1165
1166	/**
1167	* Returns the set of all characters that may be modified in the
1168	* input text by this Transliterator. This incorporates this
1169	* object's current filter; if the filter is changed, the return
1170	* value of this function will change. The default implementation
1171	* returns an empty set. Some subclasses may override {@link
1172	* #handleGetSourceSet } to return a more precise result. The
1173	* return result is approximate in any case and is intended for
1174	* use by tests, tools, or utilities.
1175	* @param result receives result set; previous contents lost
1176	* @return a reference to result
1177	* @see #getTargetSet
1178	* @see #handleGetSourceSet
1179	* @stable ICU 2.4
1180	*/
1181	UnicodeSet& getSourceSet(UnicodeSet& result) const;
1182
1183	/**
1184	* Framework method that returns the set of all characters that
1185	* may be modified in the input text by this Transliterator,
1186	* ignoring the effect of this object's filter. The base class
1187	* implementation returns the empty set. Subclasses that wish to
1188	* implement this should override this method.
1189	* @param result receives the set of characters that this
1190	* transliterator may modify; previous contents are lost. The
1191	* method returns nothing: implementations fill in the
1192	* caller-supplied <code>result</code> instead of returning a set.
1193	* @see #getSourceSet
1194	* @see #getTargetSet
1195	* @stable ICU 2.4
1196	*/
1197	virtual void handleGetSourceSet(UnicodeSet& result) const;
1198
1199	/**
1200	* Returns the set of all characters that may be generated as
1201	* replacement text by this transliterator. The default
1202	* implementation returns the empty set. Some subclasses may
1203	* override this method to return a more precise result. The
1204	* return result is approximate in any case and is intended for
1205	* use by tests, tools, or utilities requiring such
1206	* meta-information.
1207	* @param result receives result set; previous contents lost
1208	* @return a reference to result
1209	* @see #getSourceSet
1210	* @stable ICU 2.4
1211	*/
1212	virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
1213
1214	public:
1215
1216	/**
1217	* Registers a factory function that creates transliterators of
1218	* a given ID.
1219	*
1220	* Because ICU may choose to cache Transliterators internally, this must
1221	* be called at application startup, prior to any calls to
1222	* Transliterator::createXXX to avoid undefined behavior.
1223	*
1224	* @param id the ID being registered
1225	* @param factory a function pointer that will be copied and
1226	* called later when the given ID is passed to createInstance()
1227	* @param context a context pointer that will be stored and
1228	* later passed to the factory function when an ID matching
1229	* the registration ID is being instantiated with this factory.
1230	* @stable ICU 2.0
1231	*/
1232	static void U_EXPORT2 registerFactory(const UnicodeString& id,
1233	Factory factory,
1234	Token context);
1235
1236	/**
1237	* Registers an instance <tt>obj</tt> of a subclass of
1238	* <code>Transliterator</code> with the system. When
1239	* <tt>createInstance()</tt> is called with an ID string that is
1240	* equal to <tt>obj->getID()</tt>, then <tt>obj->clone()</tt> is
1241	* returned.
1242	*
1243	* After this call the Transliterator class owns the adoptedObj
1244	* and will delete it.
1245	*
1246	* Because ICU may choose to cache Transliterators internally, this must
1247	* be called at application startup, prior to any calls to
1248	* Transliterator::createXXX to avoid undefined behavior.
1249	*
1250	* @param adoptedObj an instance of subclass of
1251	* <code>Transliterator</code> that defines <tt>clone()</tt>
1252	* @see #createInstance
1253	* @see #registerFactory
1254	* @see #unregister
1255	* @stable ICU 2.0
1256	*/
1257	static void U_EXPORT2 registerInstance(Transliterator* adoptedObj);
1258
1259	/**
1260	* Registers an ID string as an alias of another ID string.
1261	* That is, after calling this function, <tt>createInstance(aliasID)</tt>
1262	* will return the same thing as <tt>createInstance(realID)</tt>.
1263	* This is generally used to create shorter, more mnemonic aliases
1264	* for long compound IDs.
1265	*
1266	* @param aliasID The new ID being registered.
1267	* @param realID The ID that the new ID is to be an alias for.
1268	* This can be a compound ID and can include filters and should
1269	* refer to transliterators that have already been registered with
1270	* the framework, although this isn't checked.
1271	* @stable ICU 3.6
1272	*/
1273	static void U_EXPORT2 registerAlias(const UnicodeString& aliasID,
1274	const UnicodeString& realID);
1275
1276	protected:
1277
1278	#ifndef U_HIDE_INTERNAL_API
1279	/**
1280	* @param id the ID being registered
1281	* @param factory a function pointer that will be copied and
1282	* called later when the given ID is passed to createInstance()
1283	* @param context a context pointer that will be stored and
1284	* later passed to the factory function when an ID matching
1285	* the registration ID is being instantiated with this factory.
1286	* @internal
1287	*/
1288	static void _registerFactory(const UnicodeString& id,
1289	Factory factory,
1290	Token context);
1291
1292	/**
1293	* @internal
1294	*/
1295	static void _registerInstance(Transliterator* adoptedObj);
1296
1297	/**
1298	* @internal
1299	*/
1300	static void _registerAlias(const UnicodeString& aliasID, const UnicodeString& realID);
1301
1302	/**
1303	* Register two targets as being inverses of one another. For
1304	* example, calling registerSpecialInverse("NFC", "NFD", true) causes
1305	* Transliterator to form the following inverse relationships:
1306	*
1307	* <pre>NFC => NFD
1308	* Any-NFC => Any-NFD
1309	* NFD => NFC
1310	* Any-NFD => Any-NFC</pre>
1311	*
1312	* (Without the special inverse registration, the inverse of NFC
1313	* would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
1314	* that the presence or absence of "Any-" is preserved.
1315	*
1316	* <p>The relationship is symmetrical; registering (a, b) is
1317	* equivalent to registering (b, a).
1318	*
1319	* <p>The relevant IDs must still be registered separately as
1320	* factories or classes.
1321	*
1322	* <p>Only the targets are specified. Special inverses always
1323	* have the form Any-Target1 <=> Any-Target2. The target should
1324	* have canonical casing (the casing desired to be produced when
1325	* an inverse is formed) and should contain no whitespace or other
1326	* extraneous characters.
1327	*
1328	* @param target the target against which to register the inverse
1329	* @param inverseTarget the inverse of target, that is
1330	* Any-target.getInverse() => Any-inverseTarget
1331	* @param bidirectional if true, register the reverse relation
1332	* as well, that is, Any-inverseTarget.getInverse() => Any-target
1333	* @internal
1334	*/
1335	static void _registerSpecialInverse(const UnicodeString& target,
1336	const UnicodeString& inverseTarget,
1337	UBool bidirectional);
1338	#endif /* U_HIDE_INTERNAL_API */
1339
1340	public:
1341
1342	/**
1343	* Unregisters a transliterator or class. This may be either
1344	* a system transliterator or a user transliterator or class.
1345	* Any attempt to construct an unregistered transliterator based
1346	* on its ID will fail.
1347	*
1348	* Because ICU may choose to cache Transliterators internally, this should
1349	* be called during application shutdown, after all calls to
1350	* Transliterator::createXXX to avoid undefined behavior.
1351	*
1352	* @param ID the ID of the transliterator or class
1353	* @return the <code>Object</code> that was registered with
1354	* <code>ID</code>, or <code>null</code> if none was
1355	* @see #registerInstance
1356	* @see #registerFactory
1357	* @stable ICU 2.0
1358	*/
1359	static void U_EXPORT2 unregister(const UnicodeString& ID);
1360
1361	public:
1362
1363	/**
1364	* Return a StringEnumeration over the IDs available at the time of the
1365	* call, including user-registered IDs.
1366	* @param ec input-output error code
1367	* @return a newly-created StringEnumeration over the transliterators
1368	* available at the time of the call. The caller should delete this object
1369	* when done using it.
1370	* @stable ICU 3.0
1371	*/
1372	static StringEnumeration* U_EXPORT2 getAvailableIDs(UErrorCode& ec);
1373
1374	/**
1375	* Return the number of registered source specifiers.
1376	* @return the number of registered source specifiers.
1377	* @stable ICU 2.0
1378	*/
1379	static int32_t U_EXPORT2 countAvailableSources(void);
1380
1381	/**
1382	* Return a registered source specifier.
1383	* @param index which specifier to return, from 0 to n-1, where
1384	* n = countAvailableSources()
1385	* @param result fill-in parameter to receive the source specifier.
1386	* If index is out of range, result will be empty.
1387	* @return reference to result
1388	* @stable ICU 2.0
1389	*/
1390	static UnicodeString& U_EXPORT2 getAvailableSource(int32_t index,
1391	UnicodeString& result);
1392
1393	/**
1394	* Return the number of registered target specifiers for a given
1395	* source specifier.
1396	* @param source the given source specifier.
1397	* @return the number of registered target specifiers for a given
1398	* source specifier.
1399	* @stable ICU 2.0
1400	*/
1401	static int32_t U_EXPORT2 countAvailableTargets(const UnicodeString& source);
1402
1403	/**
1404	* Return a registered target specifier for a given source.
1405	* @param index which specifier to return, from 0 to n-1, where
1406	* n = countAvailableTargets(source)
1407	* @param source the source specifier
1408	* @param result fill-in parameter to receive the target specifier.
1409	* If source is invalid or if index is out of range, result will
1410	* be empty.
1411	* @return reference to result
1412	* @stable ICU 2.0
1413	*/
1414	static UnicodeString& U_EXPORT2 getAvailableTarget(int32_t index,
1415	const UnicodeString& source,
1416	UnicodeString& result);
1417
1418	/**
1419	* Return the number of registered variant specifiers for a given
1420	* source-target pair.
1421	* @param source the source specifiers.
1422	* @param target the target specifiers.
1423	* @stable ICU 2.0
1424	*/
1425	static int32_t U_EXPORT2 countAvailableVariants(const UnicodeString& source,
1426	const UnicodeString& target);
1427
1428	/**
1429	* Return a registered variant specifier for a given source-target
1430	* pair.
1431	* @param index which specifier to return, from 0 to n-1, where
1432	* n = countAvailableVariants(source, target)
1433	* @param source the source specifier
1434	* @param target the target specifier
1435	* @param result fill-in parameter to receive the variant
1436	* specifier. If source is invalid or if target is invalid or if
1437	* index is out of range, result will be empty.
1438	* @return reference to result
1439	* @stable ICU 2.0
1440	*/
1441	static UnicodeString& U_EXPORT2 getAvailableVariant(int32_t index,
1442	const UnicodeString& source,
1443	const UnicodeString& target,
1444	UnicodeString& result);
1445
1446	protected:
1447
1448	#ifndef U_HIDE_INTERNAL_API
1449	/**
1450	* Non-mutexed internal method
1451	* @internal
1452	*/
1453	static int32_t _countAvailableSources(void);
1454
1455	/**
1456	* Non-mutexed internal method
1457	* @internal
1458	*/
1459	static UnicodeString& _getAvailableSource(int32_t index,
1460	UnicodeString& result);
1461
1462	/**
1463	* Non-mutexed internal method
1464	* @internal
1465	*/
1466	static int32_t _countAvailableTargets(const UnicodeString& source);
1467
1468	/**
1469	* Non-mutexed internal method
1470	* @internal
1471	*/
1472	static UnicodeString& _getAvailableTarget(int32_t index,
1473	const UnicodeString& source,
1474	UnicodeString& result);
1475
1476	/**
1477	* Non-mutexed internal method
1478	* @internal
1479	*/
1480	static int32_t _countAvailableVariants(const UnicodeString& source,
1481	const UnicodeString& target);
1482
1483	/**
1484	* Non-mutexed internal method
1485	* @internal
1486	*/
1487	static UnicodeString& _getAvailableVariant(int32_t index,
1488	const UnicodeString& source,
1489	const UnicodeString& target,
1490	UnicodeString& result);
1491	#endif /* U_HIDE_INTERNAL_API */
1492
1493	protected:
1494
1495	/**
1496	* Set the ID of this transliterator. Subclasses shouldn't do
1497	* this, unless the underlying script behavior has changed.
1498	* @param id the new id to be set.
1499	* @stable ICU 2.4
1500	*/
1501	void setID(const UnicodeString& id);
1502
1503	public:
1504
1505	/**
1506	* Return the class ID for this class. This is useful only for
1507	* comparing to a return value from getDynamicClassID().
1508	* Note that Transliterator is an abstract base class, and therefore
1509	* no fully constructed object will have a dynamic
1510	* UClassID that equals the UClassID returned from
1511	* Transliterator::getStaticClassID().
1512	* @return The class ID for class Transliterator.
1513	* @stable ICU 2.0
1514	*/
1515	static UClassID U_EXPORT2 getStaticClassID(void);
1516
1517	/**
1518	* Returns a unique class ID <b>polymorphically</b>. This method
1519	* is to implement a simple version of RTTI, since not all C++
1520	* compilers support genuine RTTI. Polymorphic operator==() and
1521	* clone() methods call this method.
1522	*
1523	* <p>Concrete subclasses of Transliterator must use the
1524	* UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro from
1525	* uobject.h to provide the RTTI functions.
1526	*
1527	* @return The class ID for this object. All objects of a given
1528	* class have the same class ID. Objects of other classes have
1529	* different class IDs.
1530	* @stable ICU 2.0
1531	*/
1532	virtual UClassID getDynamicClassID(void) const = `0`;
1533
1534	private:
1535	static UBool initializeRegistry(UErrorCode &status);
1536
1537	public:
1538	#ifndef U_HIDE_OBSOLETE_API
1539	/**
1540	* Return the number of IDs currently registered with the system.
1541	* To retrieve the actual IDs, call getAvailableID(i) with
1542	* i from 0 to countAvailableIDs() - 1.
1543	* @return the number of IDs currently registered with the system.
1544	* @obsolete ICU 3.4 use getAvailableIDs() instead
1545	*/
1546	static int32_t U_EXPORT2 countAvailableIDs(void);
1547
1548	/**
1549	* Return the index-th available ID. index must be between 0
1550	* and countAvailableIDs() - 1, inclusive. If index is out of
1551	* range, the result of getAvailableID(0) is returned.
1552	* @param index the given ID index.
1553	* @return the index-th available ID. index must be between 0
1554	* and countAvailableIDs() - 1, inclusive. If index is out of
1555	* range, the result of getAvailableID(0) is returned.
1556	* @obsolete ICU 3.4 use getAvailableIDs() instead; this function
1557	* is not thread safe, since it returns a reference to storage that
1558	* may become invalid if another thread calls unregister
1559	*/
1560	static const UnicodeString& U_EXPORT2 getAvailableID(int32_t index);
1561	#endif /* U_HIDE_OBSOLETE_API */
1562	};
1563
1564	inline int32_t Transliterator::getMaximumContextLength(void) const {
1565	return maximumContextLength;
1566	}
1567
1568	inline void Transliterator::setID(const UnicodeString& id) {
1569	ID = id;
1570	// NUL-terminate the ID string, which is a non-aliased copy.
1571	ID.append((char16_t)`0`);
1572	ID.truncate(ID.length()-`1`);
1573	}
1574
1575	#ifndef U_HIDE_INTERNAL_API
1576	inline Transliterator::Token Transliterator::integerToken(int32_t i) {
1577	Token t;
1578	t.integer = i;
1579	return t;
1580	}
1581
1582	inline Transliterator::Token Transliterator::pointerToken(void* p) {
1583	Token t;
1584	t.pointer = p;
1585	return t;
1586	}
1587	#endif /* U_HIDE_INTERNAL_API */
1588
1589	U_NAMESPACE_END
1590
1591	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
1592
1593	#endif /* U_SHOW_CPLUSPLUS_API */
1594
1595	#endif
1596

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/unicode/translit.h