1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **************************************************************************
5 * Copyright (c) 2002-2010, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 **************************************************************************
8 * Date Name Description *
9 * 01/28/2002 aliu Creation. *
10 **************************************************************************
11 */
12#ifndef TRIDPARS_H
13#define TRIDPARS_H
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_TRANSLITERATION
18
19#include "unicode/uobject.h"
20#include "unicode/unistr.h"
21
22U_NAMESPACE_BEGIN
23
24class Transliterator;
25class UnicodeSet;
26class UVector;
27
28/**
29 * Parsing component for transliterator IDs. This class contains only
30 * static members; it cannot be instantiated. Methods in this class
31 * parse various ID formats, including the following:
32 *
33 * A basic ID, which contains source, target, and variant, but no
34 * filter and no explicit inverse. Examples include
35 * "Latin-Greek/UNGEGN" and "Null".
36 *
37 * A single ID, which is a basic ID plus optional filter and optional
38 * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and
39 * "Lower (Upper)".
40 *
41 * A compound ID, which is a sequence of one or more single IDs,
42 * separated by semicolons, with optional forward and reverse global
43 * filters. The global filters are UnicodeSet patterns prepended or
44 * appended to the IDs, separated by semicolons. An appended filter
45 * must be enclosed in parentheses and applies in the reverse
46 * direction.
47 *
48 * @author Alan Liu
49 */
50class TransliteratorIDParser /* not : public UObject because all methods are static */ {
51
52 public:
53
54 /**
55 * A structure containing the parsed data of a filtered ID, that
56 * is, a basic ID optionally with a filter.
57 *
58 * 'source' and 'target' will always be non-null. The 'variant'
59 * will be non-null only if a non-empty variant was parsed.
60 *
61 * 'sawSource' is true if there was an explicit source in the
62 * parsed id. If there was no explicit source, then an implied
63 * source of ANY is returned and 'sawSource' is set to false.
64 *
65 * 'filter' is the parsed filter pattern, or null if there was no
66 * filter.
67 */
68 class Specs : public UMemory {
69 public:
70 UnicodeString source; // not null
71 UnicodeString target; // not null
72 UnicodeString variant; // may be null
73 UnicodeString filter; // may be null
74 UBool sawSource;
75 Specs(const UnicodeString& s, const UnicodeString& t,
76 const UnicodeString& v, UBool sawS,
77 const UnicodeString& f);
78
79 private:
80
81 Specs(const Specs &other); // forbid copying of this class
82 Specs &operator=(const Specs &other); // forbid copying of this class
83 };
84
85 /**
86 * A structure containing the canonicalized data of a filtered ID,
87 * that is, a basic ID optionally with a filter.
88 *
89 * 'canonID' is always non-null. It may be the empty string "".
90 * It is the id that should be assigned to the created
91 * transliterator. It _cannot_ be instantiated directly.
92 *
93 * 'basicID' is always non-null and non-empty. It is always of
94 * the form S-T or S-T/V. It is designed to be fed to low-level
95 * instantiation code that only understands these two formats.
96 *
97 * 'filter' may be null, if there is none, or non-null and
98 * non-empty.
99 */
100 class SingleID : public UMemory {
101 public:
102 UnicodeString canonID;
103 UnicodeString basicID;
104 UnicodeString filter;
105 SingleID(const UnicodeString& c, const UnicodeString& b,
106 const UnicodeString& f);
107 SingleID(const UnicodeString& c, const UnicodeString& b);
108 Transliterator* createInstance();
109
110 private:
111
112 SingleID(const SingleID &other); // forbid copying of this class
113 SingleID &operator=(const SingleID &other); // forbid copying of this class
114 };
115
116 /**
117 * Parse a filter ID, that is, an ID of the general form
118 * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
119 * @param id the id to be parsed
120 * @param pos INPUT-OUTPUT parameter. On input, the position of
121 * the first character to parse. On output, the position after
122 * the last character parsed.
123 * @return a SingleID object or null if the parse fails
124 */
125 static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
126
127 /**
128 * Parse a single ID, that is, an ID of the general form
129 * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
130 * optional, the filters optional, and the variants optional.
131 * @param id the id to be parsed
132 * @param pos INPUT-OUTPUT parameter. On input, the position of
133 * the first character to parse. On output, the position after
134 * the last character parsed.
135 * @param dir the direction. If the direction is REVERSE then the
136 * SingleID is constructed for the reverse direction.
137 * @return a SingleID object or null
138 */
139 static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
140 int32_t dir, UErrorCode& status);
141
142 /**
143 * Parse a global filter of the form "[f]" or "([f])", depending
144 * on 'withParens'.
145 * @param id the pattern the parse
146 * @param pos INPUT-OUTPUT parameter. On input, the position of
147 * the first character to parse. On output, the position after
148 * the last character parsed.
149 * @param dir the direction.
150 * @param withParens INPUT-OUTPUT parameter. On entry, if
151 * withParens[0] is 0, then parens are disallowed. If it is 1,
152 * then parens are required. If it is -1, then parens are
153 * optional, and the return result will be set to 0 or 1.
154 * @param canonID OUTPUT parameter. The pattern for the filter
155 * added to the canonID, either at the end, if dir is FORWARD, or
156 * at the start, if dir is REVERSE. The pattern will be enclosed
157 * in parentheses if appropriate, and will be suffixed with an
158 * ID_DELIM character. May be null.
159 * @return a UnicodeSet object or null. A non-null results
160 * indicates a successful parse, regardless of whether the filter
161 * applies to the given direction. The caller should discard it
162 * if withParens != (dir == REVERSE).
163 */
164 static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
165 int32_t dir,
166 int32_t& withParens,
167 UnicodeString* canonID);
168
169 /**
170 * Parse a compound ID, consisting of an optional forward global
171 * filter, a separator, one or more single IDs delimited by
172 * separators, an an optional reverse global filter. The
173 * separator is a semicolon. The global filters are UnicodeSet
174 * patterns. The reverse global filter must be enclosed in
175 * parentheses.
176 * @param id the pattern the parse
177 * @param dir the direction.
178 * @param canonID OUTPUT parameter that receives the canonical ID,
179 * consisting of canonical IDs for all elements, as returned by
180 * parseSingleID(), separated by semicolons. Previous contents
181 * are discarded.
182 * @param list OUTPUT parameter that receives a list of SingleID
183 * objects representing the parsed IDs. Previous contents are
184 * discarded.
185 * @param globalFilter OUTPUT parameter that receives a pointer to
186 * a newly created global filter for this ID in this direction, or
187 * null if there is none.
188 * @return true if the parse succeeds, that is, if the entire
189 * id is consumed without syntax error.
190 */
191 static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
192 UnicodeString& canonID,
193 UVector& list,
194 UnicodeSet*& globalFilter);
195
196 /**
197 * Convert the elements of the 'list' vector, which are SingleID
198 * objects, into actual Transliterator objects. In the course of
199 * this, some (or all) entries may be removed. If all entries
200 * are removed, the Null transliterator will be added.
201 *
202 * Delete entries with empty basicIDs; these are generated by
203 * elements like "(A)" in the forward direction, or "A()" in
204 * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
205 * SingleID entries to actual transliterators.
206 *
207 * @param list vector of SingleID objects. On exit, vector
208 * of one or more Transliterators.
209 * @param ec Output param to receive a success or an error code.
210 * @return new value of insertIndex. The index will shift if
211 * there are empty items, like "(Lower)", with indices less than
212 * insertIndex.
213 */
214 static void instantiateList(UVector& list,
215 UErrorCode& ec);
216
217 /**
218 * Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
219 * S-T/V, or S/V-T. If the source is missing, return a source of
220 * ANY.
221 * @param id the id string, in any of several forms
222 * @param source the given source.
223 * @param target the given target.
224 * @param variant the given variant
225 * @param isSourcePresent If TRUE then the source is present.
226 * If the source is not present, ANY will be
227 * given as the source, and isSourcePresent will be null
228 * @return an array of 4 strings: source, target, variant, and
229 * isSourcePresent. If the source is not present, ANY will be
230 * given as the source, and isSourcePresent will be null. Otherwise
231 * isSourcePresent will be non-null. The target may be empty if the
232 * id is not well-formed. The variant may be empty.
233 */
234 static void IDtoSTV(const UnicodeString& id,
235 UnicodeString& source,
236 UnicodeString& target,
237 UnicodeString& variant,
238 UBool& isSourcePresent);
239
240 /**
241 * Given source, target, and variant strings, concatenate them into a
242 * full ID. If the source is empty, then "Any" will be used for the
243 * source, so the ID will always be of the form s-t/v or s-t.
244 */
245 static void STVtoID(const UnicodeString& source,
246 const UnicodeString& target,
247 const UnicodeString& variant,
248 UnicodeString& id);
249
250 /**
251 * Register two targets as being inverses of one another. For
252 * example, calling registerSpecialInverse("NFC", "NFD", true) causes
253 * Transliterator to form the following inverse relationships:
254 *
255 * <pre>NFC => NFD
256 * Any-NFC => Any-NFD
257 * NFD => NFC
258 * Any-NFD => Any-NFC</pre>
259 *
260 * (Without the special inverse registration, the inverse of NFC
261 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
262 * that the presence or absence of "Any-" is preserved.
263 *
264 * <p>The relationship is symmetrical; registering (a, b) is
265 * equivalent to registering (b, a).
266 *
267 * <p>The relevant IDs must still be registered separately as
268 * factories or classes.
269 *
270 * <p>Only the targets are specified. Special inverses always
271 * have the form Any-Target1 <=> Any-Target2. The target should
272 * have canonical casing (the casing desired to be produced when
273 * an inverse is formed) and should contain no whitespace or other
274 * extraneous characters.
275 *
276 * @param target the target against which to register the inverse
277 * @param inverseTarget the inverse of target, that is
278 * Any-target.getInverse() => Any-inverseTarget
279 * @param bidirectional if true, register the reverse relation
280 * as well, that is, Any-inverseTarget.getInverse() => Any-target
281 */
282 static void registerSpecialInverse(const UnicodeString& target,
283 const UnicodeString& inverseTarget,
284 UBool bidirectional,
285 UErrorCode &status);
286
287 /**
288 * Free static memory.
289 */
290 static void cleanup();
291
292 private:
293 //----------------------------------------------------------------
294 // Private implementation
295 //----------------------------------------------------------------
296
297 // forbid instantiation
298 TransliteratorIDParser();
299
300 /**
301 * Parse an ID into component pieces. Take IDs of the form T,
302 * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
303 * source of ANY.
304 * @param id the id string, in any of several forms
305 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
306 * offset of the first character to parse in id. On output,
307 * pos[0] is the offset after the last parsed character. If the
308 * parse failed, pos[0] will be unchanged.
309 * @param allowFilter if true, a UnicodeSet pattern is allowed
310 * at any location between specs or delimiters, and is returned
311 * as the fifth string in the array.
312 * @return a Specs object, or null if the parse failed. If
313 * neither source nor target was seen in the parsed id, then the
314 * parse fails. If allowFilter is true, then the parsed filter
315 * pattern is returned in the Specs object, otherwise the returned
316 * filter reference is null. If the parse fails for any reason
317 * null is returned.
318 */
319 static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
320 UBool allowFilter);
321
322 /**
323 * Givens a Specs object, convert it to a SingleID object. The
324 * Spec object is a more unprocessed parse result. The SingleID
325 * object contains information about canonical and basic IDs.
326 * @param specs the given Specs object.
327 * @param dir either FORWARD or REVERSE.
328 * @return a SingleID; never returns null. Returned object always
329 * has 'filter' field of null.
330 */
331 static SingleID* specsToID(const Specs* specs, int32_t dir);
332
333 /**
334 * Given a Specs object, return a SingleID representing the
335 * special inverse of that ID. If there is no special inverse
336 * then return null.
337 * @param specs the given Specs.
338 * @return a SingleID or null. Returned object always has
339 * 'filter' field of null.
340 */
341 static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
342
343 /**
344 * Glue method to get around access problems in C++.
345 * @param id the id string for the transliterator, in any of several forms
346 * @param canonID the given canonical ID
347 */
348 static Transliterator* createBasicInstance(const UnicodeString& id,
349 const UnicodeString* canonID);
350
351 /**
352 * Initialize static memory.
353 */
354 static void U_CALLCONV init(UErrorCode &status);
355
356 friend class SingleID;
357};
358
359U_NAMESPACE_END
360
361#endif /* #if !UCONFIG_NO_TRANSLITERATION */
362
363#endif
364