rbt_pars.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/rbt_pars.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 1999-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/17/99 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/uobject.h"
18	#include "unicode/parseerr.h"
19	#include "unicode/parsepos.h"
20	#include "unicode/putil.h"
21	#include "unicode/uchar.h"
22	#include "unicode/ustring.h"
23	#include "unicode/uniset.h"
24	#include "unicode/utf16.h"
25	#include "cstring.h"
26	#include "funcrepl.h"
27	#include "hash.h"
28	#include "quant.h"
29	#include "rbt.h"
30	#include "rbt_data.h"
31	#include "rbt_pars.h"
32	#include "rbt_rule.h"
33	#include "strmatch.h"
34	#include "strrepl.h"
35	#include "unicode/symtable.h"
36	#include "tridpars.h"
37	#include "uvector.h"
38	#include "hash.h"
39	#include "patternprops.h"
40	#include "util.h"
41	#include "cmemory.h"
42	#include "uprops.h"
43	#include "putilimp.h"
44
// Operators. Each macro expands to a single UTF-16 code unit; the trailing
// block comment shows the character it stands for.
#define VARIABLE_DEF_OP   ((UChar)0x003D) /*=*/
#define FORWARD_RULE_OP   ((UChar)0x003E) /*>*/
#define REVERSE_RULE_OP   ((UChar)0x003C) /*<*/
#define FWDREV_RULE_OP    ((UChar)0x007E) /*~*/ // internal rep of <> op

// Other special characters
#define QUOTE             ((UChar)0x0027) /*'*/
#define ESCAPE            ((UChar)0x005C) /*\*/
#define END_OF_RULE       ((UChar)0x003B) /*;*/
#define RULE_COMMENT_CHAR ((UChar)0x0023) /*#*/

#define SEGMENT_OPEN      ((UChar)0x0028) /*(*/
#define SEGMENT_CLOSE     ((UChar)0x0029) /*)*/
#define CONTEXT_ANTE      ((UChar)0x007B) /*{*/
#define CONTEXT_POST      ((UChar)0x007D) /*}*/
#define CURSOR_POS        ((UChar)0x007C) /*|*/
#define CURSOR_OFFSET     ((UChar)0x0040) /*@*/
#define ANCHOR_START      ((UChar)0x005E) /*^*/
#define KLEENE_STAR       ((UChar)0x002A) /***/
#define ONE_OR_MORE       ((UChar)0x002B) /*+*/
#define ZERO_OR_ONE       ((UChar)0x003F) /*?*/

#define DOT               ((UChar)46)     /*.*/
69
70	static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
71	`91`, `94`, `91`, `58`, `90`, `112`, `58`, `93`, `91`, `58`, `90`,
72	`108`, `58`, `93`, `92`, `114`, `92`, `110`, `36`, `93`, `0`
73	};
74
// A function is denoted &Source-Target/Variant(text)
#define FUNCTION ((UChar)38) /*&*/

// Aliases for some of the syntax characters. These are provided so
// transliteration rules can be expressed in XML without clashing with
// XML syntax characters '<', '>', and '&'.
#define ALT_REVERSE_RULE_OP ((UChar)0x2190) // Left Arrow
#define ALT_FORWARD_RULE_OP ((UChar)0x2192) // Right Arrow
#define ALT_FWDREV_RULE_OP  ((UChar)0x2194) // Left Right Arrow
#define ALT_FUNCTION        ((UChar)0x2206) // Increment (~Greek Capital Delta)
85
86	// Special characters disallowed at the top level
87	static const UChar ILLEGAL_TOP[] = {`41`,`0`}; // ")"
88
89	// Special characters disallowed within a segment
90	static const UChar ILLEGAL_SEG[] = {`123`,`125`,`124`,`64`,`0`}; // "{}\|@"
91
92	// Special characters disallowed within a function argument
93	static const UChar ILLEGAL_FUNC[] = {`94`,`40`,`46`,`42`,`43`,`63`,`123`,`125`,`124`,`64`,`0`}; // "^(.+?{}\|@"*
94
95	// By definition, the ANCHOR_END special character is a
96	// trailing SymbolTable.SYMBOL_REF character.
97	// private static final char ANCHOR_END = '$';
98
99	static const UChar gOPERATORS[] = { // "=><"
100	VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
101	ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
102	`0`
103	};
104
105	static const UChar HALF_ENDERS[] = { // "=><;"
106	VARIABLE_DEF_OP, FORWARD_RULE_OP, REVERSE_RULE_OP,
107	ALT_FORWARD_RULE_OP, ALT_REVERSE_RULE_OP, ALT_FWDREV_RULE_OP,
108	END_OF_RULE,
109	`0`
110	};
111
112	// These are also used in Transliterator::toRules()
113	static const int32_t ID_TOKEN_LEN = `2`;
114	static const UChar ID_TOKEN[] = { `0x3A`, `0x3A` }; // ':', ':'
115
116	/*
117	commented out until we do real ::BEGIN/::END functionality
118	static const int32_t BEGIN_TOKEN_LEN = 5;
119	static const UChar BEGIN_TOKEN[] = { 0x42, 0x45, 0x47, 0x49, 0x4e }; // 'BEGIN'
120
121	static const int32_t END_TOKEN_LEN = 3;
122	static const UChar END_TOKEN[] = { 0x45, 0x4e, 0x44 }; // 'END'
123	*/
124
125	U_NAMESPACE_BEGIN
126
127	//----------------------------------------------------------------------
128	// BEGIN ParseData
129	//----------------------------------------------------------------------
130
131	/**
132	* This class implements the SymbolTable interface. It is used
133	* during parsing to give UnicodeSet access to variables that
134	* have been defined so far. Note that it uses variablesVector,
135	* _not_ data.setVariables.
136	*/
137	class ParseData : public UMemory, public SymbolTable {
138	public:
139	const TransliterationRuleData* data; // alias
140
141	const UVector* variablesVector; // alias
142
143	const Hashtable* variableNames; // alias
144
145	ParseData(const TransliterationRuleData* data = `0`,
146	const UVector* variablesVector = `0`,
147	const Hashtable* variableNames = `0`);
148
149	virtual ~ParseData();
150
151	virtual const UnicodeString* lookup(const UnicodeString& s) const;
152
153	virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
154
155	virtual UnicodeString parseReference(const UnicodeString& text,
156	ParsePosition& pos, int32_t limit) const;
157	/**
158	* Return true if the given character is a matcher standin or a plain
159	* character (non standin).
160	*/
161	UBool isMatcher(UChar32 ch);
162
163	/**
164	* Return true if the given character is a replacer standin or a plain
165	* character (non standin).
166	*/
167	UBool isReplacer(UChar32 ch);
168
169	private:
170	ParseData(const ParseData &other); // forbid copying of this class
171	ParseData &operator=(const ParseData &other); // forbid copying of this class
172	};
173
174	ParseData::ParseData(const TransliterationRuleData* d,
175	const UVector* sets,
176	const Hashtable* vNames) :
177	data(d), variablesVector(sets), variableNames(vNames) {}
178
179	ParseData::~ParseData() {}
180
181	/**
182	* Implement SymbolTable API.
183	*/
184	const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
185	return (const UnicodeString*) variableNames->get(name);
186	}
187
188	/**
189	* Implement SymbolTable API.
190	*/
191	const UnicodeFunctor* ParseData::lookupMatcher(UChar32 ch) const {
192	// Note that we cannot use data.lookupSet() because the
193	// set array has not been constructed yet.
194	const UnicodeFunctor* set = NULL;
195	int32_t i = ch - data->variablesBase;
196	if (i >= `0` && i < variablesVector->size()) {
197	int32_t j = ch - data->variablesBase;
198	set = (j < variablesVector->size()) ?
199	(UnicodeFunctor*) variablesVector->elementAt(j) : `0`;
200	}
201	return set;
202	}
203
204	/**
205	* Implement SymbolTable API. Parse out a symbol reference
206	* name.
207	*/
208	UnicodeString ParseData::parseReference(const UnicodeString& text,
209	ParsePosition& pos, int32_t limit) const {
210	int32_t start = pos.getIndex();
211	int32_t i = start;
212	UnicodeString result;
213	while (i < limit) {
214	UChar c = text.charAt(i);
215	if ((i==start && !u_isIDStart(c)) \|\| !u_isIDPart(c)) {
216	break;
217	}
218	++i;
219	}
220	if (i == start) { // No valid name chars
221	return result; // Indicate failure with empty string
222	}
223	pos.setIndex(i);
224	text.extractBetween(start, i, result);
225	return result;
226	}
227
228	UBool ParseData::isMatcher(UChar32 ch) {
229	// Note that we cannot use data.lookup() because the
230	// set array has not been constructed yet.
231	int32_t i = ch - data->variablesBase;
232	if (i >= `0` && i < variablesVector->size()) {
233	UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);
234	return f != NULL && f->toMatcher() != NULL;
235	}
236	return TRUE;
237	}
238
239	/**
240	* Return true if the given character is a replacer standin or a plain
241	* character (non standin).
242	*/
243	UBool ParseData::isReplacer(UChar32 ch) {
244	// Note that we cannot use data.lookup() because the
245	// set array has not been constructed yet.
246	int i = ch - data->variablesBase;
247	if (i >= `0` && i < variablesVector->size()) {
248	UnicodeFunctor f = (UnicodeFunctor) variablesVector->elementAt(i);
249	return f != NULL && f->toReplacer() != NULL;
250	}
251	return TRUE;
252	}
253
254	//----------------------------------------------------------------------
255	// BEGIN RuleHalf
256	//----------------------------------------------------------------------
257
258	/**
259	* A class representing one side of a rule. This class knows how to
260	* parse half of a rule. It is tightly coupled to the method
261	* RuleBasedTransliterator.Parser.parseRule().
262	*/
263	class RuleHalf : public UMemory {
264
265	public:
266
267	UnicodeString text;
268
269	int32_t cursor; // position of cursor in text
270	int32_t ante; // position of ante context marker '{' in text
271	int32_t post; // position of post context marker '}' in text
272
273	// Record the offset to the cursor either to the left or to the
274	// right of the key. This is indicated by characters on the output
275	// side that allow the cursor to be positioned arbitrarily within
276	// the matching text. For example, abc{def} > \| @@@ xyz; changes
277	// def to xyz and moves the cursor to before abc. Offset characters
278	// must be at the start or end, and they cannot move the cursor past
279	// the ante- or postcontext text. Placeholders are only valid in
280	// output text. The length of the ante and post context is
281	// determined at runtime, because of supplementals and quantifiers.
282	int32_t cursorOffset; // only nonzero on output side
283
284	// Position of first CURSOR_OFFSET on _right_. This will be -1
285	// for \|@, -2 for \|@@, etc., and 1 for @\|, 2 for @@\|, etc.
286	int32_t cursorOffsetPos;
287
288	UBool anchorStart;
289	UBool anchorEnd;
290
291	/**
292	* The segment number from 1..n of the next '(' we see
293	* during parsing; 1-based.
294	*/
295	int32_t nextSegmentNumber;
296
297	TransliteratorParser& parser;
298
299	//--------------------------------------------------
300	// Methods
301
302	RuleHalf(TransliteratorParser& parser);
303	~RuleHalf();
304
305	int32_t parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);
306
307	int32_t parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
308	UnicodeString& buf,
309	const UnicodeString& illegal,
310	UBool isSegment,
311	UErrorCode& status);
312
313	/**
314	* Remove context.
315	*/
316	void removeContext();
317
318	/**
319	* Return true if this half looks like valid output, that is, does not
320	* contain quantifiers or other special input-only elements.
321	*/
322	UBool isValidOutput(TransliteratorParser& parser);
323
324	/**
325	* Return true if this half looks like valid input, that is, does not
326	* contain functions or other special output-only elements.
327	*/
328	UBool isValidInput(TransliteratorParser& parser);
329
330	int syntaxError(UErrorCode code,
331	const UnicodeString& rule,
332	int32_t start,
333	UErrorCode& status) {
334	return parser.syntaxError(code, rule, start, status);
335	}
336
337	private:
338	// Disallowed methods; no impl.
339	RuleHalf(const RuleHalf&);
340	RuleHalf& operator=(const RuleHalf&);
341	};
342
343	RuleHalf::RuleHalf(TransliteratorParser& p) :
344	parser(p)
345	{
346	cursor = -`1`;
347	ante = -`1`;
348	post = -`1`;
349	cursorOffset = `0`;
350	cursorOffsetPos = `0`;
351	anchorStart = anchorEnd = FALSE;
352	nextSegmentNumber = `1`;
353	}
354
355	RuleHalf::~RuleHalf() {
356	}
357
358	/**
359	* Parse one side of a rule, stopping at either the limit,
360	* the END_OF_RULE character, or an operator.
361	* @return the index after the terminating character, or
362	* if limit was reached, limit
363	*/
364	int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
365	int32_t start = pos;
366	text.truncate(`0`);
367	pos = parseSection(rule, pos, limit, text, UnicodeString (TRUE, ILLEGAL_TOP, -`1`), FALSE, status);
368
369	if (cursorOffset > `0` && cursor != cursorOffsetPos) {
370	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
371	}
372
373	return pos;
374	}
375
376	/**
377	* Parse a section of one side of a rule, stopping at either
378	* the limit, the END_OF_RULE character, an operator, or a
379	* segment close character. This method parses both a
380	* top-level rule half and a segment within such a rule half.
381	* It calls itself recursively to parse segments and nested
382	* segments.
383	* @param buf buffer into which to accumulate the rule pattern
384	* characters, either literal characters from the rule or
385	* standins for UnicodeMatcher objects including segments.
386	* @param illegal the set of special characters that is illegal during
387	* this parse.
388	* @param isSegment if true, then we've already seen a '(' and
389	* pos on entry points right after it. Accumulate everything
390	* up to the closing ')', put it in a segment matcher object,
391	* generate a standin for it, and add the standin to buf. As
392	* a side effect, update the segments vector with a reference
393	* to the segment matcher. This works recursively for nested
394	* segments. If isSegment is false, just accumulate
395	* characters into buf.
396	* @return the index after the terminating character, or
397	* if limit was reached, limit
398	*/
399	int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t limit,
400	UnicodeString& buf,
401	const UnicodeString& illegal,
402	UBool isSegment, UErrorCode& status) {
403	int32_t start = pos;
404	ParsePosition pp;
405	UnicodeString scratch;
406	UBool done = FALSE;
407	int32_t quoteStart = -`1`; // Most recent 'single quoted string'
408	int32_t quoteLimit = -`1`;
409	int32_t varStart = -`1`; // Most recent $variableReference
410	int32_t varLimit = -`1`;
411	int32_t bufStart = buf.length();
412
413	while (pos < limit && !done) {
414	// Since all syntax characters are in the BMP, fetching
415	// 16-bit code units suffices here.
416	UChar c = rule.charAt(pos++);
417	if (PatternProps::isWhiteSpace(c)) {
418	// Ignore whitespace. Note that this is not Unicode
419	// spaces, but Java spaces -- a subset, representing
420	// whitespace likely to be seen in code.
421	continue;
422	}
423	if (u_strchr(HALF_ENDERS, c) != NULL) {
424	if (isSegment) {
425	// Unclosed segment
426	return syntaxError(U_UNCLOSED_SEGMENT, rule, start, status);
427	}
428	break;
429	}
430	if (anchorEnd) {
431	// Text after a presumed end anchor is a syntax err
432	return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start, status);
433	}
434	if (UnicodeSet::resemblesPattern(rule, pos-`1`)) {
435	pp.setIndex(pos-`1`); // Backup to opening '['
436	buf.append(parser.parseSet(rule, pp, status));
437	if (U_FAILURE(status)) {
438	return syntaxError(U_MALFORMED_SET, rule, start, status);
439	}
440	pos = pp.getIndex();
441	continue;
442	}
443	// Handle escapes
444	if (c == ESCAPE) {
445	if (pos == limit) {
446	return syntaxError(U_TRAILING_BACKSLASH, rule, start, status);
447	}
448	UChar32 escaped = rule.unescapeAt(pos); // pos is already past '\\'
449	if (escaped == (UChar32) -`1`) {
450	return syntaxError(U_MALFORMED_UNICODE_ESCAPE, rule, start, status);
451	}
452	if (!parser.checkVariableRange(escaped)) {
453	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
454	}
455	buf.append(escaped);
456	continue;
457	}
458	// Handle quoted matter
459	if (c == QUOTE) {
460	int32_t iq = rule.indexOf(QUOTE, pos);
461	if (iq == pos) {
462	buf.append(c); // Parse [''] outside quotes as [']
463	++pos;
464	} else {
465	/ This loop picks up a run of quoted text of the*
466	* form 'aaaa' each time through. If this run
467	* hasn't really ended ('aaaa''bbbb') then it keeps
468	* looping, each time adding on a new run. When it
469	* reaches the final quote it breaks.
470	*/
471	quoteStart = buf.length();
472	for (;;) {
473	if (iq < `0`) {
474	return syntaxError(U_UNTERMINATED_QUOTE, rule, start, status);
475	}
476	scratch.truncate(`0`);
477	rule.extractBetween(pos, iq, scratch);
478	buf.append(scratch);
479	pos = iq+`1`;
480	if (pos < limit && rule.charAt(pos) == QUOTE) {
481	// Parse [''] inside quotes as [']
482	iq = rule.indexOf(QUOTE, pos+`1`);
483	// Continue looping
484	} else {
485	break;
486	}
487	}
488	quoteLimit = buf.length();
489
490	for (iq=quoteStart; iq<quoteLimit; ++iq) {
491	if (!parser.checkVariableRange(buf.charAt(iq))) {
492	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
493	}
494	}
495	}
496	continue;
497	}
498
499	if (!parser.checkVariableRange(c)) {
500	return syntaxError(U_VARIABLE_RANGE_OVERLAP, rule, start, status);
501	}
502
503	if (illegal.indexOf(c) >= `0`) {
504	syntaxError(U_ILLEGAL_CHARACTER, rule, start, status);
505	}
506
507	switch (c) {
508
509	//------------------------------------------------------
510	// Elements allowed within and out of segments
511	//------------------------------------------------------
512	case ANCHOR_START:
513	if (buf.length() == `0` && !anchorStart) {
514	anchorStart = TRUE;
515	} else {
516	return syntaxError(U_MISPLACED_ANCHOR_START,
517	rule, start, status);
518	}
519	break;
520	case SEGMENT_OPEN:
521	{
522	// bufSegStart is the offset in buf to the first
523	// character of the segment we are parsing.
524	int32_t bufSegStart = buf.length();
525
526	// Record segment number now, since nextSegmentNumber
527	// will be incremented during the call to parseSection
528	// if there are nested segments.
529	int32_t segmentNumber = nextSegmentNumber++; // 1-based
530
531	// Parse the segment
532	pos = parseSection(rule, pos, limit, buf, UnicodeString (TRUE, ILLEGAL_SEG, -`1`), TRUE, status);
533
534	// After parsing a segment, the relevant characters are
535	// in buf, starting at offset bufSegStart. Extract them
536	// into a string matcher, and replace them with a
537	// standin for that matcher.
538	StringMatcher* m =
539	new StringMatcher (buf, bufSegStart, buf.length(),
540	segmentNumber, *parser.curData);
541	if (m == NULL) {
542	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
543	}
544
545	// Record and associate object and segment number
546	parser.setSegmentObject(segmentNumber, m, status);
547	buf.truncate(bufSegStart);
548	buf.append(parser.getSegmentStandin(segmentNumber, status));
549	}
550	break;
551	case FUNCTION:
552	case ALT_FUNCTION:
553	{
554	int32_t iref = pos;
555	TransliteratorIDParser::SingleID* single =
556	TransliteratorIDParser::parseFilterID(rule, iref);
557	// The next character MUST be a segment open
558	if (single == NULL \|\|
559	!ICU_Utility::parseChar(rule, iref, SEGMENT_OPEN)) {
560	return syntaxError(U_INVALID_FUNCTION, rule, start, status);
561	}
562
563	Transliterator *t = single->createInstance();
564	delete single;
565	if (t == NULL) {
566	return syntaxError(U_INVALID_FUNCTION, rule, start, status);
567	}
568
569	// bufSegStart is the offset in buf to the first
570	// character of the segment we are parsing.
571	int32_t bufSegStart = buf.length();
572
573	// Parse the segment
574	pos = parseSection(rule, iref, limit, buf, UnicodeString (TRUE, ILLEGAL_FUNC, -`1`), TRUE, status);
575
576	// After parsing a segment, the relevant characters are
577	// in buf, starting at offset bufSegStart.
578	UnicodeString output;
579	buf.extractBetween(bufSegStart, buf.length(), output);
580	FunctionReplacer *r =
581	new FunctionReplacer (t, new StringReplacer (output, parser.curData));
582	if (r == NULL) {
583	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
584	}
585
586	// Replace the buffer contents with a stand-in
587	buf.truncate(bufSegStart);
588	buf.append(parser.generateStandInFor(r, status));
589	}
590	break;
591	case SymbolTable::SYMBOL_REF:
592	// Handle variable references and segment references "$1" .. "$9"
593	{
594	// A variable reference must be followed immediately
595	// by a Unicode identifier start and zero or more
596	// Unicode identifier part characters, or by a digit
597	// 1..9 if it is a segment reference.
598	if (pos == limit) {
599	// A variable ref character at the end acts as
600	// an anchor to the context limit, as in perl.
601	anchorEnd = TRUE;
602	break;
603	}
604	// Parse "$1" "$2" .. "$9" .. (no upper limit)
605	c = rule.charAt(pos);
606	int32_t r = u_digit(c, `10`);
607	if (r >= `1` && r <= `9`) {
608	r = ICU_Utility::parseNumber(rule, pos, `10`);
609	if (r < `0`) {
610	return syntaxError(U_UNDEFINED_SEGMENT_REFERENCE,
611	rule, start, status);
612	}
613	buf.append(parser.getSegmentStandin(r, status));
614	} else {
615	pp.setIndex(pos);
616	UnicodeString name = parser.parseData->
617	parseReference(rule, pp, limit);
618	if (name.length() == `0`) {
619	// This means the '$' was not followed by a
620	// valid name. Try to interpret it as an
621	// end anchor then. If this also doesn't work
622	// (if we see a following character) then signal
623	// an error.
624	anchorEnd = TRUE;
625	break;
626	}
627	pos = pp.getIndex();
628	// If this is a variable definition statement,
629	// then the LHS variable will be undefined. In
630	// that case appendVariableDef() will append the
631	// special placeholder char variableLimit-1.
632	varStart = buf.length();
633	parser.appendVariableDef(name, buf, status);
634	varLimit = buf.length();
635	}
636	}
637	break;
638	case DOT:
639	buf.append(parser.getDotStandIn(status));
640	break;
641	case KLEENE_STAR:
642	case ONE_OR_MORE:
643	case ZERO_OR_ONE:
644	// Quantifiers. We handle single characters, quoted strings,
645	// variable references, and segments.
646	// a+ matches aaa
647	// 'foo'+ matches foofoofoo
648	// $v+ matches xyxyxy if $v == xy
649	// (seg)+ matches segsegseg
650	{
651	if (isSegment && buf.length() == bufStart) {
652	// The /+ immediately follows '('*
653	return syntaxError(U_MISPLACED_QUANTIFIER, rule, start, status);
654	}
655
656	int32_t qstart, qlimit;
657	// The /+ follows an isolated character or quote*
658	// or variable reference
659	if (buf.length() == quoteLimit) {
660	// The /+ follows a 'quoted string'*
661	qstart = quoteStart;
662	qlimit = quoteLimit;
663	} else if (buf.length() == varLimit) {
664	// The /+ follows a $variableReference*
665	qstart = varStart;
666	qlimit = varLimit;
667	} else {
668	// The /+ follows a single character, possibly*
669	// a segment standin
670	qstart = buf.length() - `1`;
671	qlimit = qstart + `1`;
672	}
673
674	UnicodeFunctor *m =
675	new StringMatcher (buf, qstart, qlimit, `0`, *parser.curData);
676	if (m == NULL) {
677	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
678	}
679	int32_t min = `0`;
680	int32_t max = Quantifier::MAX;
681	switch (c) {
682	case ONE_OR_MORE:
683	min = `1`;
684	break;
685	case ZERO_OR_ONE:
686	min = `0`;
687	max = `1`;
688	break;
689	// case KLEENE_STAR:
690	// do nothing -- min, max already set
691	}
692	m = new Quantifier (m, min, max);
693	if (m == NULL) {
694	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
695	}
696	buf.truncate(qstart);
697	buf.append(parser.generateStandInFor(m, status));
698	}
699	break;
700
701	//------------------------------------------------------
702	// Elements allowed ONLY WITHIN segments
703	//------------------------------------------------------
704	case SEGMENT_CLOSE:
705	// assert(isSegment);
706	// We're done parsing a segment.
707	done = TRUE;
708	break;
709
710	//------------------------------------------------------
711	// Elements allowed ONLY OUTSIDE segments
712	//------------------------------------------------------
713	case CONTEXT_ANTE:
714	if (ante >= `0`) {
715	return syntaxError(U_MULTIPLE_ANTE_CONTEXTS, rule, start, status);
716	}
717	ante = buf.length();
718	break;
719	case CONTEXT_POST:
720	if (post >= `0`) {
721	return syntaxError(U_MULTIPLE_POST_CONTEXTS, rule, start, status);
722	}
723	post = buf.length();
724	break;
725	case CURSOR_POS:
726	if (cursor >= `0`) {
727	return syntaxError(U_MULTIPLE_CURSORS, rule, start, status);
728	}
729	cursor = buf.length();
730	break;
731	case CURSOR_OFFSET:
732	if (cursorOffset < `0`) {
733	if (buf.length() > `0`) {
734	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
735	}
736	--cursorOffset;
737	} else if (cursorOffset > `0`) {
738	if (buf.length() != cursorOffsetPos \|\| cursor >= `0`) {
739	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
740	}
741	++cursorOffset;
742	} else {
743	if (cursor == `0` && buf.length() == `0`) {
744	cursorOffset = -`1`;
745	} else if (cursor < `0`) {
746	cursorOffsetPos = buf.length();
747	cursorOffset = `1`;
748	} else {
749	return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
750	}
751	}
752	break;
753
754
755	//------------------------------------------------------
756	// Non-special characters
757	//------------------------------------------------------
758	default:
759	// Disallow unquoted characters other than [0-9A-Za-z]
760	// in the printable ASCII range. These characters are
761	// reserved for possible future use.
762	if (c >= `0x0021` && c <= `0x007E` &&
763	!((c >= `0x0030`/'0'/ && c <= `0x0039`/'9'/) \|\|
764	(c >= `0x0041`/'A'/ && c <= `0x005A`/'Z'/) \|\|
765	(c >= `0x0061`/'a'/ && c <= `0x007A`/'z'/))) {
766	return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
767	}
768	buf.append(c);
769	break;
770	}
771	}
772
773	return pos;
774	}
775
776	/**
777	* Remove context.
778	*/
779	void RuleHalf::removeContext() {
780	//text = text.substring(ante < 0 ? 0 : ante,
781	// post < 0 ? text.length() : post);
782	if (post >= `0`) {
783	text.remove(post);
784	}
785	if (ante >= `0`) {
786	text.removeBetween(`0`, ante);
787	}
788	ante = post = -`1`;
789	anchorStart = anchorEnd = FALSE;
790	}
791
792	/**
793	* Return true if this half looks like valid output, that is, does not
794	* contain quantifiers or other special input-only elements.
795	*/
796	UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
797	for (int32_t i=`0`; i<text.length(); ) {
798	UChar32 c = text.char32At(i);
799	i += U16_LENGTH(c);
800	if (!transParser.parseData->isReplacer(c)) {
801	return FALSE;
802	}
803	}
804	return TRUE;
805	}
806
807	/**
808	* Return true if this half looks like valid input, that is, does not
809	* contain functions or other special output-only elements.
810	*/
811	UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
812	for (int32_t i=`0`; i<text.length(); ) {
813	UChar32 c = text.char32At(i);
814	i += U16_LENGTH(c);
815	if (!transParser.parseData->isMatcher(c)) {
816	return FALSE;
817	}
818	}
819	return TRUE;
820	}
821
822	//----------------------------------------------------------------------
823	// PUBLIC API
824	//----------------------------------------------------------------------
825
826	/**
827	* Constructor.
828	*/
829	TransliteratorParser::TransliteratorParser(UErrorCode &statusReturn) :
830	dataVector (statusReturn),
831	idBlockVector (statusReturn),
832	variablesVector (statusReturn),
833	segmentObjects (statusReturn)
834	{
835	idBlockVector.setDeleter(uprv_deleteUObject);
836	curData = NULL;
837	compoundFilter = NULL;
838	parseData = NULL;
839	variableNames.setValueDeleter(uprv_deleteUObject);
840	}
841
842	/**
843	* Destructor.
844	*/
845	TransliteratorParser::~TransliteratorParser() {
846	while (!dataVector.isEmpty())
847	delete (TransliterationRuleData*)(dataVector.orphanElementAt(`0`));
848	delete compoundFilter;
849	delete parseData;
850	while (!variablesVector.isEmpty())
851	delete (UnicodeFunctor*)variablesVector.orphanElementAt(`0`);
852	}
853
854	void
855	TransliteratorParser::parse(const UnicodeString& rules,
856	UTransDirection transDirection,
857	UParseError& pe,
858	UErrorCode& ec) {
859	if (U_SUCCESS(ec)) {
860	parseRules(rules, transDirection, ec);
861	pe = parseError;
862	}
863	}
864
865	/**
866	* Return the compound filter parsed by parse(). Caller owns result.
867	*/
868	UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
869	UnicodeSet* f = compoundFilter;
870	compoundFilter = NULL;
871	return f;
872	}
873
874	//----------------------------------------------------------------------
875	// Private implementation
876	//----------------------------------------------------------------------
877
878	/**
879	* Parse the given string as a sequence of rules, separated by newline
880	* characters ('\n'), and cause this object to implement those rules. Any
881	* previous rules are discarded. Typically this method is called exactly
882	* once, during construction.
883	* @exception IllegalArgumentException if there is a syntax error in the
884	* rules
885	*/
886	void TransliteratorParser::parseRules(const UnicodeString& rule,
887	UTransDirection theDirection,
888	UErrorCode& status)
889	{
890	// Clear error struct
891	uprv_memset(&parseError, `0`, sizeof(parseError));
892	parseError.line = parseError.offset = -`1`;
893
894	UBool parsingIDs = TRUE;
895	int32_t ruleCount = `0`;
896
897	while (!dataVector.isEmpty()) {
898	delete (TransliterationRuleData*)(dataVector.orphanElementAt(`0`));
899	}
900	if (U_FAILURE(status)) {
901	return;
902	}
903
904	idBlockVector.removeAllElements();
905	curData = NULL;
906	direction = theDirection;
907	ruleCount = `0`;
908
909	delete compoundFilter;
910	compoundFilter = NULL;
911
912	while (!variablesVector.isEmpty()) {
913	delete (UnicodeFunctor*)variablesVector.orphanElementAt(`0`);
914	}
915	variableNames.removeAll();
916	parseData = new ParseData (`0`, &variablesVector, &variableNames);
917	if (parseData == NULL) {
918	status = U_MEMORY_ALLOCATION_ERROR;
919	return;
920	}
921
922	dotStandIn = (UChar) -`1`;
923
924	UnicodeString tempstr = NULL; // used for memory allocation error checking*
925	UnicodeString str; // scratch
926	UnicodeString idBlockResult;
927	int32_t pos = `0`;
928	int32_t limit = rule.length();
929
930	// The compound filter offset is an index into idBlockResult.
931	// If it is 0, then the compound filter occurred at the start,
932	// and it is the offset to the _start_ of the compound filter
933	// pattern. Otherwise it is the offset to the _limit_ of the
934	// compound filter pattern within idBlockResult.
935	compoundFilter = NULL;
936	int32_t compoundFilterOffset = -`1`;
937
938	while (pos < limit && U_SUCCESS(status)) {
939	UChar c = rule.charAt(pos++);
940	if (PatternProps::isWhiteSpace(c)) {
941	// Ignore leading whitespace.
942	continue;
943	}
944	// Skip lines starting with the comment character
945	if (c == RULE_COMMENT_CHAR) {
946	pos = rule.indexOf((UChar)`0x000A` /\n/, pos) + `1`;
947	if (pos == `0`) {
948	break; // No "\n" found; rest of rule is a commnet
949	}
950	continue; // Either fall out or restart with next line
951	}
952
953	// skip empty rules
954	if (c == END_OF_RULE)
955	continue;
956
957	// keep track of how many rules we've seen
958	++ruleCount;
959
960	// We've found the start of a rule or ID. c is its first
961	// character, and pos points past c.
962	--pos;
963	// Look for an ID token. Must have at least ID_TOKEN_LEN + 1
964	// chars left.
965	if ((pos + ID_TOKEN_LEN + `1`) <= limit &&
966	rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == `0`) {
967	pos += ID_TOKEN_LEN;
968	c = rule.charAt(pos);
969	while (PatternProps::isWhiteSpace(c) && pos < limit) {
970	++pos;
971	c = rule.charAt(pos);
972	}
973
974	int32_t p = pos;
975
976	if (!parsingIDs) {
977	if (curData != NULL) {
978	if (direction == UTRANS_FORWARD)
979	dataVector.addElement(curData, status);
980	else
981	dataVector.insertElementAt(curData, `0`, status);
982	curData = NULL;
983	}
984	parsingIDs = TRUE;
985	}
986
987	TransliteratorIDParser::SingleID* id =
988	TransliteratorIDParser::parseSingleID(rule, p, direction, status);
989	if (p != pos && ICU_Utility::parseChar(rule, p, END_OF_RULE)) {
990	// Successful ::ID parse.
991
992	if (direction == UTRANS_FORWARD) {
993	idBlockResult.append(id->canonID).append(END_OF_RULE);
994	} else {
995	idBlockResult.insert(`0`, END_OF_RULE);
996	idBlockResult.insert(`0`, id->canonID);
997	}
998
999	} else {
1000	// Couldn't parse an ID. Try to parse a global filter
1001	int32_t withParens = -`1`;
1002	UnicodeSet* f = TransliteratorIDParser::parseGlobalFilter(rule, p, direction, withParens, NULL);
1003	if (f != NULL) {
1004	if (ICU_Utility::parseChar(rule, p, END_OF_RULE)
1005	&& (direction == UTRANS_FORWARD) == (withParens == `0`))
1006	{
1007	if (compoundFilter != NULL) {
1008	// Multiple compound filters
1009	syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rule, pos, status);
1010	delete f;
1011	} else {
1012	compoundFilter = f;
1013	compoundFilterOffset = ruleCount;
1014	}
1015	} else {
1016	delete f;
1017	}
1018	} else {
1019	// Invalid ::id
1020	// Can be parsed as neither an ID nor a global filter
1021	syntaxError(U_INVALID_ID, rule, pos, status);
1022	}
1023	}
1024	delete id;
1025	pos = p;
1026	} else {
1027	if (parsingIDs) {
1028	tempstr = new UnicodeString (idBlockResult);
1029	// NULL pointer check
1030	if (tempstr == NULL) {
1031	status = U_MEMORY_ALLOCATION_ERROR;
1032	return;
1033	}
1034	if (direction == UTRANS_FORWARD)
1035	idBlockVector.addElement(tempstr, status);
1036	else
1037	idBlockVector.insertElementAt(tempstr, `0`, status);
1038	idBlockResult.remove();
1039	parsingIDs = FALSE;
1040	curData = new TransliterationRuleData (status);
1041	// NULL pointer check
1042	if (curData == NULL) {
1043	status = U_MEMORY_ALLOCATION_ERROR;
1044	return;
1045	}
1046	parseData->data = curData;
1047
1048	// By default, rules use part of the private use area
1049	// E000..F8FF for variables and other stand-ins. Currently
1050	// the range F000..F8FF is typically sufficient. The 'use
1051	// variable range' pragma allows rule sets to modify this.
1052	setVariableRange(`0xF000`, `0xF8FF`, status);
1053	}
1054
1055	if (resemblesPragma(rule, pos, limit)) {
1056	int32_t ppp = parsePragma(rule, pos, limit, status);
1057	if (ppp < `0`) {
1058	syntaxError(U_MALFORMED_PRAGMA, rule, pos, status);
1059	}
1060	pos = ppp;
1061	// Parse a rule
1062	} else {
1063	pos = parseRule(rule, pos, limit, status);
1064	}
1065	}
1066	}
1067
1068	if (parsingIDs && idBlockResult.length() > `0`) {
1069	tempstr = new UnicodeString (idBlockResult);
1070	// NULL pointer check
1071	if (tempstr == NULL) {
1072	status = U_MEMORY_ALLOCATION_ERROR;
1073	return;
1074	}
1075	if (direction == UTRANS_FORWARD)
1076	idBlockVector.addElement(tempstr, status);
1077	else
1078	idBlockVector.insertElementAt(tempstr, `0`, status);
1079	}
1080	else if (!parsingIDs && curData != NULL) {
1081	if (direction == UTRANS_FORWARD)
1082	dataVector.addElement(curData, status);
1083	else
1084	dataVector.insertElementAt(curData, `0`, status);
1085	}
1086
1087	if (U_SUCCESS(status)) {
1088	// Convert the set vector to an array
1089	int32_t i, dataVectorSize = dataVector.size();
1090	for (i = `0`; i < dataVectorSize; i++) {
1091	TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
1092	data->variablesLength = variablesVector.size();
1093	if (data->variablesLength == `0`) {
1094	data->variables = `0`;
1095	} else {
1096	data->variables = (UnicodeFunctor*)uprv_malloc(data->variablesLength sizeof(UnicodeFunctor*));
1097	// NULL pointer check
1098	if (data->variables == NULL) {
1099	status = U_MEMORY_ALLOCATION_ERROR;
1100	return;
1101	}
1102	data->variablesAreOwned = (i == `0`);
1103	}
1104
1105	for (int32_t j = `0`; j < data->variablesLength; j++) {
1106	data->variables[j] =
1107	static_cast<UnicodeFunctor *>(variablesVector.elementAt(j));
1108	}
1109
1110	data->variableNames.removeAll();
1111	int32_t p = UHASH_FIRST;
1112	const UHashElement* he = variableNames.nextElement(p);
1113	while (he != NULL) {
1114	UnicodeString* tempus = ((UnicodeString*)(he->value.pointer))->clone();
1115	if (tempus == NULL) {
1116	status = U_MEMORY_ALLOCATION_ERROR;
1117	return;
1118	}
1119	data->variableNames.put(((UnicodeString)(he->key.pointer)),
1120	tempus, status);
1121	he = variableNames.nextElement(p);
1122	}
1123	}
1124	variablesVector.removeAllElements(); // keeps them from getting deleted when we succeed
1125
1126	// Index the rules
1127	if (compoundFilter != NULL) {
1128	if ((direction == UTRANS_FORWARD && compoundFilterOffset != `1`) \|\|
1129	(direction == UTRANS_REVERSE && compoundFilterOffset != ruleCount)) {
1130	status = U_MISPLACED_COMPOUND_FILTER;
1131	}
1132	}
1133
1134	for (i = `0`; i < dataVectorSize; i++) {
1135	TransliterationRuleData* data = (TransliterationRuleData*)dataVector.elementAt(i);
1136	data->ruleSet.freeze(parseError, status);
1137	}
1138	if (idBlockVector.size() == `1` && ((UnicodeString*)idBlockVector.elementAt(`0`))->isEmpty()) {
1139	idBlockVector.removeElementAt(`0`);
1140	}
1141	}
1142	}
1143
1144	/**
1145	* Set the variable range to [start, end] (inclusive).
1146	*/
1147	void TransliteratorParser::setVariableRange(int32_t start, int32_t end, UErrorCode& status) {
1148	if (start > end \|\| start < `0` \|\| end > `0xFFFF`) {
1149	status = U_MALFORMED_PRAGMA;
1150	return;
1151	}
1152
1153	curData->variablesBase = (UChar) start;
1154	if (dataVector.size() == `0`) {
1155	variableNext = (UChar) start;
1156	variableLimit = (UChar) (end + `1`);
1157	}
1158	}
1159
1160	/**
1161	* Assert that the given character is NOT within the variable range.
1162	* If it is, return FALSE. This is neccesary to ensure that the
1163	* variable range does not overlap characters used in a rule.
1164	*/
1165	UBool TransliteratorParser::checkVariableRange(UChar32 ch) const {
1166	return !(ch >= curData->variablesBase && ch < variableLimit);
1167	}
1168
1169	/**
1170	* Set the maximum backup to 'backup', in response to a pragma
1171	* statement.
1172	*/
1173	void TransliteratorParser::pragmaMaximumBackup(int32_t /backup/) {
1174	//TODO Finish
1175	}
1176
1177	/**
1178	* Begin normalizing all rules using the given mode, in response
1179	* to a pragma statement.
1180	*/
1181	void TransliteratorParser::pragmaNormalizeRules(UNormalizationMode /mode/) {
1182	//TODO Finish
1183	}
1184
1185	static const UChar PRAGMA_USE[] = {`0x75`,`0x73`,`0x65`,`0x20`,`0`}; // "use "
1186
1187	static const UChar PRAGMA_VARIABLE_RANGE[] = {`0x7E`,`0x76`,`0x61`,`0x72`,`0x69`,`0x61`,`0x62`,`0x6C`,`0x65`,`0x20`,`0x72`,`0x61`,`0x6E`,`0x67`,`0x65`,`0x20`,`0x23`,`0x20`,`0x23`,`0x7E`,`0x3B`,`0`}; // "~variable range # #~;"
1188
1189	static const UChar PRAGMA_MAXIMUM_BACKUP[] = {`0x7E`,`0x6D`,`0x61`,`0x78`,`0x69`,`0x6D`,`0x75`,`0x6D`,`0x20`,`0x62`,`0x61`,`0x63`,`0x6B`,`0x75`,`0x70`,`0x20`,`0x23`,`0x7E`,`0x3B`,`0`}; // "~maximum backup #~;"
1190
1191	static const UChar PRAGMA_NFD_RULES[] = {`0x7E`,`0x6E`,`0x66`,`0x64`,`0x20`,`0x72`,`0x75`,`0x6C`,`0x65`,`0x73`,`0x7E`,`0x3B`,`0`}; // "~nfd rules~;"
1192
1193	static const UChar PRAGMA_NFC_RULES[] = {`0x7E`,`0x6E`,`0x66`,`0x63`,`0x20`,`0x72`,`0x75`,`0x6C`,`0x65`,`0x73`,`0x7E`,`0x3B`,`0`}; // "~nfc rules~;"
1194
1195	/**
1196	* Return true if the given rule looks like a pragma.
1197	* @param pos offset to the first non-whitespace character
1198	* of the rule.
1199	* @param limit pointer past the last character of the rule.
1200	*/
1201	UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {
1202	// Must start with /use\s/i
1203	return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString (TRUE, PRAGMA_USE, `4`), NULL) >= `0`;
1204	}
1205
1206	/**
1207	* Parse a pragma. This method assumes resemblesPragma() has
1208	* already returned true.
1209	* @param pos offset to the first non-whitespace character
1210	* of the rule.
1211	* @param limit pointer past the last character of the rule.
1212	* @return the position index after the final ';' of the pragma,
1213	* or -1 on failure.
1214	*/
1215	int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
1216	int32_t array[`2`];
1217
1218	// resemblesPragma() has already returned true, so we
1219	// know that pos points to /use\s/i; we can skip 4 characters
1220	// immediately
1221	pos += `4`;
1222
1223	// Here are the pragmas we recognize:
1224	// use variable range 0xE000 0xEFFF;
1225	// use maximum backup 16;
1226	// use nfd rules;
1227	// use nfc rules;
1228	int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString (TRUE, PRAGMA_VARIABLE_RANGE, -`1`), array);
1229	if (p >= `0`) {
1230	setVariableRange(array[`0`], array[`1`], status);
1231	return p;
1232	}
1233
1234	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString (TRUE, PRAGMA_MAXIMUM_BACKUP, -`1`), array);
1235	if (p >= `0`) {
1236	pragmaMaximumBackup(array[`0`]);
1237	return p;
1238	}
1239
1240	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString (TRUE, PRAGMA_NFD_RULES, -`1`), NULL);
1241	if (p >= `0`) {
1242	pragmaNormalizeRules(UNORM_NFD);
1243	return p;
1244	}
1245
1246	p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString (TRUE, PRAGMA_NFC_RULES, -`1`), NULL);
1247	if (p >= `0`) {
1248	pragmaNormalizeRules(UNORM_NFC);
1249	return p;
1250	}
1251
1252	// Syntax error: unable to parse pragma
1253	return -`1`;
1254	}
1255
1256	/**
1257	* MAIN PARSER. Parse the next rule in the given rule string, starting
1258	* at pos. Return the index after the last character parsed. Do not
1259	* parse characters at or after limit.
1260	*
1261	* Important: The character at pos must be a non-whitespace character
1262	* that is not the comment character.
1263	*
1264	* This method handles quoting, escaping, and whitespace removal. It
1265	* parses the end-of-rule character. It recognizes context and cursor
1266	* indicators. Once it does a lexical breakdown of the rule at pos, it
1267	* creates a rule object and adds it to our rule list.
1268	*/
1269	int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
1270	// Locate the left side, operator, and right side
1271	int32_t start = pos;
1272	UChar op = `0`;
1273	int32_t i;
1274
1275	// Set up segments data
1276	segmentStandins.truncate(`0`);
1277	segmentObjects.removeAllElements();
1278
1279	// Use pointers to automatics to make swapping possible.
1280	RuleHalf _left(*this), _right(*this);
1281	RuleHalf* left = &_left;
1282	RuleHalf* right = &_right;
1283
1284	undefinedVariableName.remove();
1285	pos = left->parse(rule, pos, limit, status);
1286	if (U_FAILURE(status)) {
1287	return start;
1288	}
1289
1290	if (pos == limit \|\| u_strchr(gOPERATORS, (op = rule.charAt(--pos))) == NULL) {
1291	return syntaxError(U_MISSING_OPERATOR, rule, start, status);
1292	}
1293	++pos;
1294
1295	// Found an operator char. Check for forward-reverse operator.
1296	if (op == REVERSE_RULE_OP &&
1297	(pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
1298	++pos;
1299	op = FWDREV_RULE_OP;
1300	}
1301
1302	// Translate alternate op characters.
1303	switch (op) {
1304	case ALT_FORWARD_RULE_OP:
1305	op = FORWARD_RULE_OP;
1306	break;
1307	case ALT_REVERSE_RULE_OP:
1308	op = REVERSE_RULE_OP;
1309	break;
1310	case ALT_FWDREV_RULE_OP:
1311	op = FWDREV_RULE_OP;
1312	break;
1313	}
1314
1315	pos = right->parse(rule, pos, limit, status);
1316	if (U_FAILURE(status)) {
1317	return start;
1318	}
1319
1320	if (pos < limit) {
1321	if (rule.charAt(--pos) == END_OF_RULE) {
1322	++pos;
1323	} else {
1324	// RuleHalf parser must have terminated at an operator
1325	return syntaxError(U_UNQUOTED_SPECIAL, rule, start, status);
1326	}
1327	}
1328
1329	if (op == VARIABLE_DEF_OP) {
1330	// LHS is the name. RHS is a single character, either a literal
1331	// or a set (already parsed). If RHS is longer than one
1332	// character, it is either a multi-character string, or multiple
1333	// sets, or a mixture of chars and sets -- syntax error.
1334
1335	// We expect to see a single undefined variable (the one being
1336	// defined).
1337	if (undefinedVariableName.length() == `0`) {
1338	// "Missing '$' or duplicate definition"
1339	return syntaxError(U_BAD_VARIABLE_DEFINITION, rule, start, status);
1340	}
1341	if (left->text.length() != `1` \|\| left->text.charAt(`0`) != variableLimit) {
1342	// "Malformed LHS"
1343	return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
1344	}
1345	if (left->anchorStart \|\| left->anchorEnd \|\|
1346	right->anchorStart \|\| right->anchorEnd) {
1347	return syntaxError(U_MALFORMED_VARIABLE_DEFINITION, rule, start, status);
1348	}
1349	// We allow anything on the right, including an empty string.
1350	UnicodeString* value = new UnicodeString (right->text);
1351	// NULL pointer check
1352	if (value == NULL) {
1353	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
1354	}
1355	variableNames.put(undefinedVariableName, value, status);
1356	++variableLimit;
1357	return pos;
1358	}
1359
1360	// If this is not a variable definition rule, we shouldn't have
1361	// any undefined variable names.
1362	if (undefinedVariableName.length() != `0`) {
1363	return syntaxError(// "Undefined variable $" + undefinedVariableName,
1364	U_UNDEFINED_VARIABLE,
1365	rule, start, status);
1366	}
1367
1368	// Verify segments
1369	if (segmentStandins.length() > segmentObjects.size()) {
1370	syntaxError(U_UNDEFINED_SEGMENT_REFERENCE, rule, start, status);
1371	}
1372	for (i=`0`; i<segmentStandins.length(); ++i) {
1373	if (segmentStandins.charAt(i) == `0`) {
1374	syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
1375	}
1376	}
1377	for (i=`0`; i<segmentObjects.size(); ++i) {
1378	if (segmentObjects.elementAt(i) == NULL) {
1379	syntaxError(U_INTERNAL_TRANSLITERATOR_ERROR, rule, start, status); // will never happen
1380	}
1381	}
1382
1383	// If the direction we want doesn't match the rule
1384	// direction, do nothing.
1385	if (op != FWDREV_RULE_OP &&
1386	((direction == UTRANS_FORWARD) != (op == FORWARD_RULE_OP))) {
1387	return pos;
1388	}
1389
1390	// Transform the rule into a forward rule by swapping the
1391	// sides if necessary.
1392	if (direction == UTRANS_REVERSE) {
1393	left = &_right;
1394	right = &_left;
1395	}
1396
1397	// Remove non-applicable elements in forward-reverse
1398	// rules. Bidirectional rules ignore elements that do not
1399	// apply.
1400	if (op == FWDREV_RULE_OP) {
1401	right->removeContext();
1402	left->cursor = -`1`;
1403	left->cursorOffset = `0`;
1404	}
1405
1406	// Normalize context
1407	if (left->ante < `0`) {
1408	left->ante = `0`;
1409	}
1410	if (left->post < `0`) {
1411	left->post = left->text.length();
1412	}
1413
1414	// Context is only allowed on the input side. Cursors are only
1415	// allowed on the output side. Segment delimiters can only appear
1416	// on the left, and references on the right. Cursor offset
1417	// cannot appear without an explicit cursor. Cursor offset
1418	// cannot place the cursor outside the limits of the context.
1419	// Anchors are only allowed on the input side.
1420	if (right->ante >= `0` \|\| right->post >= `0` \|\| left->cursor >= `0` \|\|
1421	(right->cursorOffset != `0` && right->cursor < `0`) \|\|
1422	// - The following two checks were used to ensure that the
1423	// - the cursor offset stayed within the ante- or postcontext.
1424	// - However, with the addition of quantifiers, we have to
1425	// - allow arbitrary cursor offsets and do runtime checking.
1426	//(right->cursorOffset > (left->text.length() - left->post)) \|\|
1427	//(-right->cursorOffset > left->ante) \|\|
1428	right->anchorStart \|\| right->anchorEnd \|\|
1429	!left->isValidInput(*this) \|\| !right->isValidOutput(*this) \|\|
1430	left->ante > left->post) {
1431
1432	return syntaxError(U_MALFORMED_RULE, rule, start, status);
1433	}
1434
1435	// Flatten segment objects vector to an array
1436	UnicodeFunctor** segmentsArray = NULL;
1437	if (segmentObjects.size() > `0`) {
1438	segmentsArray = (UnicodeFunctor *)uprv_malloc(segmentObjects.size() sizeof(UnicodeFunctor *));
1439	// Null pointer check
1440	if (segmentsArray == NULL) {
1441	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
1442	}
1443	segmentObjects.toArray((void**) segmentsArray);
1444	}
1445	TransliterationRule* temptr = new TransliterationRule (
1446	left->text, left->ante, left->post,
1447	right->text, right->cursor, right->cursorOffset,
1448	segmentsArray,
1449	segmentObjects.size(),
1450	left->anchorStart, left->anchorEnd,
1451	curData,
1452	status);
1453	//Null pointer check
1454	if (temptr == NULL) {
1455	uprv_free(segmentsArray);
1456	return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
1457	}
1458
1459	curData->ruleSet.addRule(temptr, status);
1460
1461	return pos;
1462	}
1463
1464	/**
1465	* Called by main parser upon syntax error. Search the rule string
1466	* for the probable end of the rule. Of course, if the error is that
1467	* the end of rule marker is missing, then the rule end will not be found.
1468	* In any case the rule start will be correctly reported.
1469	* @param msg error description
1470	* @param rule pattern string
1471	* @param start position of first character of current rule
1472	*/
1473	int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
1474	const UnicodeString& rule,
1475	int32_t pos,
1476	UErrorCode& status)
1477	{
1478	parseError.offset = pos;
1479	parseError.line = `0` ; / we are not using line numbers /
1480
1481	// for pre-context
1482	const int32_t LEN = U_PARSE_CONTEXT_LEN - `1`;
1483	int32_t start = uprv_max(pos - LEN, `0`);
1484	int32_t stop = pos;
1485
1486	rule.extract(start,stop-start,parseError.preContext);
1487	//null terminate the buffer
1488	parseError.preContext[stop-start] = `0`;
1489
1490	//for post-context
1491	start = pos;
1492	stop = uprv_min(pos + LEN, rule.length());
1493
1494	rule.extract(start,stop-start,parseError.postContext);
1495	//null terminate the buffer
1496	parseError.postContext[stop-start]= `0`;
1497
1498	status = (UErrorCode)parseErrorCode;
1499	return pos;
1500
1501	}
1502
1503	/**
1504	* Parse a UnicodeSet out, store it, and return the stand-in character
1505	* used to represent it.
1506	*/
1507	UChar TransliteratorParser::parseSet(const UnicodeString& rule,
1508	ParsePosition& pos,
1509	UErrorCode& status) {
1510	UnicodeSet* set = new UnicodeSet (rule, pos, USET_IGNORE_SPACE, parseData, status);
1511	// Null pointer check
1512	if (set == NULL) {
1513	status = U_MEMORY_ALLOCATION_ERROR;
1514	return (UChar)`0x0000`; // Return empty character with error.
1515	}
1516	set->compact();
1517	return generateStandInFor(set, status);
1518	}
1519
1520	/**
1521	* Generate and return a stand-in for a new UnicodeFunctor. Store
1522	* the matcher (adopt it).
1523	*/
1524	UChar TransliteratorParser::generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status) {
1525	// assert(obj != null);
1526
1527	// Look up previous stand-in, if any. This is a short list
1528	// (typical n is 0, 1, or 2); linear search is optimal.
1529	for (int32_t i=`0`; i<variablesVector.size(); ++i) {
1530	if (variablesVector.elementAt(i) == adopted) { // [sic] pointer comparison
1531	return (UChar) (curData->variablesBase + i);
1532	}
1533	}
1534
1535	if (variableNext >= variableLimit) {
1536	delete adopted;
1537	status = U_VARIABLE_RANGE_EXHAUSTED;
1538	return `0`;
1539	}
1540	variablesVector.addElement(adopted, status);
1541	return variableNext++;
1542	}
1543
1544	/**
1545	* Return the standin for segment seg (1-based).
1546	*/
1547	UChar TransliteratorParser::getSegmentStandin(int32_t seg, UErrorCode& status) {
1548	// Special character used to indicate an empty spot
1549	UChar empty = curData->variablesBase - `1`;
1550	while (segmentStandins.length() < seg) {
1551	segmentStandins.append(empty);
1552	}
1553	UChar c = segmentStandins.charAt(seg-`1`);
1554	if (c == empty) {
1555	if (variableNext >= variableLimit) {
1556	status = U_VARIABLE_RANGE_EXHAUSTED;
1557	return `0`;
1558	}
1559	c = variableNext++;
1560	// Set a placeholder in the master variables vector that will be
1561	// filled in later by setSegmentObject(). We know that we will get
1562	// called first because setSegmentObject() will call us.
1563	variablesVector.addElement((void*) NULL, status);
1564	segmentStandins.setCharAt(seg-`1`, c);
1565	}
1566	return c;
1567	}
1568
1569	/**
1570	* Set the object for segment seg (1-based).
1571	*/
1572	void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status) {
1573	// Since we call parseSection() recursively, nested
1574	// segments will result in segment i+1 getting parsed
1575	// and stored before segment i; be careful with the
1576	// vector handling here.
1577	if (segmentObjects.size() < seg) {
1578	segmentObjects.setSize(seg, status);
1579	}
1580	int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;
1581	if (segmentObjects.elementAt(seg-`1`) != NULL \|\|
1582	variablesVector.elementAt(index) != NULL) {
1583	// should never happen
1584	status = U_INTERNAL_TRANSLITERATOR_ERROR;
1585	return;
1586	}
1587	segmentObjects.setElementAt(adopted, seg-`1`);
1588	variablesVector.setElementAt(adopted, index);
1589	}
1590
1591	/**
1592	* Return the stand-in for the dot set. It is allocated the first
1593	* time and reused thereafter.
1594	*/
1595	UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {
1596	if (dotStandIn == (UChar) -`1`) {
1597	UnicodeSet* tempus = new UnicodeSet (UnicodeString (TRUE, DOT_SET, -`1`), status);
1598	// Null pointer check.
1599	if (tempus == NULL) {
1600	status = U_MEMORY_ALLOCATION_ERROR;
1601	return (UChar)`0x0000`;
1602	}
1603	dotStandIn = generateStandInFor(tempus, status);
1604	}
1605	return dotStandIn;
1606	}
1607
1608	/**
1609	* Append the value of the given variable name to the given
1610	* UnicodeString.
1611	*/
1612	void TransliteratorParser::appendVariableDef(const UnicodeString& name,
1613	UnicodeString& buf,
1614	UErrorCode& status) {
1615	const UnicodeString* s = (const UnicodeString*) variableNames.get(name);
1616	if (s == NULL) {
1617	// We allow one undefined variable so that variable definition
1618	// statements work. For the first undefined variable we return
1619	// the special placeholder variableLimit-1, and save the variable
1620	// name.
1621	if (undefinedVariableName.length() == `0`) {
1622	undefinedVariableName = name;
1623	if (variableNext >= variableLimit) {
1624	// throw new RuntimeException("Private use variables exhausted");
1625	status = U_ILLEGAL_ARGUMENT_ERROR;
1626	return;
1627	}
1628	buf.append((UChar) --variableLimit);
1629	} else {
1630	//throw new IllegalArgumentException("Undefined variable $"
1631	// + name);
1632	status = U_ILLEGAL_ARGUMENT_ERROR;
1633	return;
1634	}
1635	} else {
1636	buf.append(*s);
1637	}
1638	}
1639
/**
 * Glue method to get around access restrictions in C++.
 */
/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
    return Transliterator::createBasicInstance(id, canonID);
}*/
1646
1647	U_NAMESPACE_END
1648
1649	U_CAPI int32_t
1650	utrans_stripRules(const UChar source, int32_t sourceLen, UChar target, UErrorCode *status) {
1651	U_NAMESPACE_USE
1652
1653	//const UChar sourceStart = source;*
1654	const UChar *targetStart = target;
1655	const UChar *sourceLimit = source+sourceLen;
1656	UChar *targetLimit = target+sourceLen;
1657	UChar32 c = `0`;
1658	UBool quoted = FALSE;
1659	int32_t index;
1660
1661	uprv_memset(target, `0`, sourceLen*U_SIZEOF_UCHAR);
1662
1663	/ read the rules into the buffer /
1664	while (source < sourceLimit)
1665	{
1666	index=`0`;
1667	U16_NEXT_UNSAFE(source, index, c);
1668	source+=index;
1669	if(c == QUOTE) {
1670	quoted = (UBool)!quoted;
1671	}
1672	else if (!quoted) {
1673	if (c == RULE_COMMENT_CHAR) {
1674	/ skip comments and all preceding spaces /
1675	while (targetStart < target && *(target - `1`) == `0x0020`) {
1676	target--;
1677	}
1678	do {
1679	if (source == sourceLimit) {
1680	c = U_SENTINEL;
1681	break;
1682	}
1683	c = *(source++);
1684	}
1685	while (c != CR && c != LF);
1686	if (c < `0`) {
1687	break;
1688	}
1689	}
1690	else if (c == ESCAPE && source < sourceLimit) {
1691	UChar32 c2 = *source;
1692	if (c2 == CR \|\| c2 == LF) {
1693	/ A backslash at the end of a line. /
1694	/ Since we're stripping lines, ignore the backslash. /
1695	source++;
1696	continue;
1697	}
1698	if (c2 == `0x0075` && source+`5` < sourceLimit) { / \u seen. \U isn't unescaped. /
1699	int32_t escapeOffset = `0`;
1700	UnicodeString escapedStr(source, `5`);
1701	c2 = escapedStr.unescapeAt(escapeOffset);
1702
1703	if (c2 == (UChar32)`0xFFFFFFFF` \|\| escapeOffset == `0`)
1704	{
1705	*status = U_PARSE_ERROR;
1706	return `0`;
1707	}
1708	if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
1709	/ It was escaped for a reason. Write what it was suppose to be. /
1710	source+=`5`;
1711	c = c2;
1712	}
1713	}
1714	else if (c2 == QUOTE) {
1715	/ \' seen. Make sure we don't do anything when we see it again. /
1716	quoted = (UBool)!quoted;
1717	}
1718	}
1719	}
1720	if (c == CR \|\| c == LF)
1721	{
1722	/ ignore spaces carriage returns, and all leading spaces on the next line.*
1723	* and line feed unless in the form \uXXXX
1724	*/
1725	quoted = FALSE;
1726	while (source < sourceLimit) {
1727	c = *(source);
1728	if (c != CR && c != LF && c != `0x0020`) {
1729	break;
1730	}
1731	source++;
1732	}
1733	continue;
1734	}
1735
1736	/ Append UChar * after dissembling if c > 0xffff/
1737	index=`0`;
1738	U16_APPEND_UNSAFE(target, index, c);
1739	target+=index;
1740	}
1741	if (target < targetLimit) {
1742	*target = `0`;
1743	}
1744	return (int32_t)(target-targetStart);
1745	}
1746
1747	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
1748

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/rbt_pars.cpp