1/*-------------------------------------------------------------------------
2 *
3 * wparser_def.c
4 * Default text search parser
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/wparser_def.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15#include "postgres.h"
16
17#include <limits.h>
18
19#include "catalog/pg_collation.h"
20#include "commands/defrem.h"
21#include "tsearch/ts_locale.h"
22#include "tsearch/ts_public.h"
23#include "tsearch/ts_type.h"
24#include "tsearch/ts_utils.h"
25#include "utils/builtins.h"
26
27
28/* Define me to enable tracing of parser behavior */
29/* #define WPARSER_TRACE */
30
31
/*
 * Output token categories.
 *
 * Each category number doubles as the index of its entry in tok_alias[] and
 * lex_descr[] below.  No category uses number 0, so slot 0 of both tables
 * holds an empty string.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

#define LASTNUM			23

/* Short alias of every token category, indexed by category number */
static const char *const tok_alias[] = {
	[0] = "",
	[ASCIIWORD] = "asciiword",
	[WORD_T] = "word",
	[NUMWORD] = "numword",
	[EMAIL] = "email",
	[URL_T] = "url",
	[HOST] = "host",
	[SCIENTIFIC] = "sfloat",
	[VERSIONNUMBER] = "version",
	[NUMPARTHWORD] = "hword_numpart",
	[PARTHWORD] = "hword_part",
	[ASCIIPARTHWORD] = "hword_asciipart",
	[SPACE] = "blank",
	[TAG_T] = "tag",
	[PROTOCOL] = "protocol",
	[NUMHWORD] = "numhword",
	[ASCIIHWORD] = "asciihword",
	[HWORD] = "hword",
	[URLPATH] = "url_path",
	[FILEPATH] = "file",
	[DECIMAL_T] = "float",
	[SIGNEDINT] = "int",
	[UNSIGNEDINT] = "uint",
	[XMLENTITY] = "entity"
};

/* Human-readable description of every token category, same indexing */
static const char *const lex_descr[] = {
	[0] = "",
	[ASCIIWORD] = "Word, all ASCII",
	[WORD_T] = "Word, all letters",
	[NUMWORD] = "Word, letters and digits",
	[EMAIL] = "Email address",
	[URL_T] = "URL",
	[HOST] = "Host",
	[SCIENTIFIC] = "Scientific notation",
	[VERSIONNUMBER] = "Version number",
	[NUMPARTHWORD] = "Hyphenated word part, letters and digits",
	[PARTHWORD] = "Hyphenated word part, all letters",
	[ASCIIPARTHWORD] = "Hyphenated word part, all ASCII",
	[SPACE] = "Space symbols",
	[TAG_T] = "XML tag",
	[PROTOCOL] = "Protocol head",
	[NUMHWORD] = "Hyphenated word, letters and digits",
	[ASCIIHWORD] = "Hyphenated word, all ASCII",
	[HWORD] = "Hyphenated word, all letters",
	[URLPATH] = "URL path",
	[FILEPATH] = "File or path name",
	[DECIMAL_T] = "Decimal notation",
	[SIGNEDINT] = "Signed integer",
	[UNSIGNEDINT] = "Unsigned integer",
	[XMLENTITY] = "XML entity"
};
113
114
115/* Parser states */
116
/*
 * States of the parser's state machine.
 *
 * TPS_Base (0) is the state every parser starts in (see TParserInit and
 * TParserCopyInit).  TPS_Null is not a real state: it only marks the end of
 * the list, and the action tables use it as the "tostate" of rows that do not
 * name a new state.
 *
 * NOTE(review): the numeric values are presumably used to index a per-state
 * table defined later in this file, so enumerators should not be reordered or
 * inserted without updating that table -- confirm before changing.
 */
typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;
198
/* forward declaration, so the callback types below can mention TParser */
struct TParser;

/*
 * Callback types stored in the action tables:
 *
 * TParserCharTest - a test on the parser's current position; any of the
 *		p_is* functions except p_iseq (which takes an extra argument).
 *		The p_is* functions return nonzero when the test succeeds.
 * TParserSpecial - special handler for special cases (the Special*
 *		functions), which adjusts the parser in place and returns nothing.
 */
typedef int (*TParserCharTest) (struct TParser *);	/* any p_is* functions
													 * except p_iseq */
typedef void (*TParserSpecial) (struct TParser *);	/* special handler for
													 * special cases... */
206
/*
 * One row of a state's action table.
 *
 * The tables in this file initialize rows positionally, so the order of the
 * fields below must not change.
 */
typedef struct
{
	TParserCharTest isclass;	/* test to apply at the current position;
								 * NULL in the last row of every table */
	char		c;				/* character argument for p_iseqC/p_isneC
								 * rows, 0 otherwise.  NOTE(review):
								 * presumably copied into TParser.c before
								 * isclass is called -- confirm */
	uint16		flags;			/* OR of A_* flag bits */
	TParserState tostate;		/* state named by this row; TPS_Null when the
								 * row names none */
	int			type;			/* token category (ASCIIWORD etc); nonzero
								 * only in A_BINGO rows of the tables */
	TParserSpecial special;		/* optional special handler, or NULL */
} TParserStateActionItem;
216
/*
 * Flag bits in TParserStateActionItem.flags
 *
 * A_NEXT is zero, i.e. the absence of every other flag; it cannot be tested
 * with a bitwise AND.
 *
 * NOTE(review): the per-flag notes below are inferred from the flag names and
 * from how the action tables use them; the code that interprets the flags is
 * not in this part of the file -- confirm against TParserGet.
 */
#define A_NEXT		0x0000		/* presumably: just move to the next char */
#define A_BINGO		0x0001		/* presumably: a token of the row's type is
								 * complete */
#define A_POP		0x0002		/* presumably: drop the current stack entry
								 * and go back to the pushed one */
#define A_PUSH		0x0004		/* presumably: push a new stack entry before
								 * changing state */
#define A_RERUN		0x0008		/* presumably: re-examine the same position
								 * in the new state */
#define A_CLEAR		0x0010		/* presumably: discard the previous stack
								 * entry */
#define A_MERGE		0x0020		/* presumably: merge with the previous stack
								 * entry */
#define A_CLRALL	0x0040		/* presumably: discard all previous stack
								 * entries */
226
/*
 * One entry in a parser's linked stack of positions.
 *
 * newTParserPosition() creates an entry as a copy of the one it is given and
 * links back to it through "prev"; TParser.state points at the newest entry.
 */
typedef struct TParserPosition
{
	int			posbyte;		/* position of parser in bytes */
	int			poschar;		/* position of parser in characters */
	int			charlen;		/* length of current char */
	int			lenbytetoken;	/* length of token-so-far in bytes */
	int			lenchartoken;	/* and in chars */
	TParserState state;			/* state-machine state at this position */
	struct TParserPosition *prev;	/* older entry, or NULL for the first */
	const TParserStateActionItem *pushedAtAction;	/* always NULL in a fresh
													 * entry.  NOTE(review):
													 * presumably the table
													 * row that pushed this
													 * entry -- confirm */
} TParserPosition;
238
/*
 * Complete state of one parser instance.
 *
 * Created by TParserInit(), or by TParserCopyInit() for a parser that borrows
 * the string buffers of another one.
 */
typedef struct TParser
{
	/* string and position information */
	char	   *str;			/* multibyte string */
	int			lenstr;			/* length of mbstring */
	wchar_t    *wstr;			/* wide character string; set only when
								 * usewide is true and the locale is not C */
	pg_wchar   *pgwstr;			/* wide character string for C-locale; set
								 * only when usewide is true */
	bool		usewide;		/* true if max encoding length > 1 */

	/* State of parse */
	int			charmaxlen;		/* pg_database_encoding_max_length() */
	TParserPosition *state;		/* newest entry of the position stack */
	bool		ignore;			/* set by SpecialTags() on <script / <style,
								 * cleared on </script / </style; tested by
								 * p_isignore() */
	bool		wanthost;		/* set by SpecialFURL() and p_ishost();
								 * tested and cleared by p_isstophost() */

	/* silly char */
	char		c;				/* character that p_iseqC()/p_isneC() compare
								 * the current character against */

	/* out */
	char	   *token;
	int			lenbytetoken;
	int			lenchartoken;
	int			type;
} TParser;
263
264
265/* forward decls here */
266static bool TParserGet(TParser *prs);
267
268
269static TParserPosition *
270newTParserPosition(TParserPosition *prev)
271{
272 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273
274 if (prev)
275 memcpy(res, prev, sizeof(TParserPosition));
276 else
277 memset(res, 0, sizeof(TParserPosition));
278
279 res->prev = prev;
280
281 res->pushedAtAction = NULL;
282
283 return res;
284}
285
286static TParser *
287TParserInit(char *str, int len)
288{
289 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290
291 prs->charmaxlen = pg_database_encoding_max_length();
292 prs->str = str;
293 prs->lenstr = len;
294
295 /*
296 * Use wide char code only when max encoding length > 1.
297 */
298 if (prs->charmaxlen > 1)
299 {
300 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
301 pg_locale_t mylocale = 0; /* TODO */
302
303 prs->usewide = true;
304 if (lc_ctype_is_c(collation))
305 {
306 /*
307 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308 * be different from sizeof(wchar_t)
309 */
310 prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312 }
313 else
314 {
315 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316 char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317 mylocale);
318 }
319 }
320 else
321 prs->usewide = false;
322
323 prs->state = newTParserPosition(NULL);
324 prs->state->state = TPS_Base;
325
326#ifdef WPARSER_TRACE
327
328 /*
329 * Use of %.*s here is a bit risky since it can misbehave if the data is
330 * not in what libc thinks is the prevailing encoding. However, since
331 * this is just a debugging aid, we choose to live with that.
332 */
333 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
334#endif
335
336 return prs;
337}
338
339/*
340 * As an alternative to a full TParserInit one can create a
341 * TParserCopy which basically is a regular TParser without a private
342 * copy of the string - instead it uses the one from another TParser.
343 * This is useful because at some places TParsers are created
344 * recursively and the repeated copying around of the strings can
345 * cause major inefficiency if the source string is long.
346 * The new parser starts parsing at the original's current position.
347 *
348 * Obviously one must not close the original TParser before the copy.
349 */
350static TParser *
351TParserCopyInit(const TParser *orig)
352{
353 TParser *prs = (TParser *) palloc0(sizeof(TParser));
354
355 prs->charmaxlen = orig->charmaxlen;
356 prs->str = orig->str + orig->state->posbyte;
357 prs->lenstr = orig->lenstr - orig->state->posbyte;
358 prs->usewide = orig->usewide;
359
360 if (orig->pgwstr)
361 prs->pgwstr = orig->pgwstr + orig->state->poschar;
362 if (orig->wstr)
363 prs->wstr = orig->wstr + orig->state->poschar;
364
365 prs->state = newTParserPosition(NULL);
366 prs->state->state = TPS_Base;
367
368#ifdef WPARSER_TRACE
369 /* See note above about %.*s */
370 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
371#endif
372
373 return prs;
374}
375
376
377static void
378TParserClose(TParser *prs)
379{
380 while (prs->state)
381 {
382 TParserPosition *ptr = prs->state->prev;
383
384 pfree(prs->state);
385 prs->state = ptr;
386 }
387
388 if (prs->wstr)
389 pfree(prs->wstr);
390 if (prs->pgwstr)
391 pfree(prs->pgwstr);
392
393#ifdef WPARSER_TRACE
394 fprintf(stderr, "closing parser\n");
395#endif
396 pfree(prs);
397}
398
399/*
400 * Close a parser created with TParserCopyInit
401 */
402static void
403TParserCopyClose(TParser *prs)
404{
405 while (prs->state)
406 {
407 TParserPosition *ptr = prs->state->prev;
408
409 pfree(prs->state);
410 prs->state = ptr;
411 }
412
413#ifdef WPARSER_TRACE
414 fprintf(stderr, "closing parser copy\n");
415#endif
416 pfree(prs);
417}
418
419
420/*
421 * Character-type support functions, equivalent to is* macros, but
422 * working with any possible encodings and locales. Notes:
423 * - with multibyte encoding and C-locale isw* function may fail
424 * or give wrong result.
425 * - multibyte encoding and C-locale often are used for
426 * Asian languages.
427 * - if locale is C then we use pgwstr instead of wstr.
428 */
429
430#define p_iswhat(type, nonascii) \
431 \
432static int \
433p_is##type(TParser *prs) \
434{ \
435 Assert(prs->state); \
436 if (prs->usewide) \
437 { \
438 if (prs->pgwstr) \
439 { \
440 unsigned int c = *(prs->pgwstr + prs->state->poschar); \
441 if (c > 0x7f) \
442 return nonascii; \
443 return is##type(c); \
444 } \
445 return isw##type(*(prs->wstr + prs->state->poschar)); \
446 } \
447 return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
448} \
449 \
450static int \
451p_isnot##type(TParser *prs) \
452{ \
453 return !p_is##type(prs); \
454}
455
456/*
457 * In C locale with a multibyte encoding, any non-ASCII symbol is considered
458 * an alpha character, but not a member of other char classes.
459 */
460p_iswhat(alnum, 1)
461p_iswhat(alpha, 1)
462p_iswhat(digit, 0)
463p_iswhat(lower, 0)
464p_iswhat(print, 0)
465p_iswhat(punct, 0)
466p_iswhat(space, 0)
467p_iswhat(upper, 0)
468p_iswhat(xdigit, 0)
469
470/* p_iseq should be used only for ascii symbols */
471
472static int
473p_iseq(TParser *prs, char c)
474{
475 Assert(prs->state);
476 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
477}
478
479static int
480p_isEOF(TParser *prs)
481{
482 Assert(prs->state);
483 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484}
485
486static int
487p_iseqC(TParser *prs)
488{
489 return p_iseq(prs, prs->c);
490}
491
492static int
493p_isneC(TParser *prs)
494{
495 return !p_iseq(prs, prs->c);
496}
497
498static int
499p_isascii(TParser *prs)
500{
501 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502}
503
504static int
505p_isasclet(TParser *prs)
506{
507 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508}
509
510static int
511p_isurlchar(TParser *prs)
512{
513 char ch;
514
515 /* no non-ASCII need apply */
516 if (prs->state->charlen != 1)
517 return 0;
518 ch = *(prs->str + prs->state->posbyte);
519 /* no spaces or control characters */
520 if (ch <= 0x20 || ch >= 0x7F)
521 return 0;
522 /* reject characters disallowed by RFC 3986 */
523 switch (ch)
524 {
525 case '"':
526 case '<':
527 case '>':
528 case '\\':
529 case '^':
530 case '`':
531 case '{':
532 case '|':
533 case '}':
534 return 0;
535 }
536 return 1;
537}
538
539
/*
 * Reference every character-test function defined above, so that the
 * compiler does not complain about unused static functions.  Every call
 * passes NULL, so this function exists only to be compiled and must not
 * actually be called.
 */
void		_make_compiler_happy(void);

void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
567
568
569static void
570SpecialTags(TParser *prs)
571{
572 switch (prs->state->lenchartoken)
573 {
574 case 8: /* </script */
575 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
576 prs->ignore = false;
577 break;
578 case 7: /* <script || </style */
579 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
580 prs->ignore = false;
581 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
582 prs->ignore = true;
583 break;
584 case 6: /* <style */
585 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
586 prs->ignore = true;
587 break;
588 default:
589 break;
590 }
591}
592
593static void
594SpecialFURL(TParser *prs)
595{
596 prs->wanthost = true;
597 prs->state->posbyte -= prs->state->lenbytetoken;
598 prs->state->poschar -= prs->state->lenchartoken;
599}
600
601static void
602SpecialHyphen(TParser *prs)
603{
604 prs->state->posbyte -= prs->state->lenbytetoken;
605 prs->state->poschar -= prs->state->lenchartoken;
606}
607
608static void
609SpecialVerVersion(TParser *prs)
610{
611 prs->state->posbyte -= prs->state->lenbytetoken;
612 prs->state->poschar -= prs->state->lenchartoken;
613 prs->state->lenbytetoken = 0;
614 prs->state->lenchartoken = 0;
615}
616
617static int
618p_isstophost(TParser *prs)
619{
620 if (prs->wanthost)
621 {
622 prs->wanthost = false;
623 return 1;
624 }
625 return 0;
626}
627
628static int
629p_isignore(TParser *prs)
630{
631 return (prs->ignore) ? 1 : 0;
632}
633
634static int
635p_ishost(TParser *prs)
636{
637 TParser *tmpprs = TParserCopyInit(prs);
638 int res = 0;
639
640 tmpprs->wanthost = true;
641
642 if (TParserGet(tmpprs) && tmpprs->type == HOST)
643 {
644 prs->state->posbyte += tmpprs->lenbytetoken;
645 prs->state->poschar += tmpprs->lenchartoken;
646 prs->state->lenbytetoken += tmpprs->lenbytetoken;
647 prs->state->lenchartoken += tmpprs->lenchartoken;
648 prs->state->charlen = tmpprs->state->charlen;
649 res = 1;
650 }
651 TParserCopyClose(tmpprs);
652
653 return res;
654}
655
656static int
657p_isURLPath(TParser *prs)
658{
659 TParser *tmpprs = TParserCopyInit(prs);
660 int res = 0;
661
662 tmpprs->state = newTParserPosition(tmpprs->state);
663 tmpprs->state->state = TPS_InURLPathFirst;
664
665 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
666 {
667 prs->state->posbyte += tmpprs->lenbytetoken;
668 prs->state->poschar += tmpprs->lenchartoken;
669 prs->state->lenbytetoken += tmpprs->lenbytetoken;
670 prs->state->lenchartoken += tmpprs->lenchartoken;
671 prs->state->charlen = tmpprs->state->charlen;
672 res = 1;
673 }
674 TParserCopyClose(tmpprs);
675
676 return res;
677}
678
679/*
680 * returns true if current character has zero display length or
681 * it's a special sign in several languages. Such characters
682 * aren't a word-breaker although they aren't an isalpha.
683 * In beginning of word they aren't a part of it.
684 */
685static int
686p_isspecial(TParser *prs)
687{
688 /*
689 * pg_dsplen could return -1 which means error or control character
690 */
691 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
692 return 1;
693
694 /*
695 * Unicode Characters in the 'Mark, Spacing Combining' Category That
696 * characters are not alpha although they are not breakers of word too.
697 * Check that only in utf encoding, because other encodings aren't
698 * supported by postgres or even exists.
699 */
700 if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
701 {
702 static const pg_wchar strange_letter[] = {
703 /*
704 * use binary search, so elements should be ordered
705 */
706 0x0903, /* DEVANAGARI SIGN VISARGA */
707 0x093E, /* DEVANAGARI VOWEL SIGN AA */
708 0x093F, /* DEVANAGARI VOWEL SIGN I */
709 0x0940, /* DEVANAGARI VOWEL SIGN II */
710 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
711 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
712 0x094B, /* DEVANAGARI VOWEL SIGN O */
713 0x094C, /* DEVANAGARI VOWEL SIGN AU */
714 0x0982, /* BENGALI SIGN ANUSVARA */
715 0x0983, /* BENGALI SIGN VISARGA */
716 0x09BE, /* BENGALI VOWEL SIGN AA */
717 0x09BF, /* BENGALI VOWEL SIGN I */
718 0x09C0, /* BENGALI VOWEL SIGN II */
719 0x09C7, /* BENGALI VOWEL SIGN E */
720 0x09C8, /* BENGALI VOWEL SIGN AI */
721 0x09CB, /* BENGALI VOWEL SIGN O */
722 0x09CC, /* BENGALI VOWEL SIGN AU */
723 0x09D7, /* BENGALI AU LENGTH MARK */
724 0x0A03, /* GURMUKHI SIGN VISARGA */
725 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
726 0x0A3F, /* GURMUKHI VOWEL SIGN I */
727 0x0A40, /* GURMUKHI VOWEL SIGN II */
728 0x0A83, /* GUJARATI SIGN VISARGA */
729 0x0ABE, /* GUJARATI VOWEL SIGN AA */
730 0x0ABF, /* GUJARATI VOWEL SIGN I */
731 0x0AC0, /* GUJARATI VOWEL SIGN II */
732 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
733 0x0ACB, /* GUJARATI VOWEL SIGN O */
734 0x0ACC, /* GUJARATI VOWEL SIGN AU */
735 0x0B02, /* ORIYA SIGN ANUSVARA */
736 0x0B03, /* ORIYA SIGN VISARGA */
737 0x0B3E, /* ORIYA VOWEL SIGN AA */
738 0x0B40, /* ORIYA VOWEL SIGN II */
739 0x0B47, /* ORIYA VOWEL SIGN E */
740 0x0B48, /* ORIYA VOWEL SIGN AI */
741 0x0B4B, /* ORIYA VOWEL SIGN O */
742 0x0B4C, /* ORIYA VOWEL SIGN AU */
743 0x0B57, /* ORIYA AU LENGTH MARK */
744 0x0BBE, /* TAMIL VOWEL SIGN AA */
745 0x0BBF, /* TAMIL VOWEL SIGN I */
746 0x0BC1, /* TAMIL VOWEL SIGN U */
747 0x0BC2, /* TAMIL VOWEL SIGN UU */
748 0x0BC6, /* TAMIL VOWEL SIGN E */
749 0x0BC7, /* TAMIL VOWEL SIGN EE */
750 0x0BC8, /* TAMIL VOWEL SIGN AI */
751 0x0BCA, /* TAMIL VOWEL SIGN O */
752 0x0BCB, /* TAMIL VOWEL SIGN OO */
753 0x0BCC, /* TAMIL VOWEL SIGN AU */
754 0x0BD7, /* TAMIL AU LENGTH MARK */
755 0x0C01, /* TELUGU SIGN CANDRABINDU */
756 0x0C02, /* TELUGU SIGN ANUSVARA */
757 0x0C03, /* TELUGU SIGN VISARGA */
758 0x0C41, /* TELUGU VOWEL SIGN U */
759 0x0C42, /* TELUGU VOWEL SIGN UU */
760 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
761 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
762 0x0C82, /* KANNADA SIGN ANUSVARA */
763 0x0C83, /* KANNADA SIGN VISARGA */
764 0x0CBE, /* KANNADA VOWEL SIGN AA */
765 0x0CC0, /* KANNADA VOWEL SIGN II */
766 0x0CC1, /* KANNADA VOWEL SIGN U */
767 0x0CC2, /* KANNADA VOWEL SIGN UU */
768 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
769 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
770 0x0CC7, /* KANNADA VOWEL SIGN EE */
771 0x0CC8, /* KANNADA VOWEL SIGN AI */
772 0x0CCA, /* KANNADA VOWEL SIGN O */
773 0x0CCB, /* KANNADA VOWEL SIGN OO */
774 0x0CD5, /* KANNADA LENGTH MARK */
775 0x0CD6, /* KANNADA AI LENGTH MARK */
776 0x0D02, /* MALAYALAM SIGN ANUSVARA */
777 0x0D03, /* MALAYALAM SIGN VISARGA */
778 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
779 0x0D3F, /* MALAYALAM VOWEL SIGN I */
780 0x0D40, /* MALAYALAM VOWEL SIGN II */
781 0x0D46, /* MALAYALAM VOWEL SIGN E */
782 0x0D47, /* MALAYALAM VOWEL SIGN EE */
783 0x0D48, /* MALAYALAM VOWEL SIGN AI */
784 0x0D4A, /* MALAYALAM VOWEL SIGN O */
785 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
786 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
787 0x0D57, /* MALAYALAM AU LENGTH MARK */
788 0x0D82, /* SINHALA SIGN ANUSVARAYA */
789 0x0D83, /* SINHALA SIGN VISARGAYA */
790 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
791 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
792 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
793 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
794 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
795 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
796 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
797 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
798 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
799 * AELA-PILLA */
800 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
801 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
802 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
803 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
804 0x0F3E, /* TIBETAN SIGN YAR TSHES */
805 0x0F3F, /* TIBETAN SIGN MAR TSHES */
806 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
807 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
808 0x102C, /* MYANMAR VOWEL SIGN AA */
809 0x1031, /* MYANMAR VOWEL SIGN E */
810 0x1038, /* MYANMAR SIGN VISARGA */
811 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
812 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
813 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
814 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
815 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
816 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
817 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
818 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
819 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
820 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
821 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
822 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
823 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
824 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
825 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
826 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
827 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
828 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
829 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
830 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
831 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
832 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
833 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
834 0x17B6, /* KHMER VOWEL SIGN AA */
835 0x17BE, /* KHMER VOWEL SIGN OE */
836 0x17BF, /* KHMER VOWEL SIGN YA */
837 0x17C0, /* KHMER VOWEL SIGN IE */
838 0x17C1, /* KHMER VOWEL SIGN E */
839 0x17C2, /* KHMER VOWEL SIGN AE */
840 0x17C3, /* KHMER VOWEL SIGN AI */
841 0x17C4, /* KHMER VOWEL SIGN OO */
842 0x17C5, /* KHMER VOWEL SIGN AU */
843 0x17C7, /* KHMER SIGN REAHMUK */
844 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
845 0x1923, /* LIMBU VOWEL SIGN EE */
846 0x1924, /* LIMBU VOWEL SIGN AI */
847 0x1925, /* LIMBU VOWEL SIGN OO */
848 0x1926, /* LIMBU VOWEL SIGN AU */
849 0x1929, /* LIMBU SUBJOINED LETTER YA */
850 0x192A, /* LIMBU SUBJOINED LETTER RA */
851 0x192B, /* LIMBU SUBJOINED LETTER WA */
852 0x1930, /* LIMBU SMALL LETTER KA */
853 0x1931, /* LIMBU SMALL LETTER NGA */
854 0x1933, /* LIMBU SMALL LETTER TA */
855 0x1934, /* LIMBU SMALL LETTER NA */
856 0x1935, /* LIMBU SMALL LETTER PA */
857 0x1936, /* LIMBU SMALL LETTER MA */
858 0x1937, /* LIMBU SMALL LETTER RA */
859 0x1938, /* LIMBU SMALL LETTER LA */
860 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
861 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
862 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
863 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
864 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
865 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
866 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
867 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
868 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
869 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
870 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
871 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
872 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
873 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
874 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
875 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
876 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
877 0x19C8, /* NEW TAI LUE TONE MARK-1 */
878 0x19C9, /* NEW TAI LUE TONE MARK-2 */
879 0x1A19, /* BUGINESE VOWEL SIGN E */
880 0x1A1A, /* BUGINESE VOWEL SIGN O */
881 0x1A1B, /* BUGINESE VOWEL SIGN AE */
882 0x1B04, /* BALINESE SIGN BISAH */
883 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
884 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
885 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
886 0x1B3E, /* BALINESE VOWEL SIGN TALING */
887 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
888 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
889 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
890 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
891 0x1B44, /* BALINESE ADEG ADEG */
892 0x1B82, /* SUNDANESE SIGN PANGWISAD */
893 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
894 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
895 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
896 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
897 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
898 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
899 0x1C26, /* LEPCHA VOWEL SIGN AA */
900 0x1C27, /* LEPCHA VOWEL SIGN I */
901 0x1C28, /* LEPCHA VOWEL SIGN O */
902 0x1C29, /* LEPCHA VOWEL SIGN OO */
903 0x1C2A, /* LEPCHA VOWEL SIGN U */
904 0x1C2B, /* LEPCHA VOWEL SIGN UU */
905 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
906 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
907 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
908 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
909 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
910 0xA880, /* SAURASHTRA SIGN ANUSVARA */
911 0xA881, /* SAURASHTRA SIGN VISARGA */
912 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
913 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
914 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
915 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
916 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
917 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
918 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
919 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
920 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
921 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
922 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
923 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
924 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
925 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
926 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
927 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
928 0xA952, /* REJANG CONSONANT SIGN H */
929 0xA953, /* REJANG VIRAMA */
930 0xAA2F, /* CHAM VOWEL SIGN O */
931 0xAA30, /* CHAM VOWEL SIGN AI */
932 0xAA33, /* CHAM CONSONANT SIGN YA */
933 0xAA34, /* CHAM CONSONANT SIGN RA */
934 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
935 };
936 const pg_wchar *StopLow = strange_letter,
937 *StopHigh = strange_letter + lengthof(strange_letter),
938 *StopMiddle;
939 pg_wchar c;
940
941 if (prs->pgwstr)
942 c = *(prs->pgwstr + prs->state->poschar);
943 else
944 c = (pg_wchar) *(prs->wstr + prs->state->poschar);
945
946 while (StopLow < StopHigh)
947 {
948 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
949 if (*StopMiddle == c)
950 return 1;
951 else if (*StopMiddle < c)
952 StopLow = StopMiddle + 1;
953 else
954 StopHigh = StopMiddle;
955 }
956 }
957
958 return 0;
959}
960
/*
 * State/action tables of the parser.
 *
 * There is one table per TParserState.  Each row is
 * {isclass, c, flags, tostate, type, special}; see TParserStateActionItem.
 * Every table ends with a row whose isclass is NULL.
 *
 * NOTE(review): rows are presumably tried from top to bottom, with the NULL
 * row acting as the catch-all default.  The code that walks the tables is not
 * in this part of the file -- confirm against TParserGet before reordering
 * rows.
 */

/*
 * TPS_Base: the state every parser starts in.  Dispatches on the first
 * character: '<', sign characters, '&', '~', '/' and '.' use A_PUSH to enter
 * their respective states; letters and digits go straight to the word and
 * number states; everything else goes to TPS_InSpace.
 */
static const TParserStateActionItem actionTPS_Base[] = {
	{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
	{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
	{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
	{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
	{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
	{p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
	{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
};
980
981
/*
 * TPS_InNumWord: inside a word made of letters and digits.  Alphanumeric and
 * "special" characters keep the state; '@', '/', '.' and '-' use A_PUSH to
 * try an email, file name or hyphenated word; EOF or anything else reports a
 * NUMWORD token.
 */
static const TParserStateActionItem actionTPS_InNumWord[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
	{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
992
/*
 * TPS_InAsciiWord: inside a word made of ASCII letters only.  EOF or an
 * unmatched character reports an ASCIIWORD token.
 *
 * Several tests appear twice ('.', '-' and digit), each time with a
 * different target state.  NOTE(review): presumably the later row is the
 * fallback that is tried when the attempt started by the earlier A_PUSH row
 * is abandoned -- confirm against TParserGet.
 */
static const TParserStateActionItem actionTPS_InAsciiWord[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
	{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
	{p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
1010
/*
 * TPS_InWord: inside a word made of letters, not all of them ASCII.  A digit
 * switches to TPS_InNumWord, '-' uses A_PUSH to try a hyphenated word, and
 * EOF or an unmatched character reports a WORD_T token.
 */
static const TParserStateActionItem actionTPS_InWord[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
	{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
1019
/*
 * TPS_InUnsignedInt: inside a run of digits with no sign.  EOF or an
 * unmatched character reports an UNSIGNEDINT token.
 *
 * '.' appears twice: first to try a host name, then a decimal fraction.
 * NOTE(review): presumably the second row is the fallback when the first
 * attempt is abandoned -- confirm against TParserGet.
 */
static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
	{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
	{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
1036
/*
 * TPS_InSignedIntFirst: entered by A_PUSH from TPS_Base after a '+' or '-'.
 * A digit continues into TPS_InSignedInt with A_CLEAR; anything else
 * (including EOF) takes the A_POP row.
 */
static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
	{p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
	{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
1042
/*
 * TPS_InSignedInt: inside the digits of a signed integer.  '.' and 'e'/'E'
 * use A_PUSH to try a decimal fraction or an exponent; EOF or an unmatched
 * character reports a SIGNEDINT token.
 */
static const TParserStateActionItem actionTPS_InSignedInt[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
};
1051
/*
 * TPS_InSpace: inside a run of non-word characters.  The run continues while
 * the parser's ignore flag is set or the character is not alphanumeric, but
 * it is ended (reporting a SPACE token) at EOF and before '<', '-', '+', '&'
 * and '/'.  The '<' test comes before the ignore test, so a '<' ends the run
 * even while the ignore flag is set.
 */
static const TParserStateActionItem actionTPS_InSpace[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
	{p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
	{p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
	{p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
	{p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
	{p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
	{p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
	{p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
};
1063
/*
 * TPS_InUDecimalFirst: entered by A_PUSH from TPS_InUnsignedInt after a '.'.
 * A digit continues into TPS_InUDecimal with A_CLEAR; anything else
 * (including EOF) takes the A_POP row.
 */
static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
	{p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
	{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
1069
/*
 * TPS_InUDecimal: inside the fraction digits of an unsigned decimal number.
 * A second '.' uses A_PUSH to try a version number, 'e'/'E' to try an
 * exponent; EOF or an unmatched character reports a DECIMAL_T token.
 */
static const TParserStateActionItem actionTPS_InUDecimal[] = {
	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
	{p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
};
1078
1079static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1080 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1081 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1082 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1083};
1084
1085static const TParserStateActionItem actionTPS_InDecimal[] = {
1086 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1087 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1088 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1089 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1090 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1091 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1092};
1093
1094static const TParserStateActionItem actionTPS_InVerVersion[] = {
1095 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1096 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1097 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1098};
1099
1100static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1101 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1103 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1104};
1105
1106
1107static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1108 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1109 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1110 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1111};
1112
1113static const TParserStateActionItem actionTPS_InVersion[] = {
1114 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1115 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1116 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1117 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1118};
1119
1120static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1121 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1124 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1125 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1126};
1127
1128static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1129 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1130 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1131 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132};
1133
1134static const TParserStateActionItem actionTPS_InMantissa[] = {
1135 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1136 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1137 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1138};
1139
1140static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1141 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1142 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1143 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147};
1148
1149static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1150 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1151 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1152 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1153 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1154 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1155 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1156 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1157 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158};
1159
1160static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1161 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1163 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1164 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1165 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1166};
1167
1168static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1169 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1171 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172};
1173
1174static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1175 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1177 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1179};
1180
1181static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1182 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1183 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1184 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1185 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1186};
1187
1188static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1189 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1190};
1191
1192static const TParserStateActionItem actionTPS_InTagFirst[] = {
1193 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1195 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1196 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1197 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1198 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1199 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1200 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201};
1202
1203static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1204 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205 /* <?xml ... */
1206 /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */
1207 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1208 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1209};
1210
1211static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1212 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1213 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1214 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215};
1216
1217static const TParserStateActionItem actionTPS_InTagName[] = {
1218 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 /* <br/> case */
1220 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1221 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1222 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1223 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1224 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1225 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1226 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1227 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1228 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229};
1230
1231static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1232 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235};
1236
1237static const TParserStateActionItem actionTPS_InTag[] = {
1238 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1240 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1241 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1242 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1243 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1244 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1245 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1246 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1247 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1248 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1249 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1250 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1251 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1252 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1253 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1254 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1255 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1256 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1257};
1258
1259static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1260 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1263 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1264};
1265
1266static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1267 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1269 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1270 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1271};
1272
1273static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1274 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1275 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1276};
1277
1278static const TParserStateActionItem actionTPS_InTagEnd[] = {
1279 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1280};
1281
1282static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1283 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1284 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1285 /* <!DOCTYPE ...> */
1286 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1287 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1288 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1289};
1290
1291static const TParserStateActionItem actionTPS_InCommentLast[] = {
1292 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1293 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1294 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295};
1296
1297static const TParserStateActionItem actionTPS_InComment[] = {
1298 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1300 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1301};
1302
1303static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1304 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1306 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307};
1308
1309static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1310 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1312 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1313 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1314};
1315
1316static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1317 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1318};
1319
1320static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1321 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1323 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1324 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1325};
1326
1327static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1328 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1330 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1331 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1332 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1333 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1334 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1335 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1336};
1337
1338static const TParserStateActionItem actionTPS_InHostDomain[] = {
1339 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1340 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1341 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1342 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1343 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1344 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1345 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1346 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1347 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1348 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1349 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1350 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1351};
1352
1353static const TParserStateActionItem actionTPS_InPortFirst[] = {
1354 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1355 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1357};
1358
1359static const TParserStateActionItem actionTPS_InPort[] = {
1360 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1361 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1362 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1363 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1364 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1365};
1366
1367static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1368 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1372};
1373
1374static const TParserStateActionItem actionTPS_InHost[] = {
1375 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1376 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1377 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1378 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1379 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1380 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1381 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1382 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1383};
1384
1385static const TParserStateActionItem actionTPS_InEmail[] = {
1386 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1387 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1388 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389};
1390
1391static const TParserStateActionItem actionTPS_InFileFirst[] = {
1392 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1393 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1394 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1395 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1396 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1397 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1398 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1399};
1400
1401static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1402 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1404 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1405 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1406 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1407 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1408};
1409
1410static const TParserStateActionItem actionTPS_InPathFirst[] = {
1411 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1412 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1413 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1414 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1415 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1418};
1419
1420static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1421 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1422 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1423 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1424 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425};
1426
1427static const TParserStateActionItem actionTPS_InPathSecond[] = {
1428 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1429 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1430 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1431 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1432 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433};
1434
1435static const TParserStateActionItem actionTPS_InFile[] = {
1436 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1437 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1438 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1439 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1440 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1441 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1442 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1443 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1444};
1445
1446static const TParserStateActionItem actionTPS_InFileNext[] = {
1447 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1449 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1450 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1451 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1452};
1453
1454static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1455 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1456 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1457 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1458};
1459
1460static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1461 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1462};
1463
1464static const TParserStateActionItem actionTPS_InURLPath[] = {
1465 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1466 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1467 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1468};
1469
1470static const TParserStateActionItem actionTPS_InFURL[] = {
1471 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1472 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1473 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1474};
1475
1476static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1477 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1479 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480};
1481
1482static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1483 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1485 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486};
1487
1488static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1489 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1490};
1491
1492static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1493 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1494 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1497 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1498};
1499
1500static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1501 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1502 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1503 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1504 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1507 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1508};
1509
1510static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1511 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1512 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1514 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515};
1516
1517static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1518 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1519 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1520 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1521 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1523 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1524};
1525
1526static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1527 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1528 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1530 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1531};
1532
1533static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1534 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1535 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1536 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1538 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1539};
1540
1541static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1542 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1543 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1544 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1545 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1546 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1547};
1548
1549static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1550 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1551 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1552 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1553 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1554 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1555 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1556};
1557
1558static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1559 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1560 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1561 {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1562 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1563};
1564
1565static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1566 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1567 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1568 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1570 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1571};
1572
1573static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1574 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1575 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1576 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1577 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1578 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1580};
1581
1582static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1583 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1584 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1586 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1587};
1588
1589static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1590 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1592 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1593 {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1594 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595};
1596
1597
1598/*
1599 * main table of per-state parser actions
1600 */
1601typedef struct
1602{
1603 const TParserStateActionItem *action; /* the actual state info */
1604 TParserState state; /* only for Assert crosscheck */
1605#ifdef WPARSER_TRACE
1606 const char *state_name; /* only for debug printout */
1607#endif
1608} TParserStateAction;
1609
/*
 * TPARSERSTATEACTION(state) builds one TParserStateAction initializer: the
 * rule table named "action" followed by the state name, the state code
 * itself, and, only in tracing builds, the state name as a string.
 */
#ifndef WPARSER_TRACE
#define TPARSERSTATEACTION(state) \
	{ CppConcat(action,state), state }
#else
#define TPARSERSTATEACTION(state) \
	{ CppConcat(action,state), state, CppAsString(state) }
#endif
1617
/*
 * The entries below must appear in exactly the same order as the members of
 * enum TParserState, because Actions[] is indexed by the state code.
 */
1621
1622static const TParserStateAction Actions[] = {
1623 TPARSERSTATEACTION(TPS_Base),
1624 TPARSERSTATEACTION(TPS_InNumWord),
1625 TPARSERSTATEACTION(TPS_InAsciiWord),
1626 TPARSERSTATEACTION(TPS_InWord),
1627 TPARSERSTATEACTION(TPS_InUnsignedInt),
1628 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1629 TPARSERSTATEACTION(TPS_InSignedInt),
1630 TPARSERSTATEACTION(TPS_InSpace),
1631 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1632 TPARSERSTATEACTION(TPS_InUDecimal),
1633 TPARSERSTATEACTION(TPS_InDecimalFirst),
1634 TPARSERSTATEACTION(TPS_InDecimal),
1635 TPARSERSTATEACTION(TPS_InVerVersion),
1636 TPARSERSTATEACTION(TPS_InSVerVersion),
1637 TPARSERSTATEACTION(TPS_InVersionFirst),
1638 TPARSERSTATEACTION(TPS_InVersion),
1639 TPARSERSTATEACTION(TPS_InMantissaFirst),
1640 TPARSERSTATEACTION(TPS_InMantissaSign),
1641 TPARSERSTATEACTION(TPS_InMantissa),
1642 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1643 TPARSERSTATEACTION(TPS_InXMLEntity),
1644 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1645 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1646 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1647 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1648 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1649 TPARSERSTATEACTION(TPS_InTagFirst),
1650 TPARSERSTATEACTION(TPS_InXMLBegin),
1651 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1652 TPARSERSTATEACTION(TPS_InTagName),
1653 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1654 TPARSERSTATEACTION(TPS_InTag),
1655 TPARSERSTATEACTION(TPS_InTagEscapeK),
1656 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1657 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1658 TPARSERSTATEACTION(TPS_InTagEnd),
1659 TPARSERSTATEACTION(TPS_InCommentFirst),
1660 TPARSERSTATEACTION(TPS_InCommentLast),
1661 TPARSERSTATEACTION(TPS_InComment),
1662 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1663 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1664 TPARSERSTATEACTION(TPS_InCommentEnd),
1665 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1666 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1667 TPARSERSTATEACTION(TPS_InHostDomain),
1668 TPARSERSTATEACTION(TPS_InPortFirst),
1669 TPARSERSTATEACTION(TPS_InPort),
1670 TPARSERSTATEACTION(TPS_InHostFirstAN),
1671 TPARSERSTATEACTION(TPS_InHost),
1672 TPARSERSTATEACTION(TPS_InEmail),
1673 TPARSERSTATEACTION(TPS_InFileFirst),
1674 TPARSERSTATEACTION(TPS_InFileTwiddle),
1675 TPARSERSTATEACTION(TPS_InPathFirst),
1676 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1677 TPARSERSTATEACTION(TPS_InPathSecond),
1678 TPARSERSTATEACTION(TPS_InFile),
1679 TPARSERSTATEACTION(TPS_InFileNext),
1680 TPARSERSTATEACTION(TPS_InURLPathFirst),
1681 TPARSERSTATEACTION(TPS_InURLPathStart),
1682 TPARSERSTATEACTION(TPS_InURLPath),
1683 TPARSERSTATEACTION(TPS_InFURL),
1684 TPARSERSTATEACTION(TPS_InProtocolFirst),
1685 TPARSERSTATEACTION(TPS_InProtocolSecond),
1686 TPARSERSTATEACTION(TPS_InProtocolEnd),
1687 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1688 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1689 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1690 TPARSERSTATEACTION(TPS_InHyphenWord),
1691 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1692 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1693 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1694 TPARSERSTATEACTION(TPS_InParseHyphen),
1695 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1696 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1697 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1698 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1699 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1700};
1701
1702
1703static bool
1704TParserGet(TParser *prs)
1705{
1706 const TParserStateActionItem *item = NULL;
1707
1708 Assert(prs->state);
1709
1710 if (prs->state->posbyte >= prs->lenstr)
1711 return false;
1712
1713 prs->token = prs->str + prs->state->posbyte;
1714 prs->state->pushedAtAction = NULL;
1715
1716 /* look at string */
1717 while (prs->state->posbyte <= prs->lenstr)
1718 {
1719 if (prs->state->posbyte == prs->lenstr)
1720 prs->state->charlen = 0;
1721 else
1722 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1723 pg_mblen(prs->str + prs->state->posbyte);
1724
1725 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1726 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1727 Assert(Actions[prs->state->state].state == prs->state->state);
1728
1729 if (prs->state->pushedAtAction)
1730 {
1731 /* After a POP, pick up at the next test */
1732 item = prs->state->pushedAtAction + 1;
1733 prs->state->pushedAtAction = NULL;
1734 }
1735 else
1736 {
1737 item = Actions[prs->state->state].action;
1738 Assert(item != NULL);
1739 }
1740
1741 /* find action by character class */
1742 while (item->isclass)
1743 {
1744 prs->c = item->c;
1745 if (item->isclass(prs) != 0)
1746 break;
1747 item++;
1748 }
1749
1750#ifdef WPARSER_TRACE
1751 {
1752 TParserPosition *ptr;
1753
1754 fprintf(stderr, "state ");
1755 /* indent according to stack depth */
1756 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1757 fprintf(stderr, " ");
1758 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1759 if (prs->state->posbyte < prs->lenstr)
1760 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1761 else
1762 fprintf(stderr, "at EOF");
1763 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1764 (int) (item - Actions[prs->state->state].action),
1765 (item->flags & A_BINGO) ? " BINGO" : "",
1766 (item->flags & A_POP) ? " POP" : "",
1767 (item->flags & A_PUSH) ? " PUSH" : "",
1768 (item->flags & A_RERUN) ? " RERUN" : "",
1769 (item->flags & A_CLEAR) ? " CLEAR" : "",
1770 (item->flags & A_MERGE) ? " MERGE" : "",
1771 (item->flags & A_CLRALL) ? " CLRALL" : "",
1772 (item->tostate != TPS_Null) ? " tostate " : "",
1773 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1774 (item->type > 0) ? " type " : "",
1775 tok_alias[item->type]);
1776 }
1777#endif
1778
1779 /* call special handler if exists */
1780 if (item->special)
1781 item->special(prs);
1782
1783 /* BINGO, token is found */
1784 if (item->flags & A_BINGO)
1785 {
1786 Assert(item->type > 0);
1787 prs->lenbytetoken = prs->state->lenbytetoken;
1788 prs->lenchartoken = prs->state->lenchartoken;
1789 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1790 prs->type = item->type;
1791 }
1792
1793 /* do various actions by flags */
1794 if (item->flags & A_POP)
1795 { /* pop stored state in stack */
1796 TParserPosition *ptr = prs->state->prev;
1797
1798 pfree(prs->state);
1799 prs->state = ptr;
1800 Assert(prs->state);
1801 }
1802 else if (item->flags & A_PUSH)
1803 { /* push (store) state in stack */
1804 prs->state->pushedAtAction = item; /* remember where we push */
1805 prs->state = newTParserPosition(prs->state);
1806 }
1807 else if (item->flags & A_CLEAR)
1808 { /* clear previous pushed state */
1809 TParserPosition *ptr;
1810
1811 Assert(prs->state->prev);
1812 ptr = prs->state->prev->prev;
1813 pfree(prs->state->prev);
1814 prs->state->prev = ptr;
1815 }
1816 else if (item->flags & A_CLRALL)
1817 { /* clear all previous pushed state */
1818 TParserPosition *ptr;
1819
1820 while (prs->state->prev)
1821 {
1822 ptr = prs->state->prev->prev;
1823 pfree(prs->state->prev);
1824 prs->state->prev = ptr;
1825 }
1826 }
1827 else if (item->flags & A_MERGE)
1828 { /* merge posinfo with current and pushed state */
1829 TParserPosition *ptr = prs->state;
1830
1831 Assert(prs->state->prev);
1832 prs->state = prs->state->prev;
1833
1834 prs->state->posbyte = ptr->posbyte;
1835 prs->state->poschar = ptr->poschar;
1836 prs->state->charlen = ptr->charlen;
1837 prs->state->lenbytetoken = ptr->lenbytetoken;
1838 prs->state->lenchartoken = ptr->lenchartoken;
1839 pfree(ptr);
1840 }
1841
1842 /* set new state if pointed */
1843 if (item->tostate != TPS_Null)
1844 prs->state->state = item->tostate;
1845
1846 /* check for go away */
1847 if ((item->flags & A_BINGO) ||
1848 (prs->state->posbyte >= prs->lenstr &&
1849 (item->flags & A_RERUN) == 0))
1850 break;
1851
1852 /* go to beginning of loop if we should rerun or we just restore state */
1853 if (item->flags & (A_RERUN | A_POP))
1854 continue;
1855
1856 /* move forward */
1857 if (prs->state->charlen)
1858 {
1859 prs->state->posbyte += prs->state->charlen;
1860 prs->state->lenbytetoken += prs->state->charlen;
1861 prs->state->poschar++;
1862 prs->state->lenchartoken++;
1863 }
1864 }
1865
1866 return (item && (item->flags & A_BINGO)) ? true : false;
1867}
1868
1869Datum
1870prsd_lextype(PG_FUNCTION_ARGS)
1871{
1872 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1873 int i;
1874
1875 for (i = 1; i <= LASTNUM; i++)
1876 {
1877 descr[i - 1].lexid = i;
1878 descr[i - 1].alias = pstrdup(tok_alias[i]);
1879 descr[i - 1].descr = pstrdup(lex_descr[i]);
1880 }
1881
1882 descr[LASTNUM].lexid = 0;
1883
1884 PG_RETURN_POINTER(descr);
1885}
1886
1887Datum
1888prsd_start(PG_FUNCTION_ARGS)
1889{
1890 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1891}
1892
1893Datum
1894prsd_nexttoken(PG_FUNCTION_ARGS)
1895{
1896 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1897 char **t = (char **) PG_GETARG_POINTER(1);
1898 int *tlen = (int *) PG_GETARG_POINTER(2);
1899
1900 if (!TParserGet(p))
1901 PG_RETURN_INT32(0);
1902
1903 *t = p->token;
1904 *tlen = p->lenbytetoken;
1905
1906 PG_RETURN_INT32(p->type);
1907}
1908
1909Datum
1910prsd_end(PG_FUNCTION_ARGS)
1911{
1912 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1913
1914 TParserClose(p);
1915 PG_RETURN_VOID();
1916}
1917
/*
 * Token-type classification used by the headline code below.  Each macro
 * takes a token category number (ASCIIWORD ... XMLENTITY) and evaluates
 * its argument more than once, so pass only side-effect-free expressions.
 *
 * NOTE(review): LEAVETOKEN, COMPLEXTOKEN and ENDPUNCTOKEN are not
 * referenced anywhere in the rest of this file as seen from here.
 */
#define LEAVETOKEN(x) ( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )

/* token types listed as ignorable; here only used as part of NOENDTOKEN */
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* tokens that get their "replace" flag set when not highlighting everything */
#define HLIDREPLACE(x) ( (x)==TAG_T )
/* tokens that get their "skip" flag set when not highlighting everything */
#define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that get their "skip" flag set in highlight-all mode (same set as HLIDSKIP) */
#define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that are not counted as words when measuring headline length */
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* tokens that the headline code avoids ending (or starting) a stretched fragment on */
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1928
/*
 * Opaque argument handed to checkcondition_HL through TS_execute: a window
 * onto the headline word array.
 */
typedef struct
{
	HeadlineWordEntry *words;	/* first word of the window */
	int			len;			/* number of words in the window */
} hlCheck;
1934
1935static bool
1936checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1937{
1938 int i;
1939 hlCheck *checkval = (hlCheck *) opaque;
1940
1941 for (i = 0; i < checkval->len; i++)
1942 {
1943 if (checkval->words[i].item == val)
1944 {
1945 /* don't need to find all positions */
1946 if (!data)
1947 return true;
1948
1949 if (!data->pos)
1950 {
1951 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1952 data->allocated = true;
1953 data->npos = 1;
1954 data->pos[0] = checkval->words[i].pos;
1955 }
1956 else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1957 {
1958 data->pos[data->npos++] = checkval->words[i].pos;
1959 }
1960 }
1961 }
1962
1963 if (data && data->npos > 0)
1964 return true;
1965
1966 return false;
1967}
1968
1969
1970static bool
1971hlCover(HeadlineParsedText *prs, TSQuery query, int *p, int *q)
1972{
1973 int i,
1974 j;
1975 QueryItem *item = GETQUERY(query);
1976 int pos = *p;
1977
1978 *q = -1;
1979 *p = INT_MAX;
1980
1981 for (j = 0; j < query->size; j++)
1982 {
1983 if (item->type != QI_VAL)
1984 {
1985 item++;
1986 continue;
1987 }
1988 for (i = pos; i < prs->curwords; i++)
1989 {
1990 if (prs->words[i].item == &item->qoperand)
1991 {
1992 if (i > *q)
1993 *q = i;
1994 break;
1995 }
1996 }
1997 item++;
1998 }
1999
2000 if (*q < 0)
2001 return false;
2002
2003 item = GETQUERY(query);
2004 for (j = 0; j < query->size; j++)
2005 {
2006 if (item->type != QI_VAL)
2007 {
2008 item++;
2009 continue;
2010 }
2011 for (i = *q; i >= pos; i--)
2012 {
2013 if (prs->words[i].item == &item->qoperand)
2014 {
2015 if (i < *p)
2016 *p = i;
2017 break;
2018 }
2019 }
2020 item++;
2021 }
2022
2023 if (*p <= *q)
2024 {
2025 hlCheck ch;
2026
2027 ch.words = &(prs->words[*p]);
2028 ch.len = *q - *p + 1;
2029 if (TS_execute(GETQUERY(query), &ch, TS_EXEC_EMPTY, checkcondition_HL))
2030 return true;
2031 else
2032 {
2033 (*p)++;
2034 return hlCover(prs, query, p, q);
2035 }
2036 }
2037
2038 return false;
2039}
2040
2041static void
2042mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
2043{
2044 int i;
2045
2046 for (i = startpos; i <= endpos; i++)
2047 {
2048 if (prs->words[i].item)
2049 prs->words[i].selected = 1;
2050 if (highlight == 0)
2051 {
2052 if (HLIDREPLACE(prs->words[i].type))
2053 prs->words[i].replace = 1;
2054 else if (HLIDSKIP(prs->words[i].type))
2055 prs->words[i].skip = 1;
2056 }
2057 else
2058 {
2059 if (XMLHLIDSKIP(prs->words[i].type))
2060 prs->words[i].skip = 1;
2061 }
2062
2063 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2064 }
2065}
2066
/*
 * One candidate headline fragment, as collected and ranked by
 * mark_hl_fragments.
 */
typedef struct
{
	int32		startpos;		/* index of the fragment's first word */
	int32		endpos;			/* index of the fragment's last word */
	int32		poslen;			/* count of non-repeated query-item words */
	int32		curlen;			/* count of words that are not NONWORDTOKEN */
	int16		in;				/* set to 1 once chosen for the headline */
	int16		excluded;		/* set to 1 if it overlaps a chosen fragment */
} CoverPos;
2076
2077static void
2078get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2079 int *curlen, int *poslen, int max_words)
2080{
2081 int i;
2082
2083 /*
2084 * Objective: Generate a fragment of words between startpos and endpos
2085 * such that it has at most max_words and both ends has query words. If
2086 * the startpos and endpos are the endpoints of the cover and the cover
2087 * has fewer words than max_words, then this function should just return
2088 * the cover
2089 */
2090 /* first move startpos to an item */
2091 for (i = *startpos; i <= *endpos; i++)
2092 {
2093 *startpos = i;
2094 if (prs->words[i].item && !prs->words[i].repeated)
2095 break;
2096 }
2097 /* cut endpos to have only max_words */
2098 *curlen = 0;
2099 *poslen = 0;
2100 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2101 {
2102 if (!NONWORDTOKEN(prs->words[i].type))
2103 *curlen += 1;
2104 if (prs->words[i].item && !prs->words[i].repeated)
2105 *poslen += 1;
2106 }
2107 /* if the cover was cut then move back endpos to a query item */
2108 if (*endpos > i)
2109 {
2110 *endpos = i;
2111 for (i = *endpos; i >= *startpos; i--)
2112 {
2113 *endpos = i;
2114 if (prs->words[i].item && !prs->words[i].repeated)
2115 break;
2116 if (!NONWORDTOKEN(prs->words[i].type))
2117 *curlen -= 1;
2118 }
2119 }
2120}
2121
2122static void
2123mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
2124 int shortword, int min_words,
2125 int max_words, int max_fragments)
2126{
2127 int32 poslen,
2128 curlen,
2129 i,
2130 f,
2131 num_f = 0;
2132 int32 stretch,
2133 maxstretch,
2134 posmarker;
2135
2136 int32 startpos = 0,
2137 endpos = 0,
2138 p = 0,
2139 q = 0;
2140
2141 int32 numcovers = 0,
2142 maxcovers = 32;
2143
2144 int32 minI,
2145 minwords,
2146 maxitems;
2147 CoverPos *covers;
2148
2149 covers = palloc(maxcovers * sizeof(CoverPos));
2150
2151 /* get all covers */
2152 while (hlCover(prs, query, &p, &q))
2153 {
2154 startpos = p;
2155 endpos = q;
2156
2157 /*
2158 * Break the cover into smaller fragments such that each fragment has
2159 * at most max_words. Also ensure that each end of the fragment is a
2160 * query word. This will allow us to stretch the fragment in either
2161 * direction
2162 */
2163
2164 while (startpos <= endpos)
2165 {
2166 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2167 if (numcovers >= maxcovers)
2168 {
2169 maxcovers *= 2;
2170 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2171 }
2172 covers[numcovers].startpos = startpos;
2173 covers[numcovers].endpos = endpos;
2174 covers[numcovers].curlen = curlen;
2175 covers[numcovers].poslen = poslen;
2176 covers[numcovers].in = 0;
2177 covers[numcovers].excluded = 0;
2178 numcovers++;
2179 startpos = endpos + 1;
2180 endpos = q;
2181 }
2182 /* move p to generate the next cover */
2183 p++;
2184 }
2185
2186 /* choose best covers */
2187 for (f = 0; f < max_fragments; f++)
2188 {
2189 maxitems = 0;
2190 minwords = PG_INT32_MAX;
2191 minI = -1;
2192
2193 /*
2194 * Choose the cover that contains max items. In case of tie choose the
2195 * one with smaller number of words.
2196 */
2197 for (i = 0; i < numcovers; i++)
2198 {
2199 if (!covers[i].in && !covers[i].excluded &&
2200 (maxitems < covers[i].poslen || (maxitems == covers[i].poslen
2201 && minwords > covers[i].curlen)))
2202 {
2203 maxitems = covers[i].poslen;
2204 minwords = covers[i].curlen;
2205 minI = i;
2206 }
2207 }
2208 /* if a cover was found mark it */
2209 if (minI >= 0)
2210 {
2211 covers[minI].in = 1;
2212 /* adjust the size of cover */
2213 startpos = covers[minI].startpos;
2214 endpos = covers[minI].endpos;
2215 curlen = covers[minI].curlen;
2216 /* stretch the cover if cover size is lower than max_words */
2217 if (curlen < max_words)
2218 {
2219 /* divide the stretch on both sides of cover */
2220 maxstretch = (max_words - curlen) / 2;
2221
2222 /*
2223 * first stretch the startpos stop stretching if 1. we hit the
2224 * beginning of document 2. exceed maxstretch 3. we hit an
2225 * already marked fragment
2226 */
2227 stretch = 0;
2228 posmarker = startpos;
2229 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2230 {
2231 if (!NONWORDTOKEN(prs->words[i].type))
2232 {
2233 curlen++;
2234 stretch++;
2235 }
2236 posmarker = i;
2237 }
2238 /* cut back startpos till we find a non short token */
2239 for (i = posmarker; i < startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
2240 {
2241 if (!NONWORDTOKEN(prs->words[i].type))
2242 curlen--;
2243 }
2244 startpos = i;
2245 /* now stretch the endpos as much as possible */
2246 posmarker = endpos;
2247 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2248 {
2249 if (!NONWORDTOKEN(prs->words[i].type))
2250 curlen++;
2251 posmarker = i;
2252 }
2253 /* cut back endpos till we find a non-short token */
2254 for (i = posmarker; i > endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
2255 {
2256 if (!NONWORDTOKEN(prs->words[i].type))
2257 curlen--;
2258 }
2259 endpos = i;
2260 }
2261 covers[minI].startpos = startpos;
2262 covers[minI].endpos = endpos;
2263 covers[minI].curlen = curlen;
2264 /* Mark the chosen fragments (covers) */
2265 mark_fragment(prs, highlight, startpos, endpos);
2266 num_f++;
2267 /* exclude overlapping covers */
2268 for (i = 0; i < numcovers; i++)
2269 {
2270 if (i != minI && ((covers[i].startpos >= covers[minI].startpos && covers[i].startpos <= covers[minI].endpos) || (covers[i].endpos >= covers[minI].startpos && covers[i].endpos <= covers[minI].endpos)))
2271 covers[i].excluded = 1;
2272 }
2273 }
2274 else
2275 break;
2276 }
2277
2278 /* show at least min_words we have not marked anything */
2279 if (num_f <= 0)
2280 {
2281 startpos = endpos = curlen = 0;
2282 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2283 {
2284 if (!NONWORDTOKEN(prs->words[i].type))
2285 curlen++;
2286 endpos = i;
2287 }
2288 mark_fragment(prs, highlight, startpos, endpos);
2289 }
2290 pfree(covers);
2291}
2292
2293static void
2294mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight,
2295 int shortword, int min_words, int max_words)
2296{
2297 int p = 0,
2298 q = 0;
2299 int bestb = -1,
2300 beste = -1;
2301 int bestlen = -1;
2302 int pose = 0,
2303 posb,
2304 poslen,
2305 curlen;
2306
2307 int i;
2308
2309 if (highlight == 0)
2310 {
2311 while (hlCover(prs, query, &p, &q))
2312 {
2313 /* find cover len in words */
2314 curlen = 0;
2315 poslen = 0;
2316 for (i = p; i <= q && curlen < max_words; i++)
2317 {
2318 if (!NONWORDTOKEN(prs->words[i].type))
2319 curlen++;
2320 if (prs->words[i].item && !prs->words[i].repeated)
2321 poslen++;
2322 pose = i;
2323 }
2324
2325 if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
2326 {
2327 /* best already found, so try one more cover */
2328 p++;
2329 continue;
2330 }
2331
2332 posb = p;
2333 if (curlen < max_words)
2334 { /* find good end */
2335 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2336 {
2337 if (i != q)
2338 {
2339 if (!NONWORDTOKEN(prs->words[i].type))
2340 curlen++;
2341 if (prs->words[i].item && !prs->words[i].repeated)
2342 poslen++;
2343 }
2344 pose = i;
2345 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2346 continue;
2347 if (curlen >= min_words)
2348 break;
2349 }
2350 if (curlen < min_words && i >= prs->curwords)
2351 { /* got end of text and our cover is shorter
2352 * than min_words */
2353 for (i = p - 1; i >= 0; i--)
2354 {
2355 if (!NONWORDTOKEN(prs->words[i].type))
2356 curlen++;
2357 if (prs->words[i].item && !prs->words[i].repeated)
2358 poslen++;
2359 if (curlen >= max_words)
2360 break;
2361 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2362 continue;
2363 if (curlen >= min_words)
2364 break;
2365 }
2366 posb = (i >= 0) ? i : 0;
2367 }
2368 }
2369 else
2370 { /* shorter cover :((( */
2371 if (i > q)
2372 i = q;
2373 for (; curlen > min_words; i--)
2374 {
2375 if (!NONWORDTOKEN(prs->words[i].type))
2376 curlen--;
2377 if (prs->words[i].item && !prs->words[i].repeated)
2378 poslen--;
2379 pose = i;
2380 if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
2381 continue;
2382 break;
2383 }
2384 }
2385
2386 if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
2387 (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
2388 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2389 {
2390 bestb = posb;
2391 beste = pose;
2392 bestlen = poslen;
2393 }
2394
2395 p++;
2396 }
2397
2398 if (bestlen < 0)
2399 {
2400 curlen = 0;
2401 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2402 {
2403 if (!NONWORDTOKEN(prs->words[i].type))
2404 curlen++;
2405 pose = i;
2406 }
2407 bestb = 0;
2408 beste = pose;
2409 }
2410 }
2411 else
2412 {
2413 bestb = 0;
2414 beste = prs->curwords - 1;
2415 }
2416
2417 for (i = bestb; i <= beste; i++)
2418 {
2419 if (prs->words[i].item)
2420 prs->words[i].selected = 1;
2421 if (highlight == 0)
2422 {
2423 if (HLIDREPLACE(prs->words[i].type))
2424 prs->words[i].replace = 1;
2425 else if (HLIDSKIP(prs->words[i].type))
2426 prs->words[i].skip = 1;
2427 }
2428 else
2429 {
2430 if (XMLHLIDSKIP(prs->words[i].type))
2431 prs->words[i].skip = 1;
2432 }
2433
2434 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2435 }
2436
2437}
2438
2439Datum
2440prsd_headline(PG_FUNCTION_ARGS)
2441{
2442 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2443 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2444 TSQuery query = PG_GETARG_TSQUERY(2);
2445
2446 /* from opt + start and end tag */
2447 int min_words = 15;
2448 int max_words = 35;
2449 int shortword = 3;
2450 int max_fragments = 0;
2451 int highlight = 0;
2452 ListCell *l;
2453
2454 /* config */
2455 prs->startsel = NULL;
2456 prs->stopsel = NULL;
2457 foreach(l, prsoptions)
2458 {
2459 DefElem *defel = (DefElem *) lfirst(l);
2460 char *val = defGetString(defel);
2461
2462 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2463 max_words = pg_strtoint32(val);
2464 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2465 min_words = pg_strtoint32(val);
2466 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2467 shortword = pg_strtoint32(val);
2468 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2469 max_fragments = pg_strtoint32(val);
2470 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2471 prs->startsel = pstrdup(val);
2472 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2473 prs->stopsel = pstrdup(val);
2474 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2475 prs->fragdelim = pstrdup(val);
2476 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2477 highlight = (pg_strcasecmp(val, "1") == 0 ||
2478 pg_strcasecmp(val, "on") == 0 ||
2479 pg_strcasecmp(val, "true") == 0 ||
2480 pg_strcasecmp(val, "t") == 0 ||
2481 pg_strcasecmp(val, "y") == 0 ||
2482 pg_strcasecmp(val, "yes") == 0);
2483 else
2484 ereport(ERROR,
2485 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2486 errmsg("unrecognized headline parameter: \"%s\"",
2487 defel->defname)));
2488 }
2489
2490 if (highlight == 0)
2491 {
2492 if (min_words >= max_words)
2493 ereport(ERROR,
2494 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2495 errmsg("MinWords should be less than MaxWords")));
2496 if (min_words <= 0)
2497 ereport(ERROR,
2498 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2499 errmsg("MinWords should be positive")));
2500 if (shortword < 0)
2501 ereport(ERROR,
2502 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2503 errmsg("ShortWord should be >= 0")));
2504 if (max_fragments < 0)
2505 ereport(ERROR,
2506 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2507 errmsg("MaxFragments should be >= 0")));
2508 }
2509
2510 if (max_fragments == 0)
2511 /* call the default headline generator */
2512 mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
2513 else
2514 mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
2515
2516 if (!prs->startsel)
2517 prs->startsel = pstrdup("<b>");
2518 if (!prs->stopsel)
2519 prs->stopsel = pstrdup("</b>");
2520 if (!prs->fragdelim)
2521 prs->fragdelim = pstrdup(" ... ");
2522 prs->startsellen = strlen(prs->startsel);
2523 prs->stopsellen = strlen(prs->stopsel);
2524 prs->fragdelimlen = strlen(prs->fragdelim);
2525
2526 PG_RETURN_POINTER(prs);
2527}
2528