1/*-------------------------------------------------------------------------
2 *
3 * spell.c
4 * Normalizing word with ISpell
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 * Ispell dictionary
9 * -----------------
10 *
11 * Rules of dictionaries are defined in two files with .affix and .dict
12 * extensions. They are used by spell checker programs Ispell and Hunspell.
13 *
14 * An .affix file declares morphological rules to get a basic form of words.
15 * The format of an .affix file has different structure for Ispell and Hunspell
16 * dictionaries. The Hunspell format is more complicated. But when an .affix
17 * file is imported and compiled, it is stored in the same structure AffixNode.
18 *
19 * A .dict file stores a list of basic forms of words with references to
20 * affix rules. The format of a .dict file has the same structure for Ispell
21 * and Hunspell dictionaries.
22 *
23 * Compilation of a dictionary
24 * ---------------------------
25 *
 * A compiled dictionary is stored in the IspellDict structure. Compilation of
 * a dictionary is divided into several steps:
28 * - NIImportDictionary() - stores each word of a .dict file in the
29 * temporary Spell field.
30 * - NIImportAffixes() - stores affix rules of an .affix file in the
31 * Affix field (not temporary) if an .affix file has the Ispell format.
32 * -> NIImportOOAffixes() - stores affix rules if an .affix file has the
33 * Hunspell format. The AffixData field is initialized if AF parameter
34 * is defined.
35 * - NISortDictionary() - builds a prefix tree (Trie) from the words list
36 * and stores it in the Dictionary field. The words list is got from the
37 * Spell field. The AffixData field is initialized if AF parameter is not
38 * defined.
39 * - NISortAffixes():
40 * - builds a list of compound affixes from the affix list and stores it
41 * in the CompoundAffix.
42 * - builds prefix trees (Trie) from the affix list for prefixes and suffixes
43 * and stores them in Suffix and Prefix fields.
44 * The affix list is got from the Affix field.
45 *
46 * Memory management
47 * -----------------
48 *
49 * The IspellDict structure has the Spell field which is used only in compile
50 * time. The Spell field stores a words list. It can take a lot of memory.
51 * Therefore when a dictionary is compiled this field is cleared by
52 * NIFinishBuild().
53 *
 * All resources which should be cleared by NIFinishBuild() are allocated
 * using tmpalloc() and tmpalloc0().
56 *
57 * IDENTIFICATION
58 * src/backend/tsearch/spell.c
59 *
60 *-------------------------------------------------------------------------
61 */
62
63#include "postgres.h"
64
65#include "catalog/pg_collation.h"
66#include "tsearch/dicts/spell.h"
67#include "tsearch/ts_locale.h"
68#include "utils/memutils.h"
69
70
/*
 * Initialization requires a lot of memory that's not needed
 * after the initialization is done.  During initialization,
 * CurrentMemoryContext is the long-lived memory context associated
 * with the dictionary cache entry.  We keep the short-lived stuff
 * in the Conf->buildCxt context.
 *
 * Both macros expand to an expression that references a variable named
 * "Conf", so they can only be used where such a variable (with a buildCxt
 * member) is in scope.  tmpalloc0() goes through MemoryContextAllocZero()
 * instead of MemoryContextAlloc().
 */
#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
80
81/*
82 * Prepare for constructing an ISpell dictionary.
83 *
84 * The IspellDict struct is assumed to be zeroed when allocated.
85 */
86void
87NIStartBuild(IspellDict *Conf)
88{
89 /*
90 * The temp context is a child of CurTransactionContext, so that it will
91 * go away automatically on error.
92 */
93 Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
94 "Ispell dictionary init context",
95 ALLOCSET_DEFAULT_SIZES);
96}
97
98/*
99 * Clean up when dictionary construction is complete.
100 */
101void
102NIFinishBuild(IspellDict *Conf)
103{
104 /* Release no-longer-needed temp memory */
105 MemoryContextDelete(Conf->buildCxt);
106 /* Just for cleanliness, zero the now-dangling pointers */
107 Conf->buildCxt = NULL;
108 Conf->Spell = NULL;
109 Conf->firstfree = NULL;
110 Conf->CompoundAffixFlags = NULL;
111}
112
113
114/*
115 * "Compact" palloc: allocate without extra palloc overhead.
116 *
117 * Since we have no need to free the ispell data items individually, there's
118 * not much value in the per-chunk overhead normally consumed by palloc.
119 * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
120 *
121 * We currently pre-zero all data allocated this way, even though some of it
122 * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
123 * to indicate which allocations actually require zeroing.
124 */
125#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
126#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
127
128static void *
129compact_palloc0(IspellDict *Conf, size_t size)
130{
131 void *result;
132
133 /* Should only be called during init */
134 Assert(Conf->buildCxt != NULL);
135
136 /* No point in this for large chunks */
137 if (size > COMPACT_MAX_REQ)
138 return palloc0(size);
139
140 /* Keep everything maxaligned */
141 size = MAXALIGN(size);
142
143 /* Need more space? */
144 if (size > Conf->avail)
145 {
146 Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
147 Conf->avail = COMPACT_ALLOC_CHUNK;
148 }
149
150 result = (void *) Conf->firstfree;
151 Conf->firstfree += size;
152 Conf->avail -= size;
153
154 return result;
155}
156
157#define cpalloc(size) compact_palloc0(Conf, size)
158#define cpalloc0(size) compact_palloc0(Conf, size)
159
160static char *
161cpstrdup(IspellDict *Conf, const char *str)
162{
163 char *res = cpalloc(strlen(str) + 1);
164
165 strcpy(res, str);
166 return res;
167}
168
169
170/*
171 * Apply lowerstr(), producing a temporary result (in the buildCxt).
172 */
173static char *
174lowerstr_ctx(IspellDict *Conf, const char *src)
175{
176 MemoryContext saveCtx;
177 char *dst;
178
179 saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
180 dst = lowerstr(src);
181 MemoryContextSwitchTo(saveCtx);
182
183 return dst;
184}
185
/*
 * Size limits.  NOTE(review): neither constant is referenced in the part of
 * the file reviewed here; confirm their exact meaning at the use sites.
 */
#define MAX_NORM 1024
#define MAXNORMLEN 256

/* Compare s against p over strlen(p) bytes, i.e. test whether s starts with p */
#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
/*
 * Fetch byte N of the L-byte string W as uint8: counted from the start when
 * T is FF_PREFIX, otherwise counted backwards from the last byte.
 */
#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
/* Same as GETWCHAR, applied to the repl/replen members of the affix A */
#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )

/* Shared empty string, stored instead of allocating a copy of "" */
static char *VoidString = "";
194
195static int
196cmpspell(const void *s1, const void *s2)
197{
198 return strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word);
199}
200
201static int
202cmpspellaffix(const void *s1, const void *s2)
203{
204 return strcmp((*(SPELL *const *) s1)->p.flag,
205 (*(SPELL *const *) s2)->p.flag);
206}
207
208static int
209cmpcmdflag(const void *f1, const void *f2)
210{
211 CompoundAffixFlag *fv1 = (CompoundAffixFlag *) f1,
212 *fv2 = (CompoundAffixFlag *) f2;
213
214 Assert(fv1->flagMode == fv2->flagMode);
215
216 if (fv1->flagMode == FM_NUM)
217 {
218 if (fv1->flag.i == fv2->flag.i)
219 return 0;
220
221 return (fv1->flag.i > fv2->flag.i) ? 1 : -1;
222 }
223
224 return strcmp(fv1->flag.s, fv2->flag.s);
225}
226
/*
 * Find the first occurrence of c in str, stepping through the string one
 * (possibly multibyte) character at a time.
 *
 * Returns a pointer to the match, or NULL if c does not occur.
 */
static char *
findchar(char *str, int c)
{
    char       *cur;

    for (cur = str; *cur; cur += pg_mblen(cur))
    {
        if (t_iseq(cur, c))
            return cur;
    }

    return NULL;
}
239
/*
 * Find the first occurrence of either c1 or c2 in str, stepping through the
 * string one (possibly multibyte) character at a time.
 *
 * Returns a pointer to the match, or NULL if neither character occurs.
 */
static char *
findchar2(char *str, int c1, int c2)
{
    char       *cur;

    for (cur = str; *cur; cur += pg_mblen(cur))
    {
        if (t_iseq(cur, c1) || t_iseq(cur, c2))
            return cur;
    }

    return NULL;
}
252
253
/*
 * Backward string compare, for suffix tree operations.
 *
 * Compares s1 and s2 byte by byte starting from their last bytes.  The first
 * differing byte decides the result.  If one string is a tail of the other,
 * the shorter one sorts first.
 *
 * Returns -1, 0 or 1.
 */
static int
strbcmp(const unsigned char *s1, const unsigned char *s2)
{
    int         i1 = strlen((const char *) s1) - 1;
    int         i2 = strlen((const char *) s2) - 1;

    for (; i1 >= 0 && i2 >= 0; i1--, i2--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* One string ran out: whichever has bytes left is the greater one */
    if (i1 != i2)
        return (i1 < i2) ? -1 : 1;

    return 0;
}
277
/*
 * Bounded backward string compare.
 *
 * Like strbcmp(), but looks at no more than "count" trailing bytes.  If
 * "count" bytes were compared without finding a difference (or count is 0)
 * the strings are considered equal; otherwise, when one string ran out
 * first, the shorter one sorts first.
 *
 * Returns -1, 0 or 1.
 */
static int
strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
{
    int         i1 = strlen((const char *) s1) - 1;
    int         i2 = strlen((const char *) s2) - 1;
    int         remaining = count;

    for (; i1 >= 0 && i2 >= 0 && remaining > 0; i1--, i2--, remaining--)
    {
        if (s1[i1] != s2[i2])
            return (s1[i1] < s2[i2]) ? -1 : 1;
    }

    /* The whole requested window matched */
    if (remaining == 0)
        return 0;

    /* A string ran out before the window was exhausted */
    if (i1 != i2)
        return (i1 < i2) ? -1 : 1;

    return 0;
}
303
304/*
305 * Compares affixes.
306 * First compares the type of an affix. Prefixes should go before affixes.
307 * If types are equal then compares replaceable string.
308 */
309static int
310cmpaffix(const void *s1, const void *s2)
311{
312 const AFFIX *a1 = (const AFFIX *) s1;
313 const AFFIX *a2 = (const AFFIX *) s2;
314
315 if (a1->type < a2->type)
316 return -1;
317 if (a1->type > a2->type)
318 return 1;
319 if (a1->type == FF_PREFIX)
320 return strcmp(a1->repl, a2->repl);
321 else
322 return strbcmp((const unsigned char *) a1->repl,
323 (const unsigned char *) a2->repl);
324}
325
326/*
327 * Gets an affix flag from the set of affix flags (sflagset).
328 *
329 * Several flags can be stored in a single string. Flags can be represented by:
330 * - 1 character (FM_CHAR). A character may be Unicode.
331 * - 2 characters (FM_LONG). A character may be Unicode.
332 * - numbers from 1 to 65000 (FM_NUM).
333 *
334 * Depending on the flagMode an affix string can have the following format:
335 * - FM_CHAR: ABCD
336 * Here we have 4 flags: A, B, C and D
337 * - FM_LONG: ABCDE*
338 * Here we have 3 flags: AB, CD and E*
339 * - FM_NUM: 200,205,50
340 * Here we have 3 flags: 200, 205 and 50
341 *
342 * Conf: current dictionary.
343 * sflagset: the set of affix flags. Returns a reference to the start of a next
344 * affix flag.
345 * sflag: returns an affix flag from sflagset.
346 */
347static void
348getNextFlagFromString(IspellDict *Conf, char **sflagset, char *sflag)
349{
350 int32 s;
351 char *next,
352 *sbuf = *sflagset;
353 int maxstep;
354 bool stop = false;
355 bool met_comma = false;
356
357 maxstep = (Conf->flagMode == FM_LONG) ? 2 : 1;
358
359 while (**sflagset)
360 {
361 switch (Conf->flagMode)
362 {
363 case FM_LONG:
364 case FM_CHAR:
365 COPYCHAR(sflag, *sflagset);
366 sflag += pg_mblen(*sflagset);
367
368 /* Go to start of the next flag */
369 *sflagset += pg_mblen(*sflagset);
370
371 /* Check if we get all characters of flag */
372 maxstep--;
373 stop = (maxstep == 0);
374 break;
375 case FM_NUM:
376 s = strtol(*sflagset, &next, 10);
377 if (*sflagset == next || errno == ERANGE)
378 ereport(ERROR,
379 (errcode(ERRCODE_CONFIG_FILE_ERROR),
380 errmsg("invalid affix flag \"%s\"", *sflagset)));
381 if (s < 0 || s > FLAGNUM_MAXSIZE)
382 ereport(ERROR,
383 (errcode(ERRCODE_CONFIG_FILE_ERROR),
384 errmsg("affix flag \"%s\" is out of range",
385 *sflagset)));
386 sflag += sprintf(sflag, "%0d", s);
387
388 /* Go to start of the next flag */
389 *sflagset = next;
390 while (**sflagset)
391 {
392 if (t_isdigit(*sflagset))
393 {
394 if (!met_comma)
395 ereport(ERROR,
396 (errcode(ERRCODE_CONFIG_FILE_ERROR),
397 errmsg("invalid affix flag \"%s\"",
398 *sflagset)));
399 break;
400 }
401 else if (t_iseq(*sflagset, ','))
402 {
403 if (met_comma)
404 ereport(ERROR,
405 (errcode(ERRCODE_CONFIG_FILE_ERROR),
406 errmsg("invalid affix flag \"%s\"",
407 *sflagset)));
408 met_comma = true;
409 }
410 else if (!t_isspace(*sflagset))
411 {
412 ereport(ERROR,
413 (errcode(ERRCODE_CONFIG_FILE_ERROR),
414 errmsg("invalid character in affix flag \"%s\"",
415 *sflagset)));
416 }
417
418 *sflagset += pg_mblen(*sflagset);
419 }
420 stop = true;
421 break;
422 default:
423 elog(ERROR, "unrecognized type of Conf->flagMode: %d",
424 Conf->flagMode);
425 }
426
427 if (stop)
428 break;
429 }
430
431 if (Conf->flagMode == FM_LONG && maxstep > 0)
432 ereport(ERROR,
433 (errcode(ERRCODE_CONFIG_FILE_ERROR),
434 errmsg("invalid affix flag \"%s\" with \"long\" flag value",
435 sbuf)));
436
437 *sflag = '\0';
438}
439
440/*
441 * Checks if the affix set Conf->AffixData[affix] contains affixflag.
442 * Conf->AffixData[affix] does not contain affixflag if this flag is not used
443 * actually by the .dict file.
444 *
445 * Conf: current dictionary.
446 * affix: index of the Conf->AffixData array.
447 * affixflag: the affix flag.
448 *
449 * Returns true if the string Conf->AffixData[affix] contains affixflag,
450 * otherwise returns false.
451 */
452static bool
453IsAffixFlagInUse(IspellDict *Conf, int affix, const char *affixflag)
454{
455 char *flagcur;
456 char flag[BUFSIZ];
457
458 if (*affixflag == 0)
459 return true;
460
461 flagcur = Conf->AffixData[affix];
462
463 while (*flagcur)
464 {
465 getNextFlagFromString(Conf, &flagcur, flag);
466 /* Compare first affix flag in flagcur with affixflag */
467 if (strcmp(flag, affixflag) == 0)
468 return true;
469 }
470
471 /* Could not find affixflag */
472 return false;
473}
474
475/*
476 * Adds the new word into the temporary array Spell.
477 *
478 * Conf: current dictionary.
479 * word: new word.
480 * flag: set of affix flags. Single flag can be get by getNextFlagFromString().
481 */
482static void
483NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
484{
485 if (Conf->nspell >= Conf->mspell)
486 {
487 if (Conf->mspell)
488 {
489 Conf->mspell *= 2;
490 Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
491 }
492 else
493 {
494 Conf->mspell = 1024 * 20;
495 Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
496 }
497 }
498 Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
499 strcpy(Conf->Spell[Conf->nspell]->word, word);
500 Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
501 ? cpstrdup(Conf, flag) : VoidString;
502 Conf->nspell++;
503}
504
505/*
506 * Imports dictionary into the temporary array Spell.
507 *
508 * Note caller must already have applied get_tsearch_config_filename.
509 *
510 * Conf: current dictionary.
511 * filename: path to the .dict file.
512 */
513void
514NIImportDictionary(IspellDict *Conf, const char *filename)
515{
516 tsearch_readline_state trst;
517 char *line;
518
519 if (!tsearch_readline_begin(&trst, filename))
520 ereport(ERROR,
521 (errcode(ERRCODE_CONFIG_FILE_ERROR),
522 errmsg("could not open dictionary file \"%s\": %m",
523 filename)));
524
525 while ((line = tsearch_readline(&trst)) != NULL)
526 {
527 char *s,
528 *pstr;
529
530 /* Set of affix flags */
531 const char *flag;
532
533 /* Extract flag from the line */
534 flag = NULL;
535 if ((s = findchar(line, '/')))
536 {
537 *s++ = '\0';
538 flag = s;
539 while (*s)
540 {
541 /* we allow only single encoded flags for faster works */
542 if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
543 s++;
544 else
545 {
546 *s = '\0';
547 break;
548 }
549 }
550 }
551 else
552 flag = "";
553
554 /* Remove trailing spaces */
555 s = line;
556 while (*s)
557 {
558 if (t_isspace(s))
559 {
560 *s = '\0';
561 break;
562 }
563 s += pg_mblen(s);
564 }
565 pstr = lowerstr_ctx(Conf, line);
566
567 NIAddSpell(Conf, pstr, flag);
568 pfree(pstr);
569
570 pfree(line);
571 }
572 tsearch_readline_end(&trst);
573}
574
575/*
576 * Searches a basic form of word in the prefix tree. This word was generated
577 * using an affix rule. This rule may not be presented in an affix set of
578 * a basic form of word.
579 *
580 * For example, we have the entry in the .dict file:
581 * meter/GMD
582 *
583 * The affix rule with the flag S:
584 * SFX S y ies [^aeiou]y
585 * is not presented here.
586 *
587 * The affix rule with the flag M:
588 * SFX M 0 's .
589 * is presented here.
590 *
591 * Conf: current dictionary.
592 * word: basic form of word.
593 * affixflag: affix flag, by which a basic form of word was generated.
594 * flag: compound flag used to compare with StopMiddle->compoundflag.
595 *
596 * Returns 1 if the word was found in the prefix tree, else returns 0.
597 */
598static int
599FindWord(IspellDict *Conf, const char *word, const char *affixflag, int flag)
600{
601 SPNode *node = Conf->Dictionary;
602 SPNodeData *StopLow,
603 *StopHigh,
604 *StopMiddle;
605 const uint8 *ptr = (const uint8 *) word;
606
607 flag &= FF_COMPOUNDFLAGMASK;
608
609 while (node && *ptr)
610 {
611 StopLow = node->data;
612 StopHigh = node->data + node->length;
613 while (StopLow < StopHigh)
614 {
615 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
616 if (StopMiddle->val == *ptr)
617 {
618 if (*(ptr + 1) == '\0' && StopMiddle->isword)
619 {
620 if (flag == 0)
621 {
622 /*
623 * The word can be formed only with another word. And
624 * in the flag parameter there is not a sign that we
625 * search compound words.
626 */
627 if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
628 return 0;
629 }
630 else if ((flag & StopMiddle->compoundflag) == 0)
631 return 0;
632
633 /*
634 * Check if this affix rule is presented in the affix set
635 * with index StopMiddle->affix.
636 */
637 if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
638 return 1;
639 }
640 node = StopMiddle->node;
641 ptr++;
642 break;
643 }
644 else if (StopMiddle->val < *ptr)
645 StopLow = StopMiddle + 1;
646 else
647 StopHigh = StopMiddle;
648 }
649 if (StopLow >= StopHigh)
650 break;
651 }
652 return 0;
653}
654
655/*
656 * Adds a new affix rule to the Affix field.
657 *
658 * Conf: current dictionary.
659 * flag: affix flag ('\' in the below example).
660 * flagflags: set of flags from the flagval field for this affix rule. This set
661 * is listed after '/' character in the added string (repl).
662 *
663 * For example L flag in the hunspell_sample.affix:
664 * SFX \ 0 Y/L [^Y]
665 *
666 * mask: condition for search ('[^Y]' in the above example).
667 * find: stripping characters from beginning (at prefix) or end (at suffix)
668 * of the word ('0' in the above example, 0 means that there is not
669 * stripping character).
670 * repl: adding string after stripping ('Y' in the above example).
671 * type: FF_SUFFIX or FF_PREFIX.
672 */
673static void
674NIAddAffix(IspellDict *Conf, const char *flag, char flagflags, const char *mask,
675 const char *find, const char *repl, int type)
676{
677 AFFIX *Affix;
678
679 if (Conf->naffixes >= Conf->maffixes)
680 {
681 if (Conf->maffixes)
682 {
683 Conf->maffixes *= 2;
684 Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
685 }
686 else
687 {
688 Conf->maffixes = 16;
689 Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
690 }
691 }
692
693 Affix = Conf->Affix + Conf->naffixes;
694
695 /* This affix rule can be applied for words with any ending */
696 if (strcmp(mask, ".") == 0 || *mask == '\0')
697 {
698 Affix->issimple = 1;
699 Affix->isregis = 0;
700 }
701 /* This affix rule will use regis to search word ending */
702 else if (RS_isRegis(mask))
703 {
704 Affix->issimple = 0;
705 Affix->isregis = 1;
706 RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
707 *mask ? mask : VoidString);
708 }
709 /* This affix rule will use regex_t to search word ending */
710 else
711 {
712 int masklen;
713 int wmasklen;
714 int err;
715 pg_wchar *wmask;
716 char *tmask;
717
718 Affix->issimple = 0;
719 Affix->isregis = 0;
720 tmask = (char *) tmpalloc(strlen(mask) + 3);
721 if (type == FF_SUFFIX)
722 sprintf(tmask, "%s$", mask);
723 else
724 sprintf(tmask, "^%s", mask);
725
726 masklen = strlen(tmask);
727 wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
728 wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
729
730 err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
731 REG_ADVANCED | REG_NOSUB,
732 DEFAULT_COLLATION_OID);
733 if (err)
734 {
735 char errstr[100];
736
737 pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
738 ereport(ERROR,
739 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
740 errmsg("invalid regular expression: %s", errstr)));
741 }
742 }
743
744 Affix->flagflags = flagflags;
745 if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
746 {
747 if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
748 Affix->flagflags |= FF_COMPOUNDFLAG;
749 }
750 Affix->flag = cpstrdup(Conf, flag);
751 Affix->type = type;
752
753 Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
754 if ((Affix->replen = strlen(repl)) > 0)
755 Affix->repl = cpstrdup(Conf, repl);
756 else
757 Affix->repl = VoidString;
758 Conf->naffixes++;
759}
760
/*
 * Parsing states for parse_affentry() and friends.
 *
 * get_nextfield() uses only the first two, as "waiting for a field" and
 * "inside a field".  parse_ooaffentry() uses the WAIT_* values to track
 * which field it expects next.
 */
/* skipping whitespace before the mask (or before a field) */
#define PAE_WAIT_MASK 0
/* collecting the mask (or a field) */
#define PAE_INMASK 1
/* waiting for the find string (or, in parse_affentry(), the replacement) */
#define PAE_WAIT_FIND 2
/* collecting the find string, up to the ',' */
#define PAE_INFIND 3
/* waiting for the replacement string */
#define PAE_WAIT_REPL 4
/* collecting the replacement string */
#define PAE_INREPL 5
/* parse_ooaffentry(): expecting the type field */
#define PAE_WAIT_TYPE 6
/* parse_ooaffentry(): expecting the flag field */
#define PAE_WAIT_FLAG 7
770
/*
 * Extracts the next whitespace-delimited field of an .affix file line.
 *
 * *str is the input pointer.  Leading whitespace is skipped.  When the field
 * is ended by whitespace, *str is left pointing at that whitespace; when it
 * is ended by the end of the line, *str points at the terminating NUL.
 * next is the output buffer; it must hold BUFSIZ bytes.  Characters that no
 * longer fit are dropped, so an over-long field is truncated.
 *
 * A '#' met before the field starts begins a comment: false is returned and
 * "next" is not terminated.  A '#' inside a field is an ordinary character.
 *
 * Returns true if a non-empty field was found, false if not.
 */
static bool
get_nextfield(char **str, char *next)
{
    bool        in_field = false;
    int         room = BUFSIZ;

    for (; **str; *str += pg_mblen(*str))
    {
        int         clen;

        if (t_isspace(*str))
        {
            /* Whitespace ends a field that has started */
            if (in_field)
            {
                *next = '\0';
                return true;
            }
            /* ... and is skipped before the field starts */
            continue;
        }

        if (!in_field)
        {
            /* The rest of the line is a comment */
            if (t_iseq(*str, '#'))
                return false;
            in_field = true;
        }

        /* Copy the character if it fits, always leaving room for the NUL */
        clen = pg_mblen(*str);
        if (clen < room)
        {
            COPYCHAR(next, *str);
            next += clen;
            room -= clen;
        }
    }

    *next = '\0';

    return in_field;            /* OK if we got a nonempty field */
}
832
833/*
834 * Parses entry of an .affix file of MySpell or Hunspell format.
835 *
836 * An .affix file entry has the following format:
837 * - header
838 * <type> <flag> <cross_flag> <flag_count>
839 * - fields after header:
840 * <type> <flag> <find> <replace> <mask>
841 *
842 * str is the input line
843 * field values are returned to type etc, which must be buffers of size BUFSIZ.
844 *
845 * Returns number of fields found; any omitted fields are set to empty strings.
846 */
847static int
848parse_ooaffentry(char *str, char *type, char *flag, char *find,
849 char *repl, char *mask)
850{
851 int state = PAE_WAIT_TYPE;
852 int fields_read = 0;
853 bool valid = false;
854
855 *type = *flag = *find = *repl = *mask = '\0';
856
857 while (*str)
858 {
859 switch (state)
860 {
861 case PAE_WAIT_TYPE:
862 valid = get_nextfield(&str, type);
863 state = PAE_WAIT_FLAG;
864 break;
865 case PAE_WAIT_FLAG:
866 valid = get_nextfield(&str, flag);
867 state = PAE_WAIT_FIND;
868 break;
869 case PAE_WAIT_FIND:
870 valid = get_nextfield(&str, find);
871 state = PAE_WAIT_REPL;
872 break;
873 case PAE_WAIT_REPL:
874 valid = get_nextfield(&str, repl);
875 state = PAE_WAIT_MASK;
876 break;
877 case PAE_WAIT_MASK:
878 valid = get_nextfield(&str, mask);
879 state = -1; /* force loop exit */
880 break;
881 default:
882 elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
883 state);
884 break;
885 }
886 if (valid)
887 fields_read++;
888 else
889 break; /* early EOL */
890 if (state < 0)
891 break; /* got all fields */
892 }
893
894 return fields_read;
895}
896
897/*
898 * Parses entry of an .affix file of Ispell format
899 *
900 * An .affix file entry has the following format:
901 * <mask> > [-<find>,]<replace>
902 */
903static bool
904parse_affentry(char *str, char *mask, char *find, char *repl)
905{
906 int state = PAE_WAIT_MASK;
907 char *pmask = mask,
908 *pfind = find,
909 *prepl = repl;
910
911 *mask = *find = *repl = '\0';
912
913 while (*str)
914 {
915 if (state == PAE_WAIT_MASK)
916 {
917 if (t_iseq(str, '#'))
918 return false;
919 else if (!t_isspace(str))
920 {
921 COPYCHAR(pmask, str);
922 pmask += pg_mblen(str);
923 state = PAE_INMASK;
924 }
925 }
926 else if (state == PAE_INMASK)
927 {
928 if (t_iseq(str, '>'))
929 {
930 *pmask = '\0';
931 state = PAE_WAIT_FIND;
932 }
933 else if (!t_isspace(str))
934 {
935 COPYCHAR(pmask, str);
936 pmask += pg_mblen(str);
937 }
938 }
939 else if (state == PAE_WAIT_FIND)
940 {
941 if (t_iseq(str, '-'))
942 {
943 state = PAE_INFIND;
944 }
945 else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
946 {
947 COPYCHAR(prepl, str);
948 prepl += pg_mblen(str);
949 state = PAE_INREPL;
950 }
951 else if (!t_isspace(str))
952 ereport(ERROR,
953 (errcode(ERRCODE_CONFIG_FILE_ERROR),
954 errmsg("syntax error")));
955 }
956 else if (state == PAE_INFIND)
957 {
958 if (t_iseq(str, ','))
959 {
960 *pfind = '\0';
961 state = PAE_WAIT_REPL;
962 }
963 else if (t_isalpha(str))
964 {
965 COPYCHAR(pfind, str);
966 pfind += pg_mblen(str);
967 }
968 else if (!t_isspace(str))
969 ereport(ERROR,
970 (errcode(ERRCODE_CONFIG_FILE_ERROR),
971 errmsg("syntax error")));
972 }
973 else if (state == PAE_WAIT_REPL)
974 {
975 if (t_iseq(str, '-'))
976 {
977 break; /* void repl */
978 }
979 else if (t_isalpha(str))
980 {
981 COPYCHAR(prepl, str);
982 prepl += pg_mblen(str);
983 state = PAE_INREPL;
984 }
985 else if (!t_isspace(str))
986 ereport(ERROR,
987 (errcode(ERRCODE_CONFIG_FILE_ERROR),
988 errmsg("syntax error")));
989 }
990 else if (state == PAE_INREPL)
991 {
992 if (t_iseq(str, '#'))
993 {
994 *prepl = '\0';
995 break;
996 }
997 else if (t_isalpha(str))
998 {
999 COPYCHAR(prepl, str);
1000 prepl += pg_mblen(str);
1001 }
1002 else if (!t_isspace(str))
1003 ereport(ERROR,
1004 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1005 errmsg("syntax error")));
1006 }
1007 else
1008 elog(ERROR, "unrecognized state in parse_affentry: %d", state);
1009
1010 str += pg_mblen(str);
1011 }
1012
1013 *pmask = *pfind = *prepl = '\0';
1014
1015 return (*mask && (*find || *repl));
1016}
1017
1018/*
1019 * Sets a Hunspell options depending on flag type.
1020 */
1021static void
1022setCompoundAffixFlagValue(IspellDict *Conf, CompoundAffixFlag *entry,
1023 char *s, uint32 val)
1024{
1025 if (Conf->flagMode == FM_NUM)
1026 {
1027 char *next;
1028 int i;
1029
1030 i = strtol(s, &next, 10);
1031 if (s == next || errno == ERANGE)
1032 ereport(ERROR,
1033 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1034 errmsg("invalid affix flag \"%s\"", s)));
1035 if (i < 0 || i > FLAGNUM_MAXSIZE)
1036 ereport(ERROR,
1037 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1038 errmsg("affix flag \"%s\" is out of range", s)));
1039
1040 entry->flag.i = i;
1041 }
1042 else
1043 entry->flag.s = cpstrdup(Conf, s);
1044
1045 entry->flagMode = Conf->flagMode;
1046 entry->value = val;
1047}
1048
1049/*
1050 * Sets up a correspondence for the affix parameter with the affix flag.
1051 *
1052 * Conf: current dictionary.
1053 * s: affix flag in string.
1054 * val: affix parameter.
1055 */
1056static void
1057addCompoundAffixFlagValue(IspellDict *Conf, char *s, uint32 val)
1058{
1059 CompoundAffixFlag *newValue;
1060 char sbuf[BUFSIZ];
1061 char *sflag;
1062 int clen;
1063
1064 while (*s && t_isspace(s))
1065 s += pg_mblen(s);
1066
1067 if (!*s)
1068 ereport(ERROR,
1069 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1070 errmsg("syntax error")));
1071
1072 /* Get flag without \n */
1073 sflag = sbuf;
1074 while (*s && !t_isspace(s) && *s != '\n')
1075 {
1076 clen = pg_mblen(s);
1077 COPYCHAR(sflag, s);
1078 sflag += clen;
1079 s += clen;
1080 }
1081 *sflag = '\0';
1082
1083 /* Resize array or allocate memory for array CompoundAffixFlag */
1084 if (Conf->nCompoundAffixFlag >= Conf->mCompoundAffixFlag)
1085 {
1086 if (Conf->mCompoundAffixFlag)
1087 {
1088 Conf->mCompoundAffixFlag *= 2;
1089 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1090 repalloc((void *) Conf->CompoundAffixFlags,
1091 Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1092 }
1093 else
1094 {
1095 Conf->mCompoundAffixFlag = 10;
1096 Conf->CompoundAffixFlags = (CompoundAffixFlag *)
1097 tmpalloc(Conf->mCompoundAffixFlag * sizeof(CompoundAffixFlag));
1098 }
1099 }
1100
1101 newValue = Conf->CompoundAffixFlags + Conf->nCompoundAffixFlag;
1102
1103 setCompoundAffixFlagValue(Conf, newValue, sbuf, val);
1104
1105 Conf->usecompound = true;
1106 Conf->nCompoundAffixFlag++;
1107}
1108
1109/*
1110 * Returns a set of affix parameters which correspondence to the set of affix
1111 * flags s.
1112 */
1113static int
1114getCompoundAffixFlagValue(IspellDict *Conf, char *s)
1115{
1116 uint32 flag = 0;
1117 CompoundAffixFlag *found,
1118 key;
1119 char sflag[BUFSIZ];
1120 char *flagcur;
1121
1122 if (Conf->nCompoundAffixFlag == 0)
1123 return 0;
1124
1125 flagcur = s;
1126 while (*flagcur)
1127 {
1128 getNextFlagFromString(Conf, &flagcur, sflag);
1129 setCompoundAffixFlagValue(Conf, &key, sflag, 0);
1130
1131 found = (CompoundAffixFlag *)
1132 bsearch(&key, (void *) Conf->CompoundAffixFlags,
1133 Conf->nCompoundAffixFlag, sizeof(CompoundAffixFlag),
1134 cmpcmdflag);
1135 if (found != NULL)
1136 flag |= found->value;
1137 }
1138
1139 return flag;
1140}
1141
1142/*
1143 * Returns a flag set using the s parameter.
1144 *
1145 * If Conf->useFlagAliases is true then the s parameter is index of the
1146 * Conf->AffixData array and function returns its entry.
1147 * Else function returns the s parameter.
1148 */
1149static char *
1150getAffixFlagSet(IspellDict *Conf, char *s)
1151{
1152 if (Conf->useFlagAliases && *s != '\0')
1153 {
1154 int curaffix;
1155 char *end;
1156
1157 curaffix = strtol(s, &end, 10);
1158 if (s == end || errno == ERANGE)
1159 ereport(ERROR,
1160 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1161 errmsg("invalid affix alias \"%s\"", s)));
1162
1163 if (curaffix > 0 && curaffix <= Conf->nAffixData)
1164
1165 /*
1166 * Do not subtract 1 from curaffix because empty string was added
1167 * in NIImportOOAffixes
1168 */
1169 return Conf->AffixData[curaffix];
1170 else
1171 return VoidString;
1172 }
1173 else
1174 return s;
1175}
1176
1177/*
1178 * Import an affix file that follows MySpell or Hunspell format.
1179 *
1180 * Conf: current dictionary.
1181 * filename: path to the .affix file.
1182 */
1183static void
1184NIImportOOAffixes(IspellDict *Conf, const char *filename)
1185{
1186 char type[BUFSIZ],
1187 *ptype = NULL;
1188 char sflag[BUFSIZ];
1189 char mask[BUFSIZ],
1190 *pmask;
1191 char find[BUFSIZ],
1192 *pfind;
1193 char repl[BUFSIZ],
1194 *prepl;
1195 bool isSuffix = false;
1196 int naffix = 0,
1197 curaffix = 0;
1198 int sflaglen = 0;
1199 char flagflags = 0;
1200 tsearch_readline_state trst;
1201 char *recoded;
1202
1203 /* read file to find any flag */
1204 Conf->usecompound = false;
1205 Conf->useFlagAliases = false;
1206 Conf->flagMode = FM_CHAR;
1207
1208 if (!tsearch_readline_begin(&trst, filename))
1209 ereport(ERROR,
1210 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1211 errmsg("could not open affix file \"%s\": %m",
1212 filename)));
1213
1214 while ((recoded = tsearch_readline(&trst)) != NULL)
1215 {
1216 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1217 {
1218 pfree(recoded);
1219 continue;
1220 }
1221
1222 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
1223 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
1224 FF_COMPOUNDFLAG);
1225 else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
1226 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
1227 FF_COMPOUNDBEGIN);
1228 else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
1229 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
1230 FF_COMPOUNDLAST);
1231 /* COMPOUNDLAST and COMPOUNDEND are synonyms */
1232 else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
1233 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
1234 FF_COMPOUNDLAST);
1235 else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
1236 addCompoundAffixFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
1237 FF_COMPOUNDMIDDLE);
1238 else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
1239 addCompoundAffixFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
1240 FF_COMPOUNDONLY);
1241 else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
1242 addCompoundAffixFlagValue(Conf,
1243 recoded + strlen("COMPOUNDPERMITFLAG"),
1244 FF_COMPOUNDPERMITFLAG);
1245 else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
1246 addCompoundAffixFlagValue(Conf,
1247 recoded + strlen("COMPOUNDFORBIDFLAG"),
1248 FF_COMPOUNDFORBIDFLAG);
1249 else if (STRNCMP(recoded, "FLAG") == 0)
1250 {
1251 char *s = recoded + strlen("FLAG");
1252
1253 while (*s && t_isspace(s))
1254 s += pg_mblen(s);
1255
1256 if (*s)
1257 {
1258 if (STRNCMP(s, "long") == 0)
1259 Conf->flagMode = FM_LONG;
1260 else if (STRNCMP(s, "num") == 0)
1261 Conf->flagMode = FM_NUM;
1262 else if (STRNCMP(s, "default") != 0)
1263 ereport(ERROR,
1264 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1265 errmsg("Ispell dictionary supports only "
1266 "\"default\", \"long\", "
1267 "and \"num\" flag values")));
1268 }
1269 }
1270
1271 pfree(recoded);
1272 }
1273 tsearch_readline_end(&trst);
1274
1275 if (Conf->nCompoundAffixFlag > 1)
1276 qsort((void *) Conf->CompoundAffixFlags, Conf->nCompoundAffixFlag,
1277 sizeof(CompoundAffixFlag), cmpcmdflag);
1278
1279 if (!tsearch_readline_begin(&trst, filename))
1280 ereport(ERROR,
1281 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1282 errmsg("could not open affix file \"%s\": %m",
1283 filename)));
1284
1285 while ((recoded = tsearch_readline(&trst)) != NULL)
1286 {
1287 int fields_read;
1288
1289 if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
1290 goto nextline;
1291
1292 fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
1293
1294 if (ptype)
1295 pfree(ptype);
1296 ptype = lowerstr_ctx(Conf, type);
1297
1298 /* First try to parse AF parameter (alias compression) */
1299 if (STRNCMP(ptype, "af") == 0)
1300 {
1301 /* First line is the number of aliases */
1302 if (!Conf->useFlagAliases)
1303 {
1304 Conf->useFlagAliases = true;
1305 naffix = atoi(sflag);
1306 if (naffix <= 0)
1307 ereport(ERROR,
1308 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1309 errmsg("invalid number of flag vector aliases")));
1310
1311 /* Also reserve place for empty flag set */
1312 naffix++;
1313
1314 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1315 Conf->lenAffixData = Conf->nAffixData = naffix;
1316
1317 /* Add empty flag set into AffixData */
1318 Conf->AffixData[curaffix] = VoidString;
1319 curaffix++;
1320 }
1321 /* Other lines are aliases */
1322 else
1323 {
1324 if (curaffix < naffix)
1325 {
1326 Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
1327 curaffix++;
1328 }
1329 else
1330 ereport(ERROR,
1331 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1332 errmsg("number of aliases exceeds specified number %d",
1333 naffix - 1)));
1334 }
1335 goto nextline;
1336 }
1337 /* Else try to parse prefixes and suffixes */
1338 if (fields_read < 4 ||
1339 (STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
1340 goto nextline;
1341
1342 sflaglen = strlen(sflag);
1343 if (sflaglen == 0
1344 || (sflaglen > 1 && Conf->flagMode == FM_CHAR)
1345 || (sflaglen > 2 && Conf->flagMode == FM_LONG))
1346 goto nextline;
1347
1348 /*--------
1349 * Affix header. For example:
1350 * SFX \ N 1
1351 *--------
1352 */
1353 if (fields_read == 4)
1354 {
1355 isSuffix = (STRNCMP(ptype, "sfx") == 0);
1356 if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
1357 flagflags = FF_CROSSPRODUCT;
1358 else
1359 flagflags = 0;
1360 }
1361 /*--------
1362 * Affix fields. For example:
1363 * SFX \ 0 Y/L [^Y]
1364 *--------
1365 */
1366 else
1367 {
1368 char *ptr;
1369 int aflg = 0;
1370
1371 /* Get flags after '/' (flags are case sensitive) */
1372 if ((ptr = strchr(repl, '/')) != NULL)
1373 aflg |= getCompoundAffixFlagValue(Conf,
1374 getAffixFlagSet(Conf,
1375 ptr + 1));
1376 /* Get lowercased version of string before '/' */
1377 prepl = lowerstr_ctx(Conf, repl);
1378 if ((ptr = strchr(prepl, '/')) != NULL)
1379 *ptr = '\0';
1380 pfind = lowerstr_ctx(Conf, find);
1381 pmask = lowerstr_ctx(Conf, mask);
1382 if (t_iseq(find, '0'))
1383 *pfind = '\0';
1384 if (t_iseq(repl, '0'))
1385 *prepl = '\0';
1386
1387 NIAddAffix(Conf, sflag, flagflags | aflg, pmask, pfind, prepl,
1388 isSuffix ? FF_SUFFIX : FF_PREFIX);
1389 pfree(prepl);
1390 pfree(pfind);
1391 pfree(pmask);
1392 }
1393
1394nextline:
1395 pfree(recoded);
1396 }
1397
1398 tsearch_readline_end(&trst);
1399 if (ptype)
1400 pfree(ptype);
1401}
1402
1403/*
1404 * import affixes
1405 *
1406 * Note caller must already have applied get_tsearch_config_filename
1407 *
1408 * This function is responsible for parsing ispell ("old format") affix files.
1409 * If we realize that the file contains new-format commands, we pass off the
1410 * work to NIImportOOAffixes(), which will re-read the whole file.
1411 */
1412void
1413NIImportAffixes(IspellDict *Conf, const char *filename)
1414{
1415 char *pstr = NULL;
1416 char flag[BUFSIZ];
1417 char mask[BUFSIZ];
1418 char find[BUFSIZ];
1419 char repl[BUFSIZ];
1420 char *s;
1421 bool suffixes = false;
1422 bool prefixes = false;
1423 char flagflags = 0;
1424 tsearch_readline_state trst;
1425 bool oldformat = false;
1426 char *recoded = NULL;
1427
1428 if (!tsearch_readline_begin(&trst, filename))
1429 ereport(ERROR,
1430 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1431 errmsg("could not open affix file \"%s\": %m",
1432 filename)));
1433
1434 Conf->usecompound = false;
1435 Conf->useFlagAliases = false;
1436 Conf->flagMode = FM_CHAR;
1437
1438 while ((recoded = tsearch_readline(&trst)) != NULL)
1439 {
1440 pstr = lowerstr(recoded);
1441
1442 /* Skip comments and empty lines */
1443 if (*pstr == '#' || *pstr == '\n')
1444 goto nextline;
1445
1446 if (STRNCMP(pstr, "compoundwords") == 0)
1447 {
1448 /* Find case-insensitive L flag in non-lowercased string */
1449 s = findchar2(recoded, 'l', 'L');
1450 if (s)
1451 {
1452 while (*s && !t_isspace(s))
1453 s += pg_mblen(s);
1454 while (*s && t_isspace(s))
1455 s += pg_mblen(s);
1456
1457 if (*s && pg_mblen(s) == 1)
1458 {
1459 addCompoundAffixFlagValue(Conf, s, FF_COMPOUNDFLAG);
1460 Conf->usecompound = true;
1461 }
1462 oldformat = true;
1463 goto nextline;
1464 }
1465 }
1466 if (STRNCMP(pstr, "suffixes") == 0)
1467 {
1468 suffixes = true;
1469 prefixes = false;
1470 oldformat = true;
1471 goto nextline;
1472 }
1473 if (STRNCMP(pstr, "prefixes") == 0)
1474 {
1475 suffixes = false;
1476 prefixes = true;
1477 oldformat = true;
1478 goto nextline;
1479 }
1480 if (STRNCMP(pstr, "flag") == 0)
1481 {
1482 s = recoded + 4; /* we need non-lowercased string */
1483 flagflags = 0;
1484
1485 while (*s && t_isspace(s))
1486 s += pg_mblen(s);
1487
1488 if (*s == '*')
1489 {
1490 flagflags |= FF_CROSSPRODUCT;
1491 s++;
1492 }
1493 else if (*s == '~')
1494 {
1495 flagflags |= FF_COMPOUNDONLY;
1496 s++;
1497 }
1498
1499 if (*s == '\\')
1500 s++;
1501
1502 /*
1503 * An old-format flag is a single ASCII character; we expect it to
1504 * be followed by EOL, whitespace, or ':'. Otherwise this is a
1505 * new-format flag command.
1506 */
1507 if (*s && pg_mblen(s) == 1)
1508 {
1509 COPYCHAR(flag, s);
1510 flag[1] = '\0';
1511
1512 s++;
1513 if (*s == '\0' || *s == '#' || *s == '\n' || *s == ':' ||
1514 t_isspace(s))
1515 {
1516 oldformat = true;
1517 goto nextline;
1518 }
1519 }
1520 goto isnewformat;
1521 }
1522 if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 ||
1523 STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
1524 STRNCMP(recoded, "PFX") == 0 ||
1525 STRNCMP(recoded, "SFX") == 0)
1526 goto isnewformat;
1527
1528 if ((!suffixes) && (!prefixes))
1529 goto nextline;
1530
1531 if (!parse_affentry(pstr, mask, find, repl))
1532 goto nextline;
1533
1534 NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
1535
1536nextline:
1537 pfree(recoded);
1538 pfree(pstr);
1539 }
1540 tsearch_readline_end(&trst);
1541 return;
1542
1543isnewformat:
1544 if (oldformat)
1545 ereport(ERROR,
1546 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1547 errmsg("affix file contains both old-style and new-style commands")));
1548 tsearch_readline_end(&trst);
1549
1550 NIImportOOAffixes(Conf, filename);
1551}
1552
1553/*
1554 * Merges two affix flag sets and stores a new affix flag set into
1555 * Conf->AffixData.
1556 *
1557 * Returns index of a new affix flag set.
1558 */
1559static int
1560MergeAffix(IspellDict *Conf, int a1, int a2)
1561{
1562 char **ptr;
1563
1564 /* Do not merge affix flags if one of affix flags is empty */
1565 if (*Conf->AffixData[a1] == '\0')
1566 return a2;
1567 else if (*Conf->AffixData[a2] == '\0')
1568 return a1;
1569
1570 while (Conf->nAffixData + 1 >= Conf->lenAffixData)
1571 {
1572 Conf->lenAffixData *= 2;
1573 Conf->AffixData = (char **) repalloc(Conf->AffixData,
1574 sizeof(char *) * Conf->lenAffixData);
1575 }
1576
1577 ptr = Conf->AffixData + Conf->nAffixData;
1578 if (Conf->flagMode == FM_NUM)
1579 {
1580 *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1581 strlen(Conf->AffixData[a2]) +
1582 1 /* comma */ + 1 /* \0 */ );
1583 sprintf(*ptr, "%s,%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1584 }
1585 else
1586 {
1587 *ptr = cpalloc(strlen(Conf->AffixData[a1]) +
1588 strlen(Conf->AffixData[a2]) +
1589 1 /* \0 */ );
1590 sprintf(*ptr, "%s%s", Conf->AffixData[a1], Conf->AffixData[a2]);
1591 }
1592 ptr++;
1593 *ptr = NULL;
1594 Conf->nAffixData++;
1595
1596 return Conf->nAffixData - 1;
1597}
1598
1599/*
1600 * Returns a set of affix parameters which correspondence to the set of affix
1601 * flags with the given index.
1602 */
1603static uint32
1604makeCompoundFlags(IspellDict *Conf, int affix)
1605{
1606 char *str = Conf->AffixData[affix];
1607
1608 return (getCompoundAffixFlagValue(Conf, str) & FF_COMPOUNDFLAGMASK);
1609}
1610
1611/*
1612 * Makes a prefix tree for the given level.
1613 *
1614 * Conf: current dictionary.
1615 * low: lower index of the Conf->Spell array.
1616 * high: upper index of the Conf->Spell array.
1617 * level: current prefix tree level.
1618 */
1619static SPNode *
1620mkSPNode(IspellDict *Conf, int low, int high, int level)
1621{
1622 int i;
1623 int nchar = 0;
1624 char lastchar = '\0';
1625 SPNode *rs;
1626 SPNodeData *data;
1627 int lownew = low;
1628
1629 for (i = low; i < high; i++)
1630 if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
1631 {
1632 nchar++;
1633 lastchar = Conf->Spell[i]->word[level];
1634 }
1635
1636 if (!nchar)
1637 return NULL;
1638
1639 rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
1640 rs->length = nchar;
1641 data = rs->data;
1642
1643 lastchar = '\0';
1644 for (i = low; i < high; i++)
1645 if (Conf->Spell[i]->p.d.len > level)
1646 {
1647 if (lastchar != Conf->Spell[i]->word[level])
1648 {
1649 if (lastchar)
1650 {
1651 /* Next level of the prefix tree */
1652 data->node = mkSPNode(Conf, lownew, i, level + 1);
1653 lownew = i;
1654 data++;
1655 }
1656 lastchar = Conf->Spell[i]->word[level];
1657 }
1658 data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
1659 if (Conf->Spell[i]->p.d.len == level + 1)
1660 {
1661 bool clearCompoundOnly = false;
1662
1663 if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
1664 {
1665 /*
1666 * MergeAffix called a few times. If one of word is
1667 * allowed to be in compound word and another isn't, then
1668 * clear FF_COMPOUNDONLY flag.
1669 */
1670
1671 clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
1672 & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
1673 ? false : true;
1674 data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
1675 }
1676 else
1677 data->affix = Conf->Spell[i]->p.d.affix;
1678 data->isword = 1;
1679
1680 data->compoundflag = makeCompoundFlags(Conf, data->affix);
1681
1682 if ((data->compoundflag & FF_COMPOUNDONLY) &&
1683 (data->compoundflag & FF_COMPOUNDFLAG) == 0)
1684 data->compoundflag |= FF_COMPOUNDFLAG;
1685
1686 if (clearCompoundOnly)
1687 data->compoundflag &= ~FF_COMPOUNDONLY;
1688 }
1689 }
1690
1691 /* Next level of the prefix tree */
1692 data->node = mkSPNode(Conf, lownew, high, level + 1);
1693
1694 return rs;
1695}
1696
1697/*
1698 * Builds the Conf->Dictionary tree and AffixData from the imported dictionary
1699 * and affixes.
1700 */
1701void
1702NISortDictionary(IspellDict *Conf)
1703{
1704 int i;
1705 int naffix = 0;
1706 int curaffix;
1707
1708 /* compress affixes */
1709
1710 /*
1711 * If we use flag aliases then we need to use Conf->AffixData filled in
1712 * the NIImportOOAffixes().
1713 */
1714 if (Conf->useFlagAliases)
1715 {
1716 for (i = 0; i < Conf->nspell; i++)
1717 {
1718 char *end;
1719
1720 if (*Conf->Spell[i]->p.flag != '\0')
1721 {
1722 curaffix = strtol(Conf->Spell[i]->p.flag, &end, 10);
1723 if (Conf->Spell[i]->p.flag == end || errno == ERANGE)
1724 ereport(ERROR,
1725 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1726 errmsg("invalid affix alias \"%s\"",
1727 Conf->Spell[i]->p.flag)));
1728 }
1729 else
1730 {
1731 /*
1732 * If Conf->Spell[i]->p.flag is empty, then get empty value of
1733 * Conf->AffixData (0 index).
1734 */
1735 curaffix = 0;
1736 }
1737
1738 Conf->Spell[i]->p.d.affix = curaffix;
1739 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1740 }
1741 }
1742 /* Otherwise fill Conf->AffixData here */
1743 else
1744 {
1745 /* Count the number of different flags used in the dictionary */
1746 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
1747 cmpspellaffix);
1748
1749 naffix = 0;
1750 for (i = 0; i < Conf->nspell; i++)
1751 {
1752 if (i == 0 ||
1753 strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag) != 0)
1754 naffix++;
1755 }
1756
1757 /*
1758 * Fill in Conf->AffixData with the affixes that were used in the
1759 * dictionary. Replace textual flag-field of Conf->Spell entries with
1760 * indexes into Conf->AffixData array.
1761 */
1762 Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
1763
1764 curaffix = -1;
1765 for (i = 0; i < Conf->nspell; i++)
1766 {
1767 if (i == 0 ||
1768 strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]) != 0)
1769 {
1770 curaffix++;
1771 Assert(curaffix < naffix);
1772 Conf->AffixData[curaffix] = cpstrdup(Conf,
1773 Conf->Spell[i]->p.flag);
1774 }
1775
1776 Conf->Spell[i]->p.d.affix = curaffix;
1777 Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
1778 }
1779
1780 Conf->lenAffixData = Conf->nAffixData = naffix;
1781 }
1782
1783 /* Start build a prefix tree */
1784 qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
1785 Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
1786}
1787
1788/*
1789 * Makes a prefix tree for the given level using the repl string of an affix
1790 * rule. Affixes with empty replace string do not include in the prefix tree.
1791 * This affixes are included by mkVoidAffix().
1792 *
1793 * Conf: current dictionary.
1794 * low: lower index of the Conf->Affix array.
1795 * high: upper index of the Conf->Affix array.
1796 * level: current prefix tree level.
1797 * type: FF_SUFFIX or FF_PREFIX.
1798 */
1799static AffixNode *
1800mkANode(IspellDict *Conf, int low, int high, int level, int type)
1801{
1802 int i;
1803 int nchar = 0;
1804 uint8 lastchar = '\0';
1805 AffixNode *rs;
1806 AffixNodeData *data;
1807 int lownew = low;
1808 int naff;
1809 AFFIX **aff;
1810
1811 for (i = low; i < high; i++)
1812 if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
1813 {
1814 nchar++;
1815 lastchar = GETCHAR(Conf->Affix + i, level, type);
1816 }
1817
1818 if (!nchar)
1819 return NULL;
1820
1821 aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
1822 naff = 0;
1823
1824 rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
1825 rs->length = nchar;
1826 data = rs->data;
1827
1828 lastchar = '\0';
1829 for (i = low; i < high; i++)
1830 if (Conf->Affix[i].replen > level)
1831 {
1832 if (lastchar != GETCHAR(Conf->Affix + i, level, type))
1833 {
1834 if (lastchar)
1835 {
1836 /* Next level of the prefix tree */
1837 data->node = mkANode(Conf, lownew, i, level + 1, type);
1838 if (naff)
1839 {
1840 data->naff = naff;
1841 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1842 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1843 naff = 0;
1844 }
1845 data++;
1846 lownew = i;
1847 }
1848 lastchar = GETCHAR(Conf->Affix + i, level, type);
1849 }
1850 data->val = GETCHAR(Conf->Affix + i, level, type);
1851 if (Conf->Affix[i].replen == level + 1)
1852 { /* affix stopped */
1853 aff[naff++] = Conf->Affix + i;
1854 }
1855 }
1856
1857 /* Next level of the prefix tree */
1858 data->node = mkANode(Conf, lownew, high, level + 1, type);
1859 if (naff)
1860 {
1861 data->naff = naff;
1862 data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
1863 memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
1864 naff = 0;
1865 }
1866
1867 pfree(aff);
1868
1869 return rs;
1870}
1871
1872/*
1873 * Makes the root void node in the prefix tree. The root void node is created
1874 * for affixes which have empty replace string ("repl" field).
1875 */
1876static void
1877mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
1878{
1879 int i,
1880 cnt = 0;
1881 int start = (issuffix) ? startsuffix : 0;
1882 int end = (issuffix) ? Conf->naffixes : startsuffix;
1883 AffixNode *Affix = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
1884
1885 Affix->length = 1;
1886 Affix->isvoid = 1;
1887
1888 if (issuffix)
1889 {
1890 Affix->data->node = Conf->Suffix;
1891 Conf->Suffix = Affix;
1892 }
1893 else
1894 {
1895 Affix->data->node = Conf->Prefix;
1896 Conf->Prefix = Affix;
1897 }
1898
1899 /* Count affixes with empty replace string */
1900 for (i = start; i < end; i++)
1901 if (Conf->Affix[i].replen == 0)
1902 cnt++;
1903
1904 /* There is not affixes with empty replace string */
1905 if (cnt == 0)
1906 return;
1907
1908 Affix->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * cnt);
1909 Affix->data->naff = (uint32) cnt;
1910
1911 cnt = 0;
1912 for (i = start; i < end; i++)
1913 if (Conf->Affix[i].replen == 0)
1914 {
1915 Affix->data->aff[cnt] = Conf->Affix + i;
1916 cnt++;
1917 }
1918}
1919
1920/*
1921 * Checks if the affixflag is used by dictionary. Conf->AffixData does not
1922 * contain affixflag if this flag is not used actually by the .dict file.
1923 *
1924 * Conf: current dictionary.
1925 * affixflag: affix flag.
1926 *
1927 * Returns true if the Conf->AffixData array contains affixflag, otherwise
1928 * returns false.
1929 */
1930static bool
1931isAffixInUse(IspellDict *Conf, char *affixflag)
1932{
1933 int i;
1934
1935 for (i = 0; i < Conf->nAffixData; i++)
1936 if (IsAffixFlagInUse(Conf, i, affixflag))
1937 return true;
1938
1939 return false;
1940}
1941
1942/*
1943 * Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
1944 */
1945void
1946NISortAffixes(IspellDict *Conf)
1947{
1948 AFFIX *Affix;
1949 size_t i;
1950 CMPDAffix *ptr;
1951 int firstsuffix = Conf->naffixes;
1952
1953 if (Conf->naffixes == 0)
1954 return;
1955
1956 /* Store compound affixes in the Conf->CompoundAffix array */
1957 if (Conf->naffixes > 1)
1958 qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
1959 Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
1960 ptr->affix = NULL;
1961
1962 for (i = 0; i < Conf->naffixes; i++)
1963 {
1964 Affix = &(((AFFIX *) Conf->Affix)[i]);
1965 if (Affix->type == FF_SUFFIX && i < firstsuffix)
1966 firstsuffix = i;
1967
1968 if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
1969 isAffixInUse(Conf, Affix->flag))
1970 {
1971 bool issuffix = (Affix->type == FF_SUFFIX);
1972
1973 if (ptr == Conf->CompoundAffix ||
1974 issuffix != (ptr - 1)->issuffix ||
1975 strbncmp((const unsigned char *) (ptr - 1)->affix,
1976 (const unsigned char *) Affix->repl,
1977 (ptr - 1)->len))
1978 {
1979 /* leave only unique and minimals suffixes */
1980 ptr->affix = Affix->repl;
1981 ptr->len = Affix->replen;
1982 ptr->issuffix = issuffix;
1983 ptr++;
1984 }
1985 }
1986 }
1987 ptr->affix = NULL;
1988 Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
1989
1990 /* Start build a prefix tree */
1991 Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
1992 Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
1993 mkVoidAffix(Conf, true, firstsuffix);
1994 mkVoidAffix(Conf, false, firstsuffix);
1995}
1996
1997static AffixNodeData *
1998FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
1999{
2000 AffixNodeData *StopLow,
2001 *StopHigh,
2002 *StopMiddle;
2003 uint8 symbol;
2004
2005 if (node->isvoid)
2006 { /* search void affixes */
2007 if (node->data->naff)
2008 return node->data;
2009 node = node->data->node;
2010 }
2011
2012 while (node && *level < wrdlen)
2013 {
2014 StopLow = node->data;
2015 StopHigh = node->data + node->length;
2016 while (StopLow < StopHigh)
2017 {
2018 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2019 symbol = GETWCHAR(word, wrdlen, *level, type);
2020
2021 if (StopMiddle->val == symbol)
2022 {
2023 (*level)++;
2024 if (StopMiddle->naff)
2025 return StopMiddle;
2026 node = StopMiddle->node;
2027 break;
2028 }
2029 else if (StopMiddle->val < symbol)
2030 StopLow = StopMiddle + 1;
2031 else
2032 StopHigh = StopMiddle;
2033 }
2034 if (StopLow >= StopHigh)
2035 break;
2036 }
2037 return NULL;
2038}
2039
2040static char *
2041CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
2042{
2043 /*
2044 * Check compound allow flags
2045 */
2046
2047 if (flagflags == 0)
2048 {
2049 if (Affix->flagflags & FF_COMPOUNDONLY)
2050 return NULL;
2051 }
2052 else if (flagflags & FF_COMPOUNDBEGIN)
2053 {
2054 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2055 return NULL;
2056 if ((Affix->flagflags & FF_COMPOUNDBEGIN) == 0)
2057 if (Affix->type == FF_SUFFIX)
2058 return NULL;
2059 }
2060 else if (flagflags & FF_COMPOUNDMIDDLE)
2061 {
2062 if ((Affix->flagflags & FF_COMPOUNDMIDDLE) == 0 ||
2063 (Affix->flagflags & FF_COMPOUNDFORBIDFLAG))
2064 return NULL;
2065 }
2066 else if (flagflags & FF_COMPOUNDLAST)
2067 {
2068 if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
2069 return NULL;
2070 if ((Affix->flagflags & FF_COMPOUNDLAST) == 0)
2071 if (Affix->type == FF_PREFIX)
2072 return NULL;
2073 }
2074
2075 /*
2076 * make replace pattern of affix
2077 */
2078 if (Affix->type == FF_SUFFIX)
2079 {
2080 strcpy(newword, word);
2081 strcpy(newword + len - Affix->replen, Affix->find);
2082 if (baselen) /* store length of non-changed part of word */
2083 *baselen = len - Affix->replen;
2084 }
2085 else
2086 {
2087 /*
2088 * if prefix is an all non-changed part's length then all word
2089 * contains only prefix and suffix, so out
2090 */
2091 if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
2092 return NULL;
2093 strcpy(newword, Affix->find);
2094 strcat(newword, word + Affix->replen);
2095 }
2096
2097 /*
2098 * check resulting word
2099 */
2100 if (Affix->issimple)
2101 return newword;
2102 else if (Affix->isregis)
2103 {
2104 if (RS_execute(&(Affix->reg.regis), newword))
2105 return newword;
2106 }
2107 else
2108 {
2109 int err;
2110 pg_wchar *data;
2111 size_t data_len;
2112 int newword_len;
2113
2114 /* Convert data string to wide characters */
2115 newword_len = strlen(newword);
2116 data = (pg_wchar *) palloc((newword_len + 1) * sizeof(pg_wchar));
2117 data_len = pg_mb2wchar_with_len(newword, data, newword_len);
2118
2119 if (!(err = pg_regexec(&(Affix->reg.regex), data, data_len, 0, NULL, 0, NULL, 0)))
2120 {
2121 pfree(data);
2122 return newword;
2123 }
2124 pfree(data);
2125 }
2126
2127 return NULL;
2128}
2129
2130static int
2131addToResult(char **forms, char **cur, char *word)
2132{
2133 if (cur - forms >= MAX_NORM - 1)
2134 return 0;
2135 if (forms == cur || strcmp(word, *(cur - 1)) != 0)
2136 {
2137 *cur = pstrdup(word);
2138 *(cur + 1) = NULL;
2139 return 1;
2140 }
2141
2142 return 0;
2143}
2144
2145static char **
2146NormalizeSubWord(IspellDict *Conf, char *word, int flag)
2147{
2148 AffixNodeData *suffix = NULL,
2149 *prefix = NULL;
2150 int slevel = 0,
2151 plevel = 0;
2152 int wrdlen = strlen(word),
2153 swrdlen;
2154 char **forms;
2155 char **cur;
2156 char newword[2 * MAXNORMLEN] = "";
2157 char pnewword[2 * MAXNORMLEN] = "";
2158 AffixNode *snode = Conf->Suffix,
2159 *pnode;
2160 int i,
2161 j;
2162
2163 if (wrdlen > MAXNORMLEN)
2164 return NULL;
2165 cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
2166 *cur = NULL;
2167
2168
2169 /* Check that the word itself is normal form */
2170 if (FindWord(Conf, word, VoidString, flag))
2171 {
2172 *cur = pstrdup(word);
2173 cur++;
2174 *cur = NULL;
2175 }
2176
2177 /* Find all other NORMAL forms of the 'word' (check only prefix) */
2178 pnode = Conf->Prefix;
2179 plevel = 0;
2180 while (pnode)
2181 {
2182 prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
2183 if (!prefix)
2184 break;
2185 for (j = 0; j < prefix->naff; j++)
2186 {
2187 if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
2188 {
2189 /* prefix success */
2190 if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
2191 cur += addToResult(forms, cur, newword);
2192 }
2193 }
2194 pnode = prefix->node;
2195 }
2196
2197 /*
2198 * Find all other NORMAL forms of the 'word' (check suffix and then
2199 * prefix)
2200 */
2201 while (snode)
2202 {
2203 int baselen = 0;
2204
2205 /* find possible suffix */
2206 suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
2207 if (!suffix)
2208 break;
2209 /* foreach suffix check affix */
2210 for (i = 0; i < suffix->naff; i++)
2211 {
2212 if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
2213 {
2214 /* suffix success */
2215 if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
2216 cur += addToResult(forms, cur, newword);
2217
2218 /* now we will look changed word with prefixes */
2219 pnode = Conf->Prefix;
2220 plevel = 0;
2221 swrdlen = strlen(newword);
2222 while (pnode)
2223 {
2224 prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
2225 if (!prefix)
2226 break;
2227 for (j = 0; j < prefix->naff; j++)
2228 {
2229 if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
2230 {
2231 /* prefix success */
2232 char *ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
2233 VoidString : prefix->aff[j]->flag;
2234
2235 if (FindWord(Conf, pnewword, ff, flag))
2236 cur += addToResult(forms, cur, pnewword);
2237 }
2238 }
2239 pnode = prefix->node;
2240 }
2241 }
2242 }
2243
2244 snode = suffix->node;
2245 }
2246
2247 if (cur == forms)
2248 {
2249 pfree(forms);
2250 return NULL;
2251 }
2252 return forms;
2253}
2254
/*
 * One variant of splitting a compound word into stems.  Variants are chained
 * into a singly linked list.
 */
typedef struct SplitVar SplitVar;

struct SplitVar
{
	int			nstem;			/* number of entries used in stem[] */
	int			lenstem;		/* number of entries allocated for stem[] */
	char	  **stem;			/* the stems of this variant */
	SplitVar   *next;			/* next variant, or NULL */
};
2262
2263static int
2264CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
2265{
2266 bool issuffix;
2267
2268 /* in case CompoundAffix is null: */
2269 if (*ptr == NULL)
2270 return -1;
2271
2272 if (CheckInPlace)
2273 {
2274 while ((*ptr)->affix)
2275 {
2276 if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
2277 {
2278 len = (*ptr)->len;
2279 issuffix = (*ptr)->issuffix;
2280 (*ptr)++;
2281 return (issuffix) ? len : 0;
2282 }
2283 (*ptr)++;
2284 }
2285 }
2286 else
2287 {
2288 char *affbegin;
2289
2290 while ((*ptr)->affix)
2291 {
2292 if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
2293 {
2294 len = (*ptr)->len + (affbegin - word);
2295 issuffix = (*ptr)->issuffix;
2296 (*ptr)++;
2297 return (issuffix) ? len : 0;
2298 }
2299 (*ptr)++;
2300 }
2301 }
2302 return -1;
2303}
2304
2305static SplitVar *
2306CopyVar(SplitVar *s, int makedup)
2307{
2308 SplitVar *v = (SplitVar *) palloc(sizeof(SplitVar));
2309
2310 v->next = NULL;
2311 if (s)
2312 {
2313 int i;
2314
2315 v->lenstem = s->lenstem;
2316 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2317 v->nstem = s->nstem;
2318 for (i = 0; i < s->nstem; i++)
2319 v->stem[i] = (makedup) ? pstrdup(s->stem[i]) : s->stem[i];
2320 }
2321 else
2322 {
2323 v->lenstem = 16;
2324 v->stem = (char **) palloc(sizeof(char *) * v->lenstem);
2325 v->nstem = 0;
2326 }
2327 return v;
2328}
2329
2330static void
2331AddStem(SplitVar *v, char *word)
2332{
2333 if (v->nstem >= v->lenstem)
2334 {
2335 v->lenstem *= 2;
2336 v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
2337 }
2338
2339 v->stem[v->nstem] = word;
2340 v->nstem++;
2341}
2342
2343static SplitVar *
2344SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
2345{
2346 SplitVar *var = NULL;
2347 SPNodeData *StopLow,
2348 *StopHigh,
2349 *StopMiddle = NULL;
2350 SPNode *node = (snode) ? snode : Conf->Dictionary;
2351 int level = (snode) ? minpos : startpos; /* recursive
2352 * minpos==level */
2353 int lenaff;
2354 CMPDAffix *caff;
2355 char *notprobed;
2356 int compoundflag = 0;
2357
2358 notprobed = (char *) palloc(wordlen);
2359 memset(notprobed, 1, wordlen);
2360 var = CopyVar(orig, 1);
2361
2362 while (level < wordlen)
2363 {
2364 /* find word with epenthetic or/and compound affix */
2365 caff = Conf->CompoundAffix;
2366 while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
2367 {
2368 /*
2369 * there is one of compound affixes, so check word for existings
2370 */
2371 char buf[MAXNORMLEN];
2372 char **subres;
2373
2374 lenaff = level - startpos + lenaff;
2375
2376 if (!notprobed[startpos + lenaff - 1])
2377 continue;
2378
2379 if (level + lenaff - 1 <= minpos)
2380 continue;
2381
2382 if (lenaff >= MAXNORMLEN)
2383 continue; /* skip too big value */
2384 if (lenaff > 0)
2385 memcpy(buf, word + startpos, lenaff);
2386 buf[lenaff] = '\0';
2387
2388 if (level == 0)
2389 compoundflag = FF_COMPOUNDBEGIN;
2390 else if (level == wordlen - 1)
2391 compoundflag = FF_COMPOUNDLAST;
2392 else
2393 compoundflag = FF_COMPOUNDMIDDLE;
2394 subres = NormalizeSubWord(Conf, buf, compoundflag);
2395 if (subres)
2396 {
2397 /* Yes, it was a word from dictionary */
2398 SplitVar *new = CopyVar(var, 0);
2399 SplitVar *ptr = var;
2400 char **sptr = subres;
2401
2402 notprobed[startpos + lenaff - 1] = 0;
2403
2404 while (*sptr)
2405 {
2406 AddStem(new, *sptr);
2407 sptr++;
2408 }
2409 pfree(subres);
2410
2411 while (ptr->next)
2412 ptr = ptr->next;
2413 ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
2414
2415 pfree(new->stem);
2416 pfree(new);
2417 }
2418 }
2419
2420 if (!node)
2421 break;
2422
2423 StopLow = node->data;
2424 StopHigh = node->data + node->length;
2425 while (StopLow < StopHigh)
2426 {
2427 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
2428 if (StopMiddle->val == ((uint8 *) (word))[level])
2429 break;
2430 else if (StopMiddle->val < ((uint8 *) (word))[level])
2431 StopLow = StopMiddle + 1;
2432 else
2433 StopHigh = StopMiddle;
2434 }
2435
2436 if (StopLow < StopHigh)
2437 {
2438 if (startpos == 0)
2439 compoundflag = FF_COMPOUNDBEGIN;
2440 else if (level == wordlen - 1)
2441 compoundflag = FF_COMPOUNDLAST;
2442 else
2443 compoundflag = FF_COMPOUNDMIDDLE;
2444
2445 /* find infinitive */
2446 if (StopMiddle->isword &&
2447 (StopMiddle->compoundflag & compoundflag) &&
2448 notprobed[level])
2449 {
2450 /* ok, we found full compoundallowed word */
2451 if (level > minpos)
2452 {
2453 /* and its length more than minimal */
2454 if (wordlen == level + 1)
2455 {
2456 /* well, it was last word */
2457 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2458 pfree(notprobed);
2459 return var;
2460 }
2461 else
2462 {
2463 /* then we will search more big word at the same point */
2464 SplitVar *ptr = var;
2465
2466 while (ptr->next)
2467 ptr = ptr->next;
2468 ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
2469 /* we can find next word */
2470 level++;
2471 AddStem(var, pnstrdup(word + startpos, level - startpos));
2472 node = Conf->Dictionary;
2473 startpos = level;
2474 continue;
2475 }
2476 }
2477 }
2478 node = StopMiddle->node;
2479 }
2480 else
2481 node = NULL;
2482 level++;
2483 }
2484
2485 AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
2486 pfree(notprobed);
2487 return var;
2488}
2489
2490static void
2491addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
2492{
2493 if (*lres == NULL)
2494 *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
2495
2496 if (*lcur - *lres < MAX_NORM - 1)
2497 {
2498 (*lcur)->lexeme = word;
2499 (*lcur)->flags = flags;
2500 (*lcur)->nvariant = NVariant;
2501 (*lcur)++;
2502 (*lcur)->lexeme = NULL;
2503 }
2504}
2505
2506TSLexeme *
2507NINormalizeWord(IspellDict *Conf, char *word)
2508{
2509 char **res;
2510 TSLexeme *lcur = NULL,
2511 *lres = NULL;
2512 uint16 NVariant = 1;
2513
2514 res = NormalizeSubWord(Conf, word, 0);
2515
2516 if (res)
2517 {
2518 char **ptr = res;
2519
2520 while (*ptr && (lcur - lres) < MAX_NORM)
2521 {
2522 addNorm(&lres, &lcur, *ptr, 0, NVariant++);
2523 ptr++;
2524 }
2525 pfree(res);
2526 }
2527
2528 if (Conf->usecompound)
2529 {
2530 int wordlen = strlen(word);
2531 SplitVar *ptr,
2532 *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
2533 int i;
2534
2535 while (var)
2536 {
2537 if (var->nstem > 1)
2538 {
2539 char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
2540
2541 if (subres)
2542 {
2543 char **subptr = subres;
2544
2545 while (*subptr)
2546 {
2547 for (i = 0; i < var->nstem - 1; i++)
2548 {
2549 addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
2550 }
2551
2552 addNorm(&lres, &lcur, *subptr, 0, NVariant);
2553 subptr++;
2554 NVariant++;
2555 }
2556
2557 pfree(subres);
2558 var->stem[0] = NULL;
2559 pfree(var->stem[var->nstem - 1]);
2560 }
2561 }
2562
2563 for (i = 0; i < var->nstem && var->stem[i]; i++)
2564 pfree(var->stem[i]);
2565 ptr = var->next;
2566 pfree(var->stem);
2567 pfree(var);
2568 var = ptr;
2569 }
2570 }
2571
2572 return lres;
2573}
2574