1// This is an open source non-commercial project. Dear PVS-Studio, please check
2// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4// spell.c: code for spell checking
5//
6// See spellfile.c for the Vim spell file format.
7//
8// The spell checking mechanism uses a tree (aka trie). Each node in the tree
9// has a list of bytes that can appear (siblings). For each byte there is a
10// pointer to the node with the byte that follows in the word (child).
11//
12// A NUL byte is used where the word may end. The bytes are sorted, so that
13// binary searching can be used and the NUL bytes are at the start. The
14// number of possible bytes is stored before the list of bytes.
15//
16// The tree uses two arrays: "byts" stores the characters, "idxs" stores
17// either the next index or flags. The tree starts at index 0. For example,
18// to lookup "vi" this sequence is followed:
19// i = 0
20// len = byts[i]
21// n = where "v" appears in byts[i + 1] to byts[i + len]
22// i = idxs[n]
23// len = byts[i]
24// n = where "i" appears in byts[i + 1] to byts[i + len]
25// i = idxs[n]
26// len = byts[i]
27// find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
28//
29// There are two word trees: one with case-folded words and one with words in
30// original case. The second one is only used for keep-case words and is
31// usually small.
32//
33// There is one additional tree for when not all prefixes are applied when
34// generating the .spl file. This tree stores all the possible prefixes, as
35// if they were words. At each word (prefix) end the prefix nr is stored, the
36// following word must support this prefix nr. And the condition nr is
37// stored, used to lookup the condition that the word must match with.
38//
39// Thanks to Olaf Seibert for providing an example implementation of this tree
40// and the compression mechanism.
41// LZ trie ideas:
42// http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf
43// More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html
44//
45// Matching involves checking the caps type: Onecap ALLCAP KeepCap.
46//
47// Why doesn't Vim use aspell/ispell/myspell/etc.?
48// See ":help develop-spell".
49
50// Use SPELL_PRINTTREE for debugging: dump the word tree after adding a word.
51// Only use it for small word lists!
52
53// Use SPELL_COMPRESS_ALLWAYS for debugging: compress the word tree after
54// adding a word. Only use it for small word lists!
55
56// Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk() for a
57// specific word.
58
59// Use this to adjust the score after finding suggestions, based on the
60// suggested word sounding like the bad word. This is much faster than doing
61// it for every possible suggestion.
62// Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
63// vs "ht") and goes down in the list.
64// Used when 'spellsuggest' is set to "best".
65#define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4)
66
67// Do the opposite: based on a maximum end score and a known sound score,
68// compute the maximum word score that can be used.
69#define MAXSCORE(word_score, sound_score) ((4 * word_score - sound_score) / 3)
70
71#include <assert.h>
72#include <inttypes.h>
73#include <limits.h>
74#include <stdbool.h>
75#include <string.h>
76#include <stdlib.h>
77#include <wctype.h>
78
79/* for offsetof() */
80#include <stddef.h>
81
82#include "nvim/vim.h"
83#include "nvim/ascii.h"
84#include "nvim/spell.h"
85#include "nvim/buffer.h"
86#include "nvim/change.h"
87#include "nvim/charset.h"
88#include "nvim/cursor.h"
89#include "nvim/edit.h"
90#include "nvim/eval.h"
91#include "nvim/ex_cmds.h"
92#include "nvim/ex_cmds2.h"
93#include "nvim/ex_docmd.h"
94#include "nvim/fileio.h"
95#include "nvim/func_attr.h"
96#include "nvim/getchar.h"
97#include "nvim/hashtab.h"
98#include "nvim/mark.h"
99#include "nvim/mbyte.h"
100#include "nvim/memline.h"
101#include "nvim/memory.h"
102#include "nvim/message.h"
103#include "nvim/misc1.h"
104#include "nvim/garray.h"
105#include "nvim/normal.h"
106#include "nvim/option.h"
107#include "nvim/os_unix.h"
108#include "nvim/path.h"
109#include "nvim/regexp.h"
110#include "nvim/screen.h"
111#include "nvim/search.h"
112#include "nvim/spellfile.h"
113#include "nvim/strings.h"
114#include "nvim/syntax.h"
115#include "nvim/undo.h"
116#include "nvim/os/os.h"
117#include "nvim/os/input.h"
118
119// only used for su_badflags
120#define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI
121
122#define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP)
123
124// Result values. Lower number is accepted over higher one.
125#define SP_BANNED -1
126#define SP_RARE 0
127#define SP_OK 1
128#define SP_LOCAL 2
129#define SP_BAD 3
130
131// First language that is loaded, start of the linked list of loaded
132// languages.
133slang_T *first_lang = NULL;
134
135// file used for "zG" and "zW"
136char_u *int_wordlist = NULL;
137
138typedef struct wordcount_S {
139 uint16_t wc_count; // nr of times word was seen
140 char_u wc_word[1]; // word, actually longer
141} wordcount_T;
142
143#define WC_KEY_OFF offsetof(wordcount_T, wc_word)
144#define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF))
145#define MAXWORDCOUNT 0xffff
146
147// Information used when looking for suggestions.
148typedef struct suginfo_S {
149 garray_T su_ga; // suggestions, contains "suggest_T"
150 int su_maxcount; // max. number of suggestions displayed
151 int su_maxscore; // maximum score for adding to su_ga
152 int su_sfmaxscore; // idem, for when doing soundfold words
153 garray_T su_sga; // like su_ga, sound-folded scoring
154 char_u *su_badptr; // start of bad word in line
155 int su_badlen; // length of detected bad word in line
156 int su_badflags; // caps flags for bad word
157 char_u su_badword[MAXWLEN]; // bad word truncated at su_badlen
158 char_u su_fbadword[MAXWLEN]; // su_badword case-folded
159 char_u su_sal_badword[MAXWLEN]; // su_badword soundfolded
160 hashtab_T su_banned; // table with banned words
161 slang_T *su_sallang; // default language for sound folding
162} suginfo_T;
163
164// One word suggestion. Used in "si_ga".
165typedef struct {
166 char_u *st_word; // suggested word, allocated string
167 int st_wordlen; // STRLEN(st_word)
168 int st_orglen; // length of replaced text
169 int st_score; // lower is better
170 int st_altscore; // used when st_score compares equal
171 bool st_salscore; // st_score is for soundalike
172 bool st_had_bonus; // bonus already included in score
173 slang_T *st_slang; // language used for sound folding
174} suggest_T;
175
176#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
177
178// True if a word appears in the list of banned words.
179#define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&su->su_banned, word)))
180
181// Number of suggestions kept when cleaning up. We need to keep more than
182// what is displayed, because when rescore_suggestions() is called the score
183// may change and wrong suggestions may be removed later.
184#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < \
185 130 ? 150 : (su)->su_maxcount + 20)
186
187// Threshold for sorting and cleaning up suggestions. Don't want to keep lots
188// of suggestions that are not going to be displayed.
189#define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50)
190
191// score for various changes
192#define SCORE_SPLIT 149 // split bad word
193#define SCORE_SPLIT_NO 249 // split bad word with NOSPLITSUGS
194#define SCORE_ICASE 52 // slightly different case
195#define SCORE_REGION 200 // word is for different region
196#define SCORE_RARE 180 // rare word
197#define SCORE_SWAP 75 // swap two characters
198#define SCORE_SWAP3 110 // swap two characters in three
199#define SCORE_REP 65 // REP replacement
200#define SCORE_SUBST 93 // substitute a character
201#define SCORE_SIMILAR 33 // substitute a similar character
202#define SCORE_SUBCOMP 33 // substitute a composing character
203#define SCORE_DEL 94 // delete a character
204#define SCORE_DELDUP 66 // delete a duplicated character
205#define SCORE_DELCOMP 28 // delete a composing character
206#define SCORE_INS 96 // insert a character
207#define SCORE_INSDUP 67 // insert a duplicate character
208#define SCORE_INSCOMP 30 // insert a composing character
209#define SCORE_NONWORD 103 // change non-word to word char
210
211#define SCORE_FILE 30 // suggestion from a file
212#define SCORE_MAXINIT 350 // Initial maximum score: higher == slower.
213 // 350 allows for about three changes.
214
215#define SCORE_COMMON1 30 // subtracted for words seen before
216#define SCORE_COMMON2 40 // subtracted for words often seen
217#define SCORE_COMMON3 50 // subtracted for words very often seen
218#define SCORE_THRES2 10 // word count threshold for COMMON2
219#define SCORE_THRES3 100 // word count threshold for COMMON3
220
221// When trying changed soundfold words it becomes slow when trying more than
222// two changes. With less then two changes it's slightly faster but we miss a
223// few good suggestions. In rare cases we need to try three of four changes.
224#define SCORE_SFMAX1 200 // maximum score for first try
225#define SCORE_SFMAX2 300 // maximum score for second try
226#define SCORE_SFMAX3 400 // maximum score for third try
227
228#define SCORE_BIG SCORE_INS * 3 // big difference
229#define SCORE_MAXMAX 999999 // accept any score
230#define SCORE_LIMITMAX 350 // for spell_edit_score_limit()
231
232// for spell_edit_score_limit() we need to know the minimum value of
233// SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
234#define SCORE_EDIT_MIN SCORE_SIMILAR
235
236// Structure to store info for word matching.
237typedef struct matchinf_S {
238 langp_T *mi_lp; // info for language and region
239
240 // pointers to original text to be checked
241 char_u *mi_word; // start of word being checked
242 char_u *mi_end; // end of matching word so far
243 char_u *mi_fend; // next char to be added to mi_fword
244 char_u *mi_cend; // char after what was used for
245 // mi_capflags
246
247 // case-folded text
248 char_u mi_fword[MAXWLEN + 1]; // mi_word case-folded
249 int mi_fwordlen; // nr of valid bytes in mi_fword
250
251 // for when checking word after a prefix
252 int mi_prefarridx; // index in sl_pidxs with list of
253 // affixID/condition
254 int mi_prefcnt; // number of entries at mi_prefarridx
255 int mi_prefixlen; // byte length of prefix
256 int mi_cprefixlen; // byte length of prefix in original
257 // case
258
259 // for when checking a compound word
260 int mi_compoff; // start of following word offset
261 char_u mi_compflags[MAXWLEN]; // flags for compound words used
262 int mi_complen; // nr of compound words used
263 int mi_compextra; // nr of COMPOUNDROOT words
264
265 // others
266 int mi_result; // result so far: SP_BAD, SP_OK, etc.
267 int mi_capflags; // WF_ONECAP WF_ALLCAP WF_KEEPCAP
268 win_T *mi_win; // buffer being checked
269
270 // for NOBREAK
271 int mi_result2; // "mi_resul" without following word
272 char_u *mi_end2; // "mi_end" without following word
273} matchinf_T;
274
275// Structure used for the cookie argument of do_in_runtimepath().
276typedef struct spelload_S {
277 char_u sl_lang[MAXWLEN + 1]; // language name
278 slang_T *sl_slang; // resulting slang_T struct
279 int sl_nobreak; // NOBREAK language found
280} spelload_T;
281
282#define SY_MAXLEN 30
283typedef struct syl_item_S {
284 char_u sy_chars[SY_MAXLEN]; // the sequence of chars
285 int sy_len;
286} syl_item_T;
287
288spelltab_T spelltab;
289int did_set_spelltab;
290
291// structure used to store soundfolded words that add_sound_suggest() has
292// handled already.
293typedef struct {
294 short sft_score; // lowest score used
295 char_u sft_word[1]; // soundfolded word, actually longer
296} sftword_T;
297
298typedef struct {
299 int badi;
300 int goodi;
301 int score;
302} limitscore_T;
303
304
305#ifdef INCLUDE_GENERATED_DECLARATIONS
306# include "spell.c.generated.h"
307#endif
308
309// values for ts_isdiff
310#define DIFF_NONE 0 // no different byte (yet)
311#define DIFF_YES 1 // different byte found
312#define DIFF_INSERT 2 // inserting character
313
314// values for ts_flags
315#define TSF_PREFIXOK 1 // already checked that prefix is OK
316#define TSF_DIDSPLIT 2 // tried split at this point
317#define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index
318
319// special values ts_prefixdepth
320#define PFD_NOPREFIX 0xff // not using prefixes
321#define PFD_PREFIXTREE 0xfe // walking through the prefix tree
322#define PFD_NOTSPECIAL 0xfd // highest value that's not special
323
324// mode values for find_word
325#define FIND_FOLDWORD 0 // find word case-folded
326#define FIND_KEEPWORD 1 // find keep-case word
327#define FIND_PREFIX 2 // find word after prefix
328#define FIND_COMPOUND 3 // find case-folded compound word
329#define FIND_KEEPCOMPOUND 4 // find keep-case compound word
330
331char *e_format = N_("E759: Format error in spell file");
332
333// Remember what "z?" replaced.
334static char_u *repl_from = NULL;
335static char_u *repl_to = NULL;
336
337// Main spell-checking function.
338// "ptr" points to a character that could be the start of a word.
339// "*attrp" is set to the highlight index for a badly spelled word. For a
340// non-word or when it's OK it remains unchanged.
341// This must only be called when 'spelllang' is not empty.
342//
343// "capcol" is used to check for a Capitalised word after the end of a
344// sentence. If it's zero then perform the check. Return the column where to
345// check next, or -1 when no sentence end was found. If it's NULL then don't
346// worry.
347//
348// Returns the length of the word in bytes, also when it's OK, so that the
349// caller can skip over the word.
350size_t spell_check(
351 win_T *wp, // current window
352 char_u *ptr,
353 hlf_T *attrp,
354 int *capcol, // column to check for Capital
355 bool docount // count good words
356)
357{
358 matchinf_T mi; // Most things are put in "mi" so that it can
359 // be passed to functions quickly.
360 size_t nrlen = 0; // found a number first
361 int c;
362 size_t wrongcaplen = 0;
363 int lpi;
364 bool count_word = docount;
365
366 // A word never starts at a space or a control character. Return quickly
367 // then, skipping over the character.
368 if (*ptr <= ' ') {
369 return 1;
370 }
371
372 // Return here when loading language files failed.
373 if (GA_EMPTY(&wp->w_s->b_langp)) {
374 return 1;
375 }
376
377 memset(&mi, 0, sizeof(matchinf_T));
378
379 // A number is always OK. Also skip hexadecimal numbers 0xFF99 and
380 // 0X99FF. But always do check spelling to find "3GPP" and "11
381 // julifeest".
382 if (*ptr >= '0' && *ptr <= '9') {
383 if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) {
384 mi.mi_end = (char_u*) skipbin((char*) ptr + 2);
385 } else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) {
386 mi.mi_end = skiphex(ptr + 2);
387 } else {
388 mi.mi_end = skipdigits(ptr);
389 }
390 nrlen = (size_t)(mi.mi_end - ptr);
391 }
392
393 // Find the normal end of the word (until the next non-word character).
394 mi.mi_word = ptr;
395 mi.mi_fend = ptr;
396 if (spell_iswordp(mi.mi_fend, wp)) {
397 do {
398 MB_PTR_ADV(mi.mi_fend);
399 } while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp));
400
401 if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) {
402 // Check word starting with capital letter.
403 c = PTR2CHAR(ptr);
404 if (!SPELL_ISUPPER(c)) {
405 wrongcaplen = (size_t)(mi.mi_fend - ptr);
406 }
407 }
408 }
409 if (capcol != NULL) {
410 *capcol = -1;
411 }
412
413 // We always use the characters up to the next non-word character,
414 // also for bad words.
415 mi.mi_end = mi.mi_fend;
416
417 // Check caps type later.
418 mi.mi_capflags = 0;
419 mi.mi_cend = NULL;
420 mi.mi_win = wp;
421
422 // case-fold the word with one non-word character, so that we can check
423 // for the word end.
424 if (*mi.mi_fend != NUL) {
425 MB_PTR_ADV(mi.mi_fend);
426 }
427
428 (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, MAXWLEN + 1);
429 mi.mi_fwordlen = (int)STRLEN(mi.mi_fword);
430
431 // The word is bad unless we recognize it.
432 mi.mi_result = SP_BAD;
433 mi.mi_result2 = SP_BAD;
434
435 // Loop over the languages specified in 'spelllang'.
436 // We check them all, because a word may be matched longer in another
437 // language.
438 for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi) {
439 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi);
440
441 // If reloading fails the language is still in the list but everything
442 // has been cleared.
443 if (mi.mi_lp->lp_slang->sl_fidxs == NULL) {
444 continue;
445 }
446
447 // Check for a matching word in case-folded words.
448 find_word(&mi, FIND_FOLDWORD);
449
450 // Check for a matching word in keep-case words.
451 find_word(&mi, FIND_KEEPWORD);
452
453 // Check for matching prefixes.
454 find_prefix(&mi, FIND_FOLDWORD);
455
456 // For a NOBREAK language, may want to use a word without a following
457 // word as a backup.
458 if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD
459 && mi.mi_result2 != SP_BAD) {
460 mi.mi_result = mi.mi_result2;
461 mi.mi_end = mi.mi_end2;
462 }
463
464 // Count the word in the first language where it's found to be OK.
465 if (count_word && mi.mi_result == SP_OK) {
466 count_common_word(mi.mi_lp->lp_slang, ptr,
467 (int)(mi.mi_end - ptr), 1);
468 count_word = false;
469 }
470 }
471
472 if (mi.mi_result != SP_OK) {
473 // If we found a number skip over it. Allows for "42nd". Do flag
474 // rare and local words, e.g., "3GPP".
475 if (nrlen > 0) {
476 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) {
477 return nrlen;
478 }
479 } else if (!spell_iswordp_nmw(ptr, wp)) {
480 // When we are at a non-word character there is no error, just
481 // skip over the character (try looking for a word after it).
482 if (capcol != NULL && wp->w_s->b_cap_prog != NULL) {
483 regmatch_T regmatch;
484
485 // Check for end of sentence.
486 regmatch.regprog = wp->w_s->b_cap_prog;
487 regmatch.rm_ic = false;
488 int r = vim_regexec(&regmatch, ptr, 0);
489 wp->w_s->b_cap_prog = regmatch.regprog;
490 if (r) {
491 *capcol = (int)(regmatch.endp[0] - ptr);
492 }
493 }
494
495 if (has_mbyte) {
496 return (size_t)(*mb_ptr2len)(ptr);
497 }
498 return 1;
499 } else if (mi.mi_end == ptr) {
500 // Always include at least one character. Required for when there
501 // is a mixup in "midword".
502 MB_PTR_ADV(mi.mi_end);
503 } else if (mi.mi_result == SP_BAD
504 && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) {
505 char_u *p, *fp;
506 int save_result = mi.mi_result;
507
508 // First language in 'spelllang' is NOBREAK. Find first position
509 // at which any word would be valid.
510 mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0);
511 if (mi.mi_lp->lp_slang->sl_fidxs != NULL) {
512 p = mi.mi_word;
513 fp = mi.mi_fword;
514 for (;;) {
515 MB_PTR_ADV(p);
516 MB_PTR_ADV(fp);
517 if (p >= mi.mi_end) {
518 break;
519 }
520 mi.mi_compoff = (int)(fp - mi.mi_fword);
521 find_word(&mi, FIND_COMPOUND);
522 if (mi.mi_result != SP_BAD) {
523 mi.mi_end = p;
524 break;
525 }
526 }
527 mi.mi_result = save_result;
528 }
529 }
530
531 if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) {
532 *attrp = HLF_SPB;
533 } else if (mi.mi_result == SP_RARE) {
534 *attrp = HLF_SPR;
535 } else {
536 *attrp = HLF_SPL;
537 }
538 }
539
540 if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) {
541 // Report SpellCap only when the word isn't badly spelled.
542 *attrp = HLF_SPC;
543 return wrongcaplen;
544 }
545
546 return (size_t)(mi.mi_end - ptr);
547}
548
549// Check if the word at "mip->mi_word" is in the tree.
550// When "mode" is FIND_FOLDWORD check in fold-case word tree.
551// When "mode" is FIND_KEEPWORD check in keep-case word tree.
552// When "mode" is FIND_PREFIX check for word after prefix in fold-case word
553// tree.
554//
555// For a match mip->mi_result is updated.
556static void find_word(matchinf_T *mip, int mode)
557{
558 int wlen = 0;
559 int flen;
560 char_u *ptr;
561 slang_T *slang = mip->mi_lp->lp_slang;
562 char_u *byts;
563 idx_T *idxs;
564
565 if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) {
566 // Check for word with matching case in keep-case tree.
567 ptr = mip->mi_word;
568 flen = 9999; // no case folding, always enough bytes
569 byts = slang->sl_kbyts;
570 idxs = slang->sl_kidxs;
571
572 if (mode == FIND_KEEPCOMPOUND)
573 // Skip over the previously found word(s).
574 wlen += mip->mi_compoff;
575 } else {
576 // Check for case-folded in case-folded tree.
577 ptr = mip->mi_fword;
578 flen = mip->mi_fwordlen; // available case-folded bytes
579 byts = slang->sl_fbyts;
580 idxs = slang->sl_fidxs;
581
582 if (mode == FIND_PREFIX) {
583 // Skip over the prefix.
584 wlen = mip->mi_prefixlen;
585 flen -= mip->mi_prefixlen;
586 } else if (mode == FIND_COMPOUND) {
587 // Skip over the previously found word(s).
588 wlen = mip->mi_compoff;
589 flen -= mip->mi_compoff;
590 }
591
592 }
593
594 if (byts == NULL)
595 return; // array is empty
596
597 idx_T arridx = 0;
598 int endlen[MAXWLEN]; // length at possible word endings
599 idx_T endidx[MAXWLEN]; // possible word endings
600 int endidxcnt = 0;
601 int len;
602 int c;
603
604 // Repeat advancing in the tree until:
605 // - there is a byte that doesn't match,
606 // - we reach the end of the tree,
607 // - or we reach the end of the line.
608 for (;; ) {
609 if (flen <= 0 && *mip->mi_fend != NUL)
610 flen = fold_more(mip);
611
612 len = byts[arridx++];
613
614 // If the first possible byte is a zero the word could end here.
615 // Remember this index, we first check for the longest word.
616 if (byts[arridx] == 0) {
617 if (endidxcnt == MAXWLEN) {
618 // Must be a corrupted spell file.
619 EMSG(_(e_format));
620 return;
621 }
622 endlen[endidxcnt] = wlen;
623 endidx[endidxcnt++] = arridx++;
624 --len;
625
626 // Skip over the zeros, there can be several flag/region
627 // combinations.
628 while (len > 0 && byts[arridx] == 0) {
629 ++arridx;
630 --len;
631 }
632 if (len == 0)
633 break; // no children, word must end here
634 }
635
636 // Stop looking at end of the line.
637 if (ptr[wlen] == NUL)
638 break;
639
640 // Perform a binary search in the list of accepted bytes.
641 c = ptr[wlen];
642 if (c == TAB) // <Tab> is handled like <Space>
643 c = ' ';
644 idx_T lo = arridx;
645 idx_T hi = arridx + len - 1;
646 while (lo < hi) {
647 idx_T m = (lo + hi) / 2;
648 if (byts[m] > c)
649 hi = m - 1;
650 else if (byts[m] < c)
651 lo = m + 1;
652 else {
653 lo = hi = m;
654 break;
655 }
656 }
657
658 // Stop if there is no matching byte.
659 if (hi < lo || byts[lo] != c)
660 break;
661
662 // Continue at the child (if there is one).
663 arridx = idxs[lo];
664 ++wlen;
665 --flen;
666
667 // One space in the good word may stand for several spaces in the
668 // checked word.
669 if (c == ' ') {
670 for (;; ) {
671 if (flen <= 0 && *mip->mi_fend != NUL)
672 flen = fold_more(mip);
673 if (ptr[wlen] != ' ' && ptr[wlen] != TAB)
674 break;
675 ++wlen;
676 --flen;
677 }
678 }
679 }
680
681 char_u *p;
682 bool word_ends;
683
684 // Verify that one of the possible endings is valid. Try the longest
685 // first.
686 while (endidxcnt > 0) {
687 --endidxcnt;
688 arridx = endidx[endidxcnt];
689 wlen = endlen[endidxcnt];
690
691 if (utf_head_off(ptr, ptr + wlen) > 0) {
692 continue; // not at first byte of character
693 }
694 if (spell_iswordp(ptr + wlen, mip->mi_win)) {
695 if (slang->sl_compprog == NULL && !slang->sl_nobreak)
696 continue; // next char is a word character
697 word_ends = false;
698 } else
699 word_ends = true;
700 // The prefix flag is before compound flags. Once a valid prefix flag
701 // has been found we try compound flags.
702 bool prefix_found = false;
703
704 if (mode != FIND_KEEPWORD && has_mbyte) {
705 // Compute byte length in original word, length may change
706 // when folding case. This can be slow, take a shortcut when the
707 // case-folded word is equal to the keep-case word.
708 p = mip->mi_word;
709 if (STRNCMP(ptr, p, wlen) != 0) {
710 for (char_u *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) {
711 MB_PTR_ADV(p);
712 }
713 wlen = (int)(p - mip->mi_word);
714 }
715 }
716
717 // Check flags and region. For FIND_PREFIX check the condition and
718 // prefix ID.
719 // Repeat this if there are more flags/region alternatives until there
720 // is a match.
721 for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0;
722 --len, ++arridx) {
723 uint32_t flags = idxs[arridx];
724
725 // For the fold-case tree check that the case of the checked word
726 // matches with what the word in the tree requires.
727 // For keep-case tree the case is always right. For prefixes we
728 // don't bother to check.
729 if (mode == FIND_FOLDWORD) {
730 if (mip->mi_cend != mip->mi_word + wlen) {
731 // mi_capflags was set for a different word length, need
732 // to do it again.
733 mip->mi_cend = mip->mi_word + wlen;
734 mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
735 }
736
737 if (mip->mi_capflags == WF_KEEPCAP
738 || !spell_valid_case(mip->mi_capflags, flags))
739 continue;
740 }
741 // When mode is FIND_PREFIX the word must support the prefix:
742 // check the prefix ID and the condition. Do that for the list at
743 // mip->mi_prefarridx that find_prefix() filled.
744 else if (mode == FIND_PREFIX && !prefix_found) {
745 c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx,
746 flags,
747 mip->mi_word + mip->mi_cprefixlen, slang,
748 false);
749 if (c == 0)
750 continue;
751
752 // Use the WF_RARE flag for a rare prefix.
753 if (c & WF_RAREPFX)
754 flags |= WF_RARE;
755 prefix_found = true;
756 }
757
758 if (slang->sl_nobreak) {
759 if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND)
760 && (flags & WF_BANNED) == 0) {
761 // NOBREAK: found a valid following word. That's all we
762 // need to know, so return.
763 mip->mi_result = SP_OK;
764 break;
765 }
766 } else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND
767 || !word_ends)) {
768 // If there is no compound flag or the word is shorter than
769 // COMPOUNDMIN reject it quickly.
770 // Makes you wonder why someone puts a compound flag on a word
771 // that's too short... Myspell compatibility requires this
772 // anyway.
773 if (((unsigned)flags >> 24) == 0
774 || wlen - mip->mi_compoff < slang->sl_compminlen)
775 continue;
776 // For multi-byte chars check character length against
777 // COMPOUNDMIN.
778 if (has_mbyte
779 && slang->sl_compminlen > 0
780 && mb_charlen_len(mip->mi_word + mip->mi_compoff,
781 wlen - mip->mi_compoff) < slang->sl_compminlen)
782 continue;
783
784 // Limit the number of compound words to COMPOUNDWORDMAX if no
785 // maximum for syllables is specified.
786 if (!word_ends && mip->mi_complen + mip->mi_compextra + 2
787 > slang->sl_compmax
788 && slang->sl_compsylmax == MAXWLEN)
789 continue;
790
791 // Don't allow compounding on a side where an affix was added,
792 // unless COMPOUNDPERMITFLAG was used.
793 if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF))
794 continue;
795 if (!word_ends && (flags & WF_NOCOMPAFT))
796 continue;
797
798 // Quickly check if compounding is possible with this flag.
799 if (!byte_in_str(mip->mi_complen == 0
800 ? slang->sl_compstartflags
801 : slang->sl_compallflags,
802 ((unsigned)flags >> 24)))
803 continue;
804
805 // If there is a match with a CHECKCOMPOUNDPATTERN rule
806 // discard the compound word.
807 if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat))
808 continue;
809
810 if (mode == FIND_COMPOUND) {
811 int capflags;
812
813 // Need to check the caps type of the appended compound
814 // word.
815 if (has_mbyte && STRNCMP(ptr, mip->mi_word,
816 mip->mi_compoff) != 0) {
817 // case folding may have changed the length
818 p = mip->mi_word;
819 for (char_u *s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) {
820 MB_PTR_ADV(p);
821 }
822 } else {
823 p = mip->mi_word + mip->mi_compoff;
824 }
825 capflags = captype(p, mip->mi_word + wlen);
826 if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP
827 && (flags & WF_FIXCAP) != 0))
828 continue;
829
830 if (capflags != WF_ALLCAP) {
831 // When the character before the word is a word
832 // character we do not accept a Onecap word. We do
833 // accept a no-caps word, even when the dictionary
834 // word specifies ONECAP.
835 MB_PTR_BACK(mip->mi_word, p);
836 if (spell_iswordp_nmw(p, mip->mi_win)
837 ? capflags == WF_ONECAP
838 : (flags & WF_ONECAP) != 0
839 && capflags != WF_ONECAP) {
840 continue;
841 }
842 }
843 }
844
845 // If the word ends the sequence of compound flags of the
846 // words must match with one of the COMPOUNDRULE items and
847 // the number of syllables must not be too large.
848 mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24);
849 mip->mi_compflags[mip->mi_complen + 1] = NUL;
850 if (word_ends) {
851 char_u fword[MAXWLEN];
852
853 if (slang->sl_compsylmax < MAXWLEN) {
854 // "fword" is only needed for checking syllables.
855 if (ptr == mip->mi_word)
856 (void)spell_casefold(ptr, wlen, fword, MAXWLEN);
857 else
858 STRLCPY(fword, ptr, endlen[endidxcnt] + 1);
859 }
860 if (!can_compound(slang, fword, mip->mi_compflags))
861 continue;
862 } else if (slang->sl_comprules != NULL
863 && !match_compoundrule(slang, mip->mi_compflags))
864 // The compound flags collected so far do not match any
865 // COMPOUNDRULE, discard the compounded word.
866 continue;
867 }
868 // Check NEEDCOMPOUND: can't use word without compounding.
869 else if (flags & WF_NEEDCOMP)
870 continue;
871
872 int nobreak_result = SP_OK;
873
874 if (!word_ends) {
875 int save_result = mip->mi_result;
876 char_u *save_end = mip->mi_end;
877 langp_T *save_lp = mip->mi_lp;
878
879 // Check that a valid word follows. If there is one and we
880 // are compounding, it will set "mi_result", thus we are
881 // always finished here. For NOBREAK we only check that a
882 // valid word follows.
883 // Recursive!
884 if (slang->sl_nobreak)
885 mip->mi_result = SP_BAD;
886
887 // Find following word in case-folded tree.
888 mip->mi_compoff = endlen[endidxcnt];
889 if (has_mbyte && mode == FIND_KEEPWORD) {
890 // Compute byte length in case-folded word from "wlen":
891 // byte length in keep-case word. Length may change when
892 // folding case. This can be slow, take a shortcut when
893 // the case-folded word is equal to the keep-case word.
894 p = mip->mi_fword;
895 if (STRNCMP(ptr, p, wlen) != 0) {
896 for (char_u *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) {
897 MB_PTR_ADV(p);
898 }
899 mip->mi_compoff = (int)(p - mip->mi_fword);
900 }
901 }
902#if 0
903 c = mip->mi_compoff;
904#endif
905 ++mip->mi_complen;
906 if (flags & WF_COMPROOT)
907 ++mip->mi_compextra;
908
909 // For NOBREAK we need to try all NOBREAK languages, at least
910 // to find the ".add" file(s).
911 for (int lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi) {
912 if (slang->sl_nobreak) {
913 mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi);
914 if (mip->mi_lp->lp_slang->sl_fidxs == NULL
915 || !mip->mi_lp->lp_slang->sl_nobreak)
916 continue;
917 }
918
919 find_word(mip, FIND_COMPOUND);
920
921 // When NOBREAK any word that matches is OK. Otherwise we
922 // need to find the longest match, thus try with keep-case
923 // and prefix too.
924 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) {
925 // Find following word in keep-case tree.
926 mip->mi_compoff = wlen;
927 find_word(mip, FIND_KEEPCOMPOUND);
928
929#if 0 // Disabled, a prefix must not appear halfway through a compound
930 // word, unless the COMPOUNDPERMITFLAG is used, in which case it
931 // can't be a postponed prefix.
932 if (!slang->sl_nobreak || mip->mi_result == SP_BAD) {
933 // Check for following word with prefix.
934 mip->mi_compoff = c;
935 find_prefix(mip, FIND_COMPOUND);
936 }
937#endif
938 }
939
940 if (!slang->sl_nobreak)
941 break;
942 }
943 --mip->mi_complen;
944 if (flags & WF_COMPROOT)
945 --mip->mi_compextra;
946 mip->mi_lp = save_lp;
947
948 if (slang->sl_nobreak) {
949 nobreak_result = mip->mi_result;
950 mip->mi_result = save_result;
951 mip->mi_end = save_end;
952 } else {
953 if (mip->mi_result == SP_OK)
954 break;
955 continue;
956 }
957 }
958
959 int res = SP_BAD;
960 if (flags & WF_BANNED)
961 res = SP_BANNED;
962 else if (flags & WF_REGION) {
963 // Check region.
964 if ((mip->mi_lp->lp_region & (flags >> 16)) != 0)
965 res = SP_OK;
966 else
967 res = SP_LOCAL;
968 } else if (flags & WF_RARE)
969 res = SP_RARE;
970 else
971 res = SP_OK;
972
973 // Always use the longest match and the best result. For NOBREAK
974 // we separately keep the longest match without a following good
975 // word as a fall-back.
976 if (nobreak_result == SP_BAD) {
977 if (mip->mi_result2 > res) {
978 mip->mi_result2 = res;
979 mip->mi_end2 = mip->mi_word + wlen;
980 } else if (mip->mi_result2 == res
981 && mip->mi_end2 < mip->mi_word + wlen)
982 mip->mi_end2 = mip->mi_word + wlen;
983 } else if (mip->mi_result > res) {
984 mip->mi_result = res;
985 mip->mi_end = mip->mi_word + wlen;
986 } else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen)
987 mip->mi_end = mip->mi_word + wlen;
988
989 if (mip->mi_result == SP_OK)
990 break;
991 }
992
993 if (mip->mi_result == SP_OK)
994 break;
995 }
996}
997
998// Returns true if there is a match between the word ptr[wlen] and
999// CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another
1000// word.
1001// A match means that the first part of CHECKCOMPOUNDPATTERN matches at the
1002// end of ptr[wlen] and the second part matches after it.
1003static bool
1004match_checkcompoundpattern (
1005 char_u *ptr,
1006 int wlen,
1007 garray_T *gap // &sl_comppat
1008)
1009{
1010 char_u *p;
1011 int len;
1012
1013 for (int i = 0; i + 1 < gap->ga_len; i += 2) {
1014 p = ((char_u **)gap->ga_data)[i + 1];
1015 if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0) {
1016 // Second part matches at start of following compound word, now
1017 // check if first part matches at end of previous word.
1018 p = ((char_u **)gap->ga_data)[i];
1019 len = (int)STRLEN(p);
1020 if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0)
1021 return true;
1022 }
1023 }
1024 return false;
1025}
1026
1027// Returns true if "flags" is a valid sequence of compound flags and "word"
1028// does not have too many syllables.
1029static bool can_compound(slang_T *slang, char_u *word, char_u *flags)
1030{
1031 char_u uflags[MAXWLEN * 2];
1032 int i;
1033 char_u *p;
1034
1035 if (slang->sl_compprog == NULL)
1036 return false;
1037 if (enc_utf8) {
1038 // Need to convert the single byte flags to utf8 characters.
1039 p = uflags;
1040 for (i = 0; flags[i] != NUL; i++) {
1041 p += utf_char2bytes(flags[i], p);
1042 }
1043 *p = NUL;
1044 p = uflags;
1045 } else
1046 p = flags;
1047 if (!vim_regexec_prog(&slang->sl_compprog, false, p, 0))
1048 return false;
1049
1050 // Count the number of syllables. This may be slow, do it last. If there
1051 // are too many syllables AND the number of compound words is above
1052 // COMPOUNDWORDMAX then compounding is not allowed.
1053 if (slang->sl_compsylmax < MAXWLEN
1054 && count_syllables(slang, word) > slang->sl_compsylmax)
1055 return (int)STRLEN(flags) < slang->sl_compmax;
1056 return true;
1057}
1058
1059// Returns true when the sequence of flags in "compflags" plus "flag" can
1060// possibly form a valid compounded word. This also checks the COMPOUNDRULE
1061// lines if they don't contain wildcards.
1062static bool can_be_compound(trystate_T *sp, slang_T *slang, char_u *compflags, int flag)
1063{
1064 // If the flag doesn't appear in sl_compstartflags or sl_compallflags
1065 // then it can't possibly compound.
1066 if (!byte_in_str(sp->ts_complen == sp->ts_compsplit
1067 ? slang->sl_compstartflags : slang->sl_compallflags, flag))
1068 return false;
1069
1070 // If there are no wildcards, we can check if the flags collected so far
1071 // possibly can form a match with COMPOUNDRULE patterns. This only
1072 // makes sense when we have two or more words.
1073 if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) {
1074 compflags[sp->ts_complen] = flag;
1075 compflags[sp->ts_complen + 1] = NUL;
1076 bool v = match_compoundrule(slang, compflags + sp->ts_compsplit);
1077 compflags[sp->ts_complen] = NUL;
1078 return v;
1079 }
1080
1081 return true;
1082}
1083
1084// Returns true if the compound flags in compflags[] match the start of any
1085// compound rule. This is used to stop trying a compound if the flags
1086// collected so far can't possibly match any compound rule.
1087// Caller must check that slang->sl_comprules is not NULL.
1088static bool match_compoundrule(slang_T *slang, char_u *compflags)
1089{
1090 char_u *p;
1091 int i;
1092 int c;
1093
1094 // loop over all the COMPOUNDRULE entries
1095 for (p = slang->sl_comprules; *p != NUL; ++p) {
1096 // loop over the flags in the compound word we have made, match
1097 // them against the current rule entry
1098 for (i = 0;; ++i) {
1099 c = compflags[i];
1100 if (c == NUL)
1101 // found a rule that matches for the flags we have so far
1102 return true;
1103 if (*p == '/' || *p == NUL)
1104 break; // end of rule, it's too short
1105 if (*p == '[') {
1106 bool match = false;
1107
1108 // compare against all the flags in []
1109 ++p;
1110 while (*p != ']' && *p != NUL)
1111 if (*p++ == c)
1112 match = true;
1113 if (!match)
1114 break; // none matches
1115 } else if (*p != c)
1116 break; // flag of word doesn't match flag in pattern
1117 ++p;
1118 }
1119
1120 // Skip to the next "/", where the next pattern starts.
1121 p = vim_strchr(p, '/');
1122 if (p == NULL)
1123 break;
1124 }
1125
1126 // Checked all the rules and none of them match the flags, so there
1127 // can't possibly be a compound starting with these flags.
1128 return false;
1129}
1130
1131// Return non-zero if the prefix indicated by "arridx" matches with the prefix
1132// ID in "flags" for the word "word".
1133// The WF_RAREPFX flag is included in the return value for a rare prefix.
1134static int
1135valid_word_prefix (
1136 int totprefcnt, // nr of prefix IDs
1137 int arridx, // idx in sl_pidxs[]
1138 int flags,
1139 char_u *word,
1140 slang_T *slang,
1141 bool cond_req // only use prefixes with a condition
1142)
1143{
1144 int prefcnt;
1145 int pidx;
1146 int prefid;
1147
1148 prefid = (unsigned)flags >> 24;
1149 for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt) {
1150 pidx = slang->sl_pidxs[arridx + prefcnt];
1151
1152 // Check the prefix ID.
1153 if (prefid != (pidx & 0xff))
1154 continue;
1155
1156 // Check if the prefix doesn't combine and the word already has a
1157 // suffix.
1158 if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC))
1159 continue;
1160
1161 // Check the condition, if there is one. The condition index is
1162 // stored in the two bytes above the prefix ID byte.
1163 regprog_T **rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff];
1164 if (*rp != NULL) {
1165 if (!vim_regexec_prog(rp, false, word, 0)) {
1166 continue;
1167 }
1168 } else if (cond_req)
1169 continue;
1170
1171 // It's a match! Return the WF_ flags.
1172 return pidx;
1173 }
1174 return 0;
1175}
1176
1177// Check if the word at "mip->mi_word" has a matching prefix.
1178// If it does, then check the following word.
1179//
1180// If "mode" is "FIND_COMPOUND" then do the same after another word, find a
1181// prefix in a compound word.
1182//
1183// For a match mip->mi_result is updated.
1184static void find_prefix(matchinf_T *mip, int mode)
1185{
1186 idx_T arridx = 0;
1187 int len;
1188 int wlen = 0;
1189 int flen;
1190 int c;
1191 char_u *ptr;
1192 idx_T lo, hi, m;
1193 slang_T *slang = mip->mi_lp->lp_slang;
1194 char_u *byts;
1195 idx_T *idxs;
1196
1197 byts = slang->sl_pbyts;
1198 if (byts == NULL)
1199 return; // array is empty
1200
1201 // We use the case-folded word here, since prefixes are always
1202 // case-folded.
1203 ptr = mip->mi_fword;
1204 flen = mip->mi_fwordlen; // available case-folded bytes
1205 if (mode == FIND_COMPOUND) {
1206 // Skip over the previously found word(s).
1207 ptr += mip->mi_compoff;
1208 flen -= mip->mi_compoff;
1209 }
1210 idxs = slang->sl_pidxs;
1211
1212 // Repeat advancing in the tree until:
1213 // - there is a byte that doesn't match,
1214 // - we reach the end of the tree,
1215 // - or we reach the end of the line.
1216 for (;; ) {
1217 if (flen == 0 && *mip->mi_fend != NUL)
1218 flen = fold_more(mip);
1219
1220 len = byts[arridx++];
1221
1222 // If the first possible byte is a zero the prefix could end here.
1223 // Check if the following word matches and supports the prefix.
1224 if (byts[arridx] == 0) {
1225 // There can be several prefixes with different conditions. We
1226 // try them all, since we don't know which one will give the
1227 // longest match. The word is the same each time, pass the list
1228 // of possible prefixes to find_word().
1229 mip->mi_prefarridx = arridx;
1230 mip->mi_prefcnt = len;
1231 while (len > 0 && byts[arridx] == 0) {
1232 ++arridx;
1233 --len;
1234 }
1235 mip->mi_prefcnt -= len;
1236
1237 // Find the word that comes after the prefix.
1238 mip->mi_prefixlen = wlen;
1239 if (mode == FIND_COMPOUND)
1240 // Skip over the previously found word(s).
1241 mip->mi_prefixlen += mip->mi_compoff;
1242
1243 if (has_mbyte) {
1244 // Case-folded length may differ from original length.
1245 mip->mi_cprefixlen = nofold_len(mip->mi_fword,
1246 mip->mi_prefixlen, mip->mi_word);
1247 } else
1248 mip->mi_cprefixlen = mip->mi_prefixlen;
1249 find_word(mip, FIND_PREFIX);
1250
1251
1252 if (len == 0)
1253 break; // no children, word must end here
1254 }
1255
1256 // Stop looking at end of the line.
1257 if (ptr[wlen] == NUL)
1258 break;
1259
1260 // Perform a binary search in the list of accepted bytes.
1261 c = ptr[wlen];
1262 lo = arridx;
1263 hi = arridx + len - 1;
1264 while (lo < hi) {
1265 m = (lo + hi) / 2;
1266 if (byts[m] > c)
1267 hi = m - 1;
1268 else if (byts[m] < c)
1269 lo = m + 1;
1270 else {
1271 lo = hi = m;
1272 break;
1273 }
1274 }
1275
1276 // Stop if there is no matching byte.
1277 if (hi < lo || byts[lo] != c)
1278 break;
1279
1280 // Continue at the child (if there is one).
1281 arridx = idxs[lo];
1282 ++wlen;
1283 --flen;
1284 }
1285}
1286
1287// Need to fold at least one more character. Do until next non-word character
1288// for efficiency. Include the non-word character too.
1289// Return the length of the folded chars in bytes.
1290static int fold_more(matchinf_T *mip)
1291{
1292 int flen;
1293 char_u *p;
1294
1295 p = mip->mi_fend;
1296 do {
1297 MB_PTR_ADV(mip->mi_fend);
1298 } while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win));
1299
1300 // Include the non-word character so that we can check for the word end.
1301 if (*mip->mi_fend != NUL) {
1302 MB_PTR_ADV(mip->mi_fend);
1303 }
1304
1305 (void)spell_casefold(p, (int)(mip->mi_fend - p),
1306 mip->mi_fword + mip->mi_fwordlen,
1307 MAXWLEN - mip->mi_fwordlen);
1308 flen = (int)STRLEN(mip->mi_fword + mip->mi_fwordlen);
1309 mip->mi_fwordlen += flen;
1310 return flen;
1311}
1312
1313/// Checks case flags for a word. Returns true, if the word has the requested
1314/// case.
1315///
1316/// @param wordflags Flags for the checked word.
1317/// @param treeflags Flags for the word in the spell tree.
1318static bool spell_valid_case(int wordflags, int treeflags)
1319{
1320 return (wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0)
1321 || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0
1322 && ((treeflags & WF_ONECAP) == 0
1323 || (wordflags & WF_ONECAP) != 0));
1324}
1325
1326// Returns true if spell checking is not enabled.
1327static bool no_spell_checking(win_T *wp)
1328{
1329 if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL
1330 || GA_EMPTY(&wp->w_s->b_langp)) {
1331 EMSG(_("E756: Spell checking is not enabled"));
1332 return true;
1333 }
1334 return false;
1335}
1336
1337// Moves to the next spell error.
1338// "curline" is false for "[s", "]s", "[S" and "]S".
1339// "curline" is true to find word under/after cursor in the same line.
1340// For Insert mode completion "dir" is BACKWARD and "curline" is true: move
1341// to after badly spelled word before the cursor.
1342// Return 0 if not found, length of the badly spelled word otherwise.
1343size_t
1344spell_move_to (
1345 win_T *wp,
1346 int dir, // FORWARD or BACKWARD
1347 bool allwords, // true for "[s"/"]s", false for "[S"/"]S"
1348 bool curline,
1349 hlf_T *attrp // return: attributes of bad word or NULL
1350 // (only when "dir" is FORWARD)
1351)
1352{
1353 linenr_T lnum;
1354 pos_T found_pos;
1355 size_t found_len = 0;
1356 char_u *line;
1357 char_u *p;
1358 char_u *endp;
1359 hlf_T attr = HLF_COUNT;
1360 size_t len;
1361 int has_syntax = syntax_present(wp);
1362 int col;
1363 bool can_spell;
1364 char_u *buf = NULL;
1365 size_t buflen = 0;
1366 int skip = 0;
1367 int capcol = -1;
1368 bool found_one = false;
1369 bool wrapped = false;
1370
1371 if (no_spell_checking(wp))
1372 return 0;
1373
1374 // Start looking for bad word at the start of the line, because we can't
1375 // start halfway through a word, we don't know where it starts or ends.
1376 //
1377 // When searching backwards, we continue in the line to find the last
1378 // bad word (in the cursor line: before the cursor).
1379 //
1380 // We concatenate the start of the next line, so that wrapped words work
1381 // (e.g. "et<line-break>cetera"). Doesn't work when searching backwards
1382 // though...
1383 lnum = wp->w_cursor.lnum;
1384 clearpos(&found_pos);
1385
1386 while (!got_int) {
1387 line = ml_get_buf(wp->w_buffer, lnum, FALSE);
1388
1389 len = STRLEN(line);
1390 if (buflen < len + MAXWLEN + 2) {
1391 xfree(buf);
1392 buflen = len + MAXWLEN + 2;
1393 buf = xmalloc(buflen);
1394 }
1395 assert(buf && buflen >= len + MAXWLEN + 2);
1396
1397 // In first line check first word for Capital.
1398 if (lnum == 1)
1399 capcol = 0;
1400
1401 // For checking first word with a capital skip white space.
1402 if (capcol == 0) {
1403 capcol = (int)getwhitecols(line);
1404 } else if (curline && wp == curwin) {
1405 // For spellbadword(): check if first word needs a capital.
1406 col = (int)getwhitecols(line);
1407 if (check_need_cap(lnum, col)) {
1408 capcol = col;
1409 }
1410
1411 // Need to get the line again, may have looked at the previous
1412 // one.
1413 line = ml_get_buf(wp->w_buffer, lnum, FALSE);
1414 }
1415
1416 // Copy the line into "buf" and append the start of the next line if
1417 // possible.
1418 STRCPY(buf, line);
1419 if (lnum < wp->w_buffer->b_ml.ml_line_count)
1420 spell_cat_line(buf + STRLEN(buf),
1421 ml_get_buf(wp->w_buffer, lnum + 1, FALSE),
1422 MAXWLEN);
1423 p = buf + skip;
1424 endp = buf + len;
1425 while (p < endp) {
1426 // When searching backward don't search after the cursor. Unless
1427 // we wrapped around the end of the buffer.
1428 if (dir == BACKWARD
1429 && lnum == wp->w_cursor.lnum
1430 && !wrapped
1431 && (colnr_T)(p - buf) >= wp->w_cursor.col)
1432 break;
1433
1434 // start of word
1435 attr = HLF_COUNT;
1436 len = spell_check(wp, p, &attr, &capcol, false);
1437
1438 if (attr != HLF_COUNT) {
1439 // We found a bad word. Check the attribute.
1440 if (allwords || attr == HLF_SPB) {
1441 // When searching forward only accept a bad word after
1442 // the cursor.
1443 if (dir == BACKWARD
1444 || lnum != wp->w_cursor.lnum
1445 || wrapped
1446 || ((colnr_T)(curline
1447 ? p - buf + (ptrdiff_t)len
1448 : p - buf) > wp->w_cursor.col)) {
1449 if (has_syntax) {
1450 col = (int)(p - buf);
1451 (void)syn_get_id(wp, lnum, (colnr_T)col,
1452 FALSE, &can_spell, FALSE);
1453 if (!can_spell)
1454 attr = HLF_COUNT;
1455 } else
1456 can_spell = true;
1457
1458 if (can_spell) {
1459 found_one = true;
1460 found_pos.lnum = lnum;
1461 found_pos.col = (int)(p - buf);
1462 found_pos.coladd = 0;
1463 if (dir == FORWARD) {
1464 // No need to search further.
1465 wp->w_cursor = found_pos;
1466 xfree(buf);
1467 if (attrp != NULL)
1468 *attrp = attr;
1469 return len;
1470 } else if (curline) {
1471 // Insert mode completion: put cursor after
1472 // the bad word.
1473 assert(len <= INT_MAX);
1474 found_pos.col += (int)len;
1475 }
1476 found_len = len;
1477 }
1478 } else
1479 found_one = true;
1480 }
1481 }
1482
1483 // advance to character after the word
1484 p += len;
1485 assert(len <= INT_MAX);
1486 capcol -= (int)len;
1487 }
1488
1489 if (dir == BACKWARD && found_pos.lnum != 0) {
1490 // Use the last match in the line (before the cursor).
1491 wp->w_cursor = found_pos;
1492 xfree(buf);
1493 return found_len;
1494 }
1495
1496 if (curline) {
1497 break; // only check cursor line
1498 }
1499
1500 // If we are back at the starting line and searched it again there
1501 // is no match, give up.
1502 if (lnum == wp->w_cursor.lnum && wrapped) {
1503 break;
1504 }
1505
1506 // Advance to next line.
1507 if (dir == BACKWARD) {
1508 if (lnum > 1) {
1509 lnum--;
1510 } else if (!p_ws) {
1511 break; // at first line and 'nowrapscan'
1512 } else {
1513 // Wrap around to the end of the buffer. May search the
1514 // starting line again and accept the last match.
1515 lnum = wp->w_buffer->b_ml.ml_line_count;
1516 wrapped = true;
1517 if (!shortmess(SHM_SEARCH))
1518 give_warning((char_u *)_(top_bot_msg), true);
1519 }
1520 capcol = -1;
1521 } else {
1522 if (lnum < wp->w_buffer->b_ml.ml_line_count)
1523 ++lnum;
1524 else if (!p_ws)
1525 break; // at first line and 'nowrapscan'
1526 else {
1527 // Wrap around to the start of the buffer. May search the
1528 // starting line again and accept the first match.
1529 lnum = 1;
1530 wrapped = true;
1531 if (!shortmess(SHM_SEARCH))
1532 give_warning((char_u *)_(bot_top_msg), true);
1533 }
1534
1535 // If we are back at the starting line and there is no match then
1536 // give up.
1537 if (lnum == wp->w_cursor.lnum && !found_one) {
1538 break;
1539 }
1540
1541 // Skip the characters at the start of the next line that were
1542 // included in a match crossing line boundaries.
1543 if (attr == HLF_COUNT)
1544 skip = (int)(p - endp);
1545 else
1546 skip = 0;
1547
1548 // Capcol skips over the inserted space.
1549 --capcol;
1550
1551 // But after empty line check first word in next line
1552 if (*skipwhite(line) == NUL)
1553 capcol = 0;
1554 }
1555
1556 line_breakcheck();
1557 }
1558
1559 xfree(buf);
1560 return 0;
1561}
1562
1563// For spell checking: concatenate the start of the following line "line" into
1564// "buf", blanking-out special characters. Copy less then "maxlen" bytes.
1565// Keep the blanks at the start of the next line, this is used in win_line()
1566// to skip those bytes if the word was OK.
1567void spell_cat_line(char_u *buf, char_u *line, int maxlen)
1568{
1569 char_u *p;
1570 int n;
1571
1572 p = skipwhite(line);
1573 while (vim_strchr((char_u *)"*#/\"\t", *p) != NULL)
1574 p = skipwhite(p + 1);
1575
1576 if (*p != NUL) {
1577 // Only worth concatenating if there is something else than spaces to
1578 // concatenate.
1579 n = (int)(p - line) + 1;
1580 if (n < maxlen - 1) {
1581 memset(buf, ' ', n);
1582 STRLCPY(buf + n, p, maxlen - n);
1583 }
1584 }
1585}
1586
1587// Load word list(s) for "lang" from Vim spell file(s).
1588// "lang" must be the language without the region: e.g., "en".
1589static void spell_load_lang(char_u *lang)
1590{
1591 char_u fname_enc[85];
1592 int r;
1593 spelload_T sl;
1594 int round;
1595
1596 // Copy the language name to pass it to spell_load_cb() as a cookie.
1597 // It's truncated when an error is detected.
1598 STRCPY(sl.sl_lang, lang);
1599 sl.sl_slang = NULL;
1600 sl.sl_nobreak = false;
1601
1602 // We may retry when no spell file is found for the language, an
1603 // autocommand may load it then.
1604 for (round = 1; round <= 2; ++round) {
1605 // Find the first spell file for "lang" in 'runtimepath' and load it.
1606 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
1607 "spell/%s.%s.spl", lang, spell_enc());
1608 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl);
1609
1610 if (r == FAIL && *sl.sl_lang != NUL) {
1611 // Try loading the ASCII version.
1612 vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
1613 "spell/%s.ascii.spl", lang);
1614 r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl);
1615
1616 if (r == FAIL && *sl.sl_lang != NUL && round == 1
1617 && apply_autocmds(EVENT_SPELLFILEMISSING, lang,
1618 curbuf->b_fname, FALSE, curbuf))
1619 continue;
1620 break;
1621 }
1622 break;
1623 }
1624
1625 if (r == FAIL) {
1626 if (starting) {
1627 // Prompt the user at VimEnter if spell files are missing. #3027
1628 // Plugins aren't loaded yet, so spellfile.vim cannot handle this case.
1629 char autocmd_buf[512] = { 0 };
1630 snprintf(autocmd_buf, sizeof(autocmd_buf),
1631 "autocmd VimEnter * call spellfile#LoadFile('%s')|set spell",
1632 lang);
1633 do_cmdline_cmd(autocmd_buf);
1634 } else {
1635 smsg(
1636 _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""),
1637 lang, spell_enc(), lang);
1638 }
1639 } else if (sl.sl_slang != NULL) {
1640 // At least one file was loaded, now load ALL the additions.
1641 STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl");
1642 do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl);
1643 }
1644}
1645
1646// Return the encoding used for spell checking: Use 'encoding', except that we
1647// use "latin1" for "latin9". And limit to 60 characters (just in case).
1648char_u *spell_enc(void)
1649{
1650
1651 if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
1652 return p_enc;
1653 return (char_u *)"latin1";
1654}
1655
1656// Get the name of the .spl file for the internal wordlist into
1657// "fname[MAXPATHL]".
1658static void int_wordlist_spl(char_u *fname)
1659{
1660 vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL,
1661 int_wordlist, spell_enc());
1662}
1663
1664// Allocate a new slang_T for language "lang". "lang" can be NULL.
1665// Caller must fill "sl_next".
1666slang_T *slang_alloc(char_u *lang)
1667{
1668 slang_T *lp = xcalloc(1, sizeof(slang_T));
1669
1670 if (lang != NULL)
1671 lp->sl_name = vim_strsave(lang);
1672 ga_init(&lp->sl_rep, sizeof(fromto_T), 10);
1673 ga_init(&lp->sl_repsal, sizeof(fromto_T), 10);
1674 lp->sl_compmax = MAXWLEN;
1675 lp->sl_compsylmax = MAXWLEN;
1676 hash_init(&lp->sl_wordcount);
1677
1678 return lp;
1679}
1680
1681// Free the contents of an slang_T and the structure itself.
1682void slang_free(slang_T *lp)
1683{
1684 xfree(lp->sl_name);
1685 xfree(lp->sl_fname);
1686 slang_clear(lp);
1687 xfree(lp);
1688}
1689
1690/// Frees a salitem_T
1691static void free_salitem(salitem_T *smp) {
1692 xfree(smp->sm_lead);
1693 // Don't free sm_oneof and sm_rules, they point into sm_lead.
1694 xfree(smp->sm_to);
1695 xfree(smp->sm_lead_w);
1696 xfree(smp->sm_oneof_w);
1697 xfree(smp->sm_to_w);
1698}
1699
1700/// Frees a fromto_T
1701static void free_fromto(fromto_T *ftp) {
1702 xfree(ftp->ft_from);
1703 xfree(ftp->ft_to);
1704}
1705
1706// Clear an slang_T so that the file can be reloaded.
1707void slang_clear(slang_T *lp)
1708{
1709 garray_T *gap;
1710
1711 XFREE_CLEAR(lp->sl_fbyts);
1712 XFREE_CLEAR(lp->sl_kbyts);
1713 XFREE_CLEAR(lp->sl_pbyts);
1714
1715 XFREE_CLEAR(lp->sl_fidxs);
1716 XFREE_CLEAR(lp->sl_kidxs);
1717 XFREE_CLEAR(lp->sl_pidxs);
1718
1719 GA_DEEP_CLEAR(&lp->sl_rep, fromto_T, free_fromto);
1720 GA_DEEP_CLEAR(&lp->sl_repsal, fromto_T, free_fromto);
1721
1722 gap = &lp->sl_sal;
1723 if (lp->sl_sofo) {
1724 // "ga_len" is set to 1 without adding an item for latin1
1725 GA_DEEP_CLEAR_PTR(gap);
1726 } else {
1727 // SAL items: free salitem_T items
1728 GA_DEEP_CLEAR(gap, salitem_T, free_salitem);
1729 }
1730
1731 for (int i = 0; i < lp->sl_prefixcnt; ++i) {
1732 vim_regfree(lp->sl_prefprog[i]);
1733 }
1734 lp->sl_prefixcnt = 0;
1735 XFREE_CLEAR(lp->sl_prefprog);
1736 XFREE_CLEAR(lp->sl_info);
1737 XFREE_CLEAR(lp->sl_midword);
1738
1739 vim_regfree(lp->sl_compprog);
1740 lp->sl_compprog = NULL;
1741 XFREE_CLEAR(lp->sl_comprules);
1742 XFREE_CLEAR(lp->sl_compstartflags);
1743 XFREE_CLEAR(lp->sl_compallflags);
1744
1745 XFREE_CLEAR(lp->sl_syllable);
1746 ga_clear(&lp->sl_syl_items);
1747
1748 ga_clear_strings(&lp->sl_comppat);
1749
1750 hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF);
1751 hash_init(&lp->sl_wordcount);
1752
1753 hash_clear_all(&lp->sl_map_hash, 0);
1754
1755 // Clear info from .sug file.
1756 slang_clear_sug(lp);
1757
1758 lp->sl_compmax = MAXWLEN;
1759 lp->sl_compminlen = 0;
1760 lp->sl_compsylmax = MAXWLEN;
1761 lp->sl_regions[0] = NUL;
1762}
1763
1764// Clear the info from the .sug file in "lp".
1765void slang_clear_sug(slang_T *lp)
1766{
1767 XFREE_CLEAR(lp->sl_sbyts);
1768 XFREE_CLEAR(lp->sl_sidxs);
1769 close_spellbuf(lp->sl_sugbuf);
1770 lp->sl_sugbuf = NULL;
1771 lp->sl_sugloaded = false;
1772 lp->sl_sugtime = 0;
1773}
1774
1775// Load one spell file and store the info into a slang_T.
1776// Invoked through do_in_runtimepath().
1777static void spell_load_cb(char_u *fname, void *cookie)
1778{
1779 spelload_T *slp = (spelload_T *)cookie;
1780 slang_T *slang;
1781
1782 slang = spell_load_file(fname, slp->sl_lang, NULL, false);
1783 if (slang != NULL) {
1784 // When a previously loaded file has NOBREAK also use it for the
1785 // ".add" files.
1786 if (slp->sl_nobreak && slang->sl_add)
1787 slang->sl_nobreak = true;
1788 else if (slang->sl_nobreak)
1789 slp->sl_nobreak = true;
1790
1791 slp->sl_slang = slang;
1792 }
1793}
1794
1795/// Add a word to the hashtable of common words.
1796/// If it's already there then the counter is increased.
1797///
1798/// @param[in] lp
1799/// @param[in] word added to common words hashtable
1800/// @param[in] len length of word or -1 for NUL terminated
1801/// @param[in] count 1 to count once, 10 to init
1802void count_common_word(slang_T *lp, char_u *word, int len, int count)
1803{
1804 hash_T hash;
1805 hashitem_T *hi;
1806 wordcount_T *wc;
1807 char_u buf[MAXWLEN];
1808 char_u *p;
1809
1810 if (len == -1) {
1811 p = word;
1812 } else if (len >= MAXWLEN) {
1813 return;
1814 } else {
1815 STRLCPY(buf, word, len + 1);
1816 p = buf;
1817 }
1818
1819 hash = hash_hash(p);
1820 const size_t p_len = STRLEN(p);
1821 hi = hash_lookup(&lp->sl_wordcount, (const char *)p, p_len, hash);
1822 if (HASHITEM_EMPTY(hi)) {
1823 wc = xmalloc(sizeof(wordcount_T) + p_len);
1824 memcpy(wc->wc_word, p, p_len + 1);
1825 wc->wc_count = count;
1826 hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash);
1827 } else {
1828 wc = HI2WC(hi);
1829 if ((wc->wc_count += count) < (unsigned)count) // check for overflow
1830 wc->wc_count = MAXWORDCOUNT;
1831 }
1832}
1833
1834// Adjust the score of common words.
1835static int
1836score_wordcount_adj (
1837 slang_T *slang,
1838 int score,
1839 char_u *word,
1840 bool split // word was split, less bonus
1841)
1842{
1843 hashitem_T *hi;
1844 wordcount_T *wc;
1845 int bonus;
1846 int newscore;
1847
1848 hi = hash_find(&slang->sl_wordcount, word);
1849 if (!HASHITEM_EMPTY(hi)) {
1850 wc = HI2WC(hi);
1851 if (wc->wc_count < SCORE_THRES2)
1852 bonus = SCORE_COMMON1;
1853 else if (wc->wc_count < SCORE_THRES3)
1854 bonus = SCORE_COMMON2;
1855 else
1856 bonus = SCORE_COMMON3;
1857 if (split)
1858 newscore = score - bonus / 2;
1859 else
1860 newscore = score - bonus;
1861 if (newscore < 0)
1862 return 0;
1863 return newscore;
1864 }
1865 return score;
1866}
1867
1868// Returns true if byte "n" appears in "str".
1869// Like strchr() but independent of locale.
1870bool byte_in_str(char_u *str, int n)
1871{
1872 char_u *p;
1873
1874 for (p = str; *p != NUL; ++p)
1875 if (*p == n)
1876 return true;
1877 return false;
1878}
1879
1880// Truncate "slang->sl_syllable" at the first slash and put the following items
1881// in "slang->sl_syl_items".
1882int init_syl_tab(slang_T *slang)
1883{
1884 char_u *p;
1885 char_u *s;
1886 int l;
1887
1888 ga_init(&slang->sl_syl_items, sizeof(syl_item_T), 4);
1889 p = vim_strchr(slang->sl_syllable, '/');
1890 while (p != NULL) {
1891 *p++ = NUL;
1892 if (*p == NUL) // trailing slash
1893 break;
1894 s = p;
1895 p = vim_strchr(p, '/');
1896 if (p == NULL)
1897 l = (int)STRLEN(s);
1898 else
1899 l = (int)(p - s);
1900 if (l >= SY_MAXLEN)
1901 return SP_FORMERROR;
1902
1903 syl_item_T *syl = GA_APPEND_VIA_PTR(syl_item_T, &slang->sl_syl_items);
1904 STRLCPY(syl->sy_chars, s, l + 1);
1905 syl->sy_len = l;
1906 }
1907 return OK;
1908}
1909
1910// Count the number of syllables in "word".
1911// When "word" contains spaces the syllables after the last space are counted.
1912// Returns zero if syllables are not defines.
1913static int count_syllables(slang_T *slang, char_u *word)
1914{
1915 int cnt = 0;
1916 bool skip = false;
1917 char_u *p;
1918 int len;
1919 syl_item_T *syl;
1920 int c;
1921
1922 if (slang->sl_syllable == NULL)
1923 return 0;
1924
1925 for (p = word; *p != NUL; p += len) {
1926 // When running into a space reset counter.
1927 if (*p == ' ') {
1928 len = 1;
1929 cnt = 0;
1930 continue;
1931 }
1932
1933 // Find longest match of syllable items.
1934 len = 0;
1935 for (int i = 0; i < slang->sl_syl_items.ga_len; ++i) {
1936 syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i;
1937 if (syl->sy_len > len
1938 && STRNCMP(p, syl->sy_chars, syl->sy_len) == 0)
1939 len = syl->sy_len;
1940 }
1941 if (len != 0) { // found a match, count syllable
1942 ++cnt;
1943 skip = false;
1944 } else {
1945 // No recognized syllable item, at least a syllable char then?
1946 c = utf_ptr2char(p);
1947 len = (*mb_ptr2len)(p);
1948 if (vim_strchr(slang->sl_syllable, c) == NULL)
1949 skip = false; // No, search for next syllable
1950 else if (!skip) {
1951 ++cnt; // Yes, count it
1952 skip = true; // don't count following syllable chars
1953 }
1954 }
1955 }
1956 return cnt;
1957}
1958
1959// Parse 'spelllang' and set w_s->b_langp accordingly.
1960// Returns NULL if it's OK, an error message otherwise.
1961char_u *did_set_spelllang(win_T *wp)
1962{
1963 garray_T ga;
1964 char_u *splp;
1965 char_u *region;
1966 char_u region_cp[3];
1967 bool filename;
1968 int region_mask;
1969 slang_T *slang;
1970 int c;
1971 char_u lang[MAXWLEN + 1];
1972 char_u spf_name[MAXPATHL];
1973 int len;
1974 char_u *p;
1975 int round;
1976 char_u *spf;
1977 char_u *use_region = NULL;
1978 bool dont_use_region = false;
1979 bool nobreak = false;
1980 langp_T *lp, *lp2;
1981 static bool recursive = false;
1982 char_u *ret_msg = NULL;
1983 char_u *spl_copy;
1984
1985 bufref_T bufref;
1986 set_bufref(&bufref, wp->w_buffer);
1987
1988 // We don't want to do this recursively. May happen when a language is
1989 // not available and the SpellFileMissing autocommand opens a new buffer
1990 // in which 'spell' is set.
1991 if (recursive)
1992 return NULL;
1993 recursive = true;
1994
1995 ga_init(&ga, sizeof(langp_T), 2);
1996 clear_midword(wp);
1997
1998 // Make a copy of 'spelllang', the SpellFileMissing autocommands may change
1999 // it under our fingers.
2000 spl_copy = vim_strsave(wp->w_s->b_p_spl);
2001
2002 wp->w_s->b_cjk = 0;
2003
2004 // Loop over comma separated language names.
2005 for (splp = spl_copy; *splp != NUL; ) {
2006 // Get one language name.
2007 copy_option_part(&splp, lang, MAXWLEN, ",");
2008 region = NULL;
2009 len = (int)STRLEN(lang);
2010
2011 if (STRCMP(lang, "cjk") == 0) {
2012 wp->w_s->b_cjk = 1;
2013 continue;
2014 }
2015
2016 // If the name ends in ".spl" use it as the name of the spell file.
2017 // If there is a region name let "region" point to it and remove it
2018 // from the name.
2019 if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0) {
2020 filename = true;
2021
2022 // Locate a region and remove it from the file name.
2023 p = vim_strchr(path_tail(lang), '_');
2024 if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2])
2025 && !ASCII_ISALPHA(p[3])) {
2026 STRLCPY(region_cp, p + 1, 3);
2027 memmove(p, p + 3, len - (p - lang) - 2);
2028 region = region_cp;
2029 } else
2030 dont_use_region = true;
2031
2032 // Check if we loaded this language before.
2033 for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
2034 if (path_full_compare(lang, slang->sl_fname, false) == kEqualFiles) {
2035 break;
2036 }
2037 }
2038 } else {
2039 filename = false;
2040 if (len > 3 && lang[len - 3] == '_') {
2041 region = lang + len - 2;
2042 lang[len - 3] = NUL;
2043 } else
2044 dont_use_region = true;
2045
2046 // Check if we loaded this language before.
2047 for (slang = first_lang; slang != NULL; slang = slang->sl_next)
2048 if (STRICMP(lang, slang->sl_name) == 0)
2049 break;
2050 }
2051
2052 if (region != NULL) {
2053 // If the region differs from what was used before then don't
2054 // use it for 'spellfile'.
2055 if (use_region != NULL && STRCMP(region, use_region) != 0)
2056 dont_use_region = true;
2057 use_region = region;
2058 }
2059
2060 // If not found try loading the language now.
2061 if (slang == NULL) {
2062 if (filename)
2063 (void)spell_load_file(lang, lang, NULL, false);
2064 else {
2065 spell_load_lang(lang);
2066 // SpellFileMissing autocommands may do anything, including
2067 // destroying the buffer we are using...
2068 if (!bufref_valid(&bufref)) {
2069 ret_msg =
2070 (char_u *)N_("E797: SpellFileMissing autocommand deleted buffer");
2071 goto theend;
2072 }
2073 }
2074 }
2075
2076 // Loop over the languages, there can be several files for "lang".
2077 for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
2078 if (filename
2079 ? path_full_compare(lang, slang->sl_fname, false) == kEqualFiles
2080 : STRICMP(lang, slang->sl_name) == 0) {
2081 region_mask = REGION_ALL;
2082 if (!filename && region != NULL) {
2083 // find region in sl_regions
2084 c = find_region(slang->sl_regions, region);
2085 if (c == REGION_ALL) {
2086 if (slang->sl_add) {
2087 if (*slang->sl_regions != NUL)
2088 // This addition file is for other regions.
2089 region_mask = 0;
2090 } else
2091 // This is probably an error. Give a warning and
2092 // accept the words anyway.
2093 smsg(_("Warning: region %s not supported"),
2094 region);
2095 } else
2096 region_mask = 1 << c;
2097 }
2098
2099 if (region_mask != 0) {
2100 langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga);
2101 p_->lp_slang = slang;
2102 p_->lp_region = region_mask;
2103
2104 use_midword(slang, wp);
2105 if (slang->sl_nobreak)
2106 nobreak = true;
2107 }
2108 }
2109 }
2110 }
2111
2112 // round 0: load int_wordlist, if possible.
2113 // round 1: load first name in 'spellfile'.
2114 // round 2: load second name in 'spellfile.
2115 // etc.
2116 spf = curwin->w_s->b_p_spf;
2117 for (round = 0; round == 0 || *spf != NUL; ++round) {
2118 if (round == 0) {
2119 // Internal wordlist, if there is one.
2120 if (int_wordlist == NULL)
2121 continue;
2122 int_wordlist_spl(spf_name);
2123 } else {
2124 // One entry in 'spellfile'.
2125 copy_option_part(&spf, spf_name, MAXPATHL - 5, ",");
2126 STRCAT(spf_name, ".spl");
2127
2128 // If it was already found above then skip it.
2129 for (c = 0; c < ga.ga_len; ++c) {
2130 p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname;
2131 if (p != NULL
2132 && path_full_compare(spf_name, p, false) == kEqualFiles) {
2133 break;
2134 }
2135 }
2136 if (c < ga.ga_len)
2137 continue;
2138 }
2139
2140 // Check if it was loaded already.
2141 for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
2142 if (path_full_compare(spf_name, slang->sl_fname, false) == kEqualFiles) {
2143 break;
2144 }
2145 }
2146 if (slang == NULL) {
2147 // Not loaded, try loading it now. The language name includes the
2148 // region name, the region is ignored otherwise. for int_wordlist
2149 // use an arbitrary name.
2150 if (round == 0)
2151 STRCPY(lang, "internal wordlist");
2152 else {
2153 STRLCPY(lang, path_tail(spf_name), MAXWLEN + 1);
2154 p = vim_strchr(lang, '.');
2155 if (p != NULL)
2156 *p = NUL; // truncate at ".encoding.add"
2157 }
2158 slang = spell_load_file(spf_name, lang, NULL, true);
2159
2160 // If one of the languages has NOBREAK we assume the addition
2161 // files also have this.
2162 if (slang != NULL && nobreak)
2163 slang->sl_nobreak = true;
2164 }
2165 if (slang != NULL) {
2166 region_mask = REGION_ALL;
2167 if (use_region != NULL && !dont_use_region) {
2168 // find region in sl_regions
2169 c = find_region(slang->sl_regions, use_region);
2170 if (c != REGION_ALL)
2171 region_mask = 1 << c;
2172 else if (*slang->sl_regions != NUL)
2173 // This spell file is for other regions.
2174 region_mask = 0;
2175 }
2176
2177 if (region_mask != 0) {
2178 langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga);
2179 p_->lp_slang = slang;
2180 p_->lp_sallang = NULL;
2181 p_->lp_replang = NULL;
2182 p_->lp_region = region_mask;
2183
2184 use_midword(slang, wp);
2185 }
2186 }
2187 }
2188
2189 // Everything is fine, store the new b_langp value.
2190 ga_clear(&wp->w_s->b_langp);
2191 wp->w_s->b_langp = ga;
2192
2193 // For each language figure out what language to use for sound folding and
2194 // REP items. If the language doesn't support it itself use another one
2195 // with the same name. E.g. for "en-math" use "en".
2196 for (int i = 0; i < ga.ga_len; ++i) {
2197 lp = LANGP_ENTRY(ga, i);
2198
2199 // sound folding
2200 if (!GA_EMPTY(&lp->lp_slang->sl_sal))
2201 // language does sound folding itself
2202 lp->lp_sallang = lp->lp_slang;
2203 else
2204 // find first similar language that does sound folding
2205 for (int j = 0; j < ga.ga_len; ++j) {
2206 lp2 = LANGP_ENTRY(ga, j);
2207 if (!GA_EMPTY(&lp2->lp_slang->sl_sal)
2208 && STRNCMP(lp->lp_slang->sl_name,
2209 lp2->lp_slang->sl_name, 2) == 0) {
2210 lp->lp_sallang = lp2->lp_slang;
2211 break;
2212 }
2213 }
2214
2215 // REP items
2216 if (!GA_EMPTY(&lp->lp_slang->sl_rep))
2217 // language has REP items itself
2218 lp->lp_replang = lp->lp_slang;
2219 else
2220 // find first similar language that has REP items
2221 for (int j = 0; j < ga.ga_len; ++j) {
2222 lp2 = LANGP_ENTRY(ga, j);
2223 if (!GA_EMPTY(&lp2->lp_slang->sl_rep)
2224 && STRNCMP(lp->lp_slang->sl_name,
2225 lp2->lp_slang->sl_name, 2) == 0) {
2226 lp->lp_replang = lp2->lp_slang;
2227 break;
2228 }
2229 }
2230 }
2231
2232theend:
2233 xfree(spl_copy);
2234 recursive = false;
2235 redraw_win_later(wp, NOT_VALID);
2236 return ret_msg;
2237}
2238
2239// Clear the midword characters for buffer "buf".
2240static void clear_midword(win_T *wp)
2241{
2242 memset(wp->w_s->b_spell_ismw, 0, 256);
2243 XFREE_CLEAR(wp->w_s->b_spell_ismw_mb);
2244}
2245
2246// Use the "sl_midword" field of language "lp" for buffer "buf".
2247// They add up to any currently used midword characters.
2248static void use_midword(slang_T *lp, win_T *wp)
2249{
2250 char_u *p;
2251
2252 if (lp->sl_midword == NULL) // there aren't any
2253 return;
2254
2255 for (p = lp->sl_midword; *p != NUL; )
2256 if (has_mbyte) {
2257 int c, l, n;
2258 char_u *bp;
2259
2260 c = utf_ptr2char(p);
2261 l = (*mb_ptr2len)(p);
2262 if (c < 256 && l <= 2)
2263 wp->w_s->b_spell_ismw[c] = true;
2264 else if (wp->w_s->b_spell_ismw_mb == NULL)
2265 // First multi-byte char in "b_spell_ismw_mb".
2266 wp->w_s->b_spell_ismw_mb = vim_strnsave(p, l);
2267 else {
2268 // Append multi-byte chars to "b_spell_ismw_mb".
2269 n = (int)STRLEN(wp->w_s->b_spell_ismw_mb);
2270 bp = vim_strnsave(wp->w_s->b_spell_ismw_mb, n + l);
2271 xfree(wp->w_s->b_spell_ismw_mb);
2272 wp->w_s->b_spell_ismw_mb = bp;
2273 STRLCPY(bp + n, p, l + 1);
2274 }
2275 p += l;
2276 } else
2277 wp->w_s->b_spell_ismw[*p++] = true;
2278}
2279
2280// Find the region "region[2]" in "rp" (points to "sl_regions").
2281// Each region is simply stored as the two characters of its name.
2282// Returns the index if found (first is 0), REGION_ALL if not found.
2283static int find_region(char_u *rp, char_u *region)
2284{
2285 int i;
2286
2287 for (i = 0;; i += 2) {
2288 if (rp[i] == NUL)
2289 return REGION_ALL;
2290 if (rp[i] == region[0] && rp[i + 1] == region[1])
2291 break;
2292 }
2293 return i / 2;
2294}
2295
2296/// Return case type of word:
2297/// w word 0
2298/// Word WF_ONECAP
2299/// W WORD WF_ALLCAP
2300/// WoRd wOrd WF_KEEPCAP
2301///
2302/// @param[in] word
2303/// @param[in] end End of word or NULL for NUL delimited string
2304///
2305/// @returns Case type of word
2306int captype(char_u *word, char_u *end)
2307 FUNC_ATTR_NONNULL_ARG(1)
2308{
2309 char_u *p;
2310 int c;
2311 int firstcap;
2312 bool allcap;
2313 bool past_second = false; // past second word char
2314
2315 // find first letter
2316 for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) {
2317 if (end == NULL ? *p == NUL : p >= end) {
2318 return 0; // only non-word characters, illegal word
2319 }
2320 }
2321 if (has_mbyte) {
2322 c = mb_ptr2char_adv((const char_u **)&p);
2323 } else {
2324 c = *p++;
2325 }
2326 firstcap = allcap = SPELL_ISUPPER(c);
2327
2328 // Need to check all letters to find a word with mixed upper/lower.
2329 // But a word with an upper char only at start is a ONECAP.
2330 for (; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) {
2331 if (spell_iswordp_nmw(p, curwin)) {
2332 c = PTR2CHAR(p);
2333 if (!SPELL_ISUPPER(c)) {
2334 // UUl -> KEEPCAP
2335 if (past_second && allcap) {
2336 return WF_KEEPCAP;
2337 }
2338 allcap = false;
2339 } else if (!allcap) {
2340 // UlU -> KEEPCAP
2341 return WF_KEEPCAP;
2342 }
2343 past_second = true;
2344 }
2345 }
2346
2347 if (allcap)
2348 return WF_ALLCAP;
2349 if (firstcap)
2350 return WF_ONECAP;
2351 return 0;
2352}
2353
2354// Like captype() but for a KEEPCAP word add ONECAP if the word starts with a
2355// capital. So that make_case_word() can turn WOrd into Word.
2356// Add ALLCAP for "WOrD".
2357static int badword_captype(char_u *word, char_u *end)
2358 FUNC_ATTR_NONNULL_ALL
2359{
2360 int flags = captype(word, end);
2361 int c;
2362 int l, u;
2363 bool first;
2364 char_u *p;
2365
2366 if (flags & WF_KEEPCAP) {
2367 // Count the number of UPPER and lower case letters.
2368 l = u = 0;
2369 first = false;
2370 for (p = word; p < end; MB_PTR_ADV(p)) {
2371 c = PTR2CHAR(p);
2372 if (SPELL_ISUPPER(c)) {
2373 ++u;
2374 if (p == word)
2375 first = true;
2376 } else
2377 ++l;
2378 }
2379
2380 // If there are more UPPER than lower case letters suggest an
2381 // ALLCAP word. Otherwise, if the first letter is UPPER then
2382 // suggest ONECAP. Exception: "ALl" most likely should be "All",
2383 // require three upper case letters.
2384 if (u > l && u > 2)
2385 flags |= WF_ALLCAP;
2386 else if (first)
2387 flags |= WF_ONECAP;
2388
2389 if (u >= 2 && l >= 2) // maCARONI maCAroni
2390 flags |= WF_MIXCAP;
2391 }
2392 return flags;
2393}
2394
2395// Delete the internal wordlist and its .spl file.
2396void spell_delete_wordlist(void)
2397{
2398 char_u fname[MAXPATHL] = {0};
2399
2400 if (int_wordlist != NULL) {
2401 os_remove((char *)int_wordlist);
2402 int_wordlist_spl(fname);
2403 os_remove((char *)fname);
2404 XFREE_CLEAR(int_wordlist);
2405 }
2406}
2407
2408// Free all languages.
2409void spell_free_all(void)
2410{
2411 slang_T *slang;
2412
2413 // Go through all buffers and handle 'spelllang'. <VN>
2414 FOR_ALL_BUFFERS(buf) {
2415 ga_clear(&buf->b_s.b_langp);
2416 }
2417
2418 while (first_lang != NULL) {
2419 slang = first_lang;
2420 first_lang = slang->sl_next;
2421 slang_free(slang);
2422 }
2423
2424 spell_delete_wordlist();
2425
2426 XFREE_CLEAR(repl_to);
2427 XFREE_CLEAR(repl_from);
2428}
2429
2430// Clear all spelling tables and reload them.
2431// Used after 'encoding' is set and when ":mkspell" was used.
2432void spell_reload(void)
2433{
2434 // Initialize the table for spell_iswordp().
2435 init_spell_chartab();
2436
2437 // Unload all allocated memory.
2438 spell_free_all();
2439
2440 // Go through all buffers and handle 'spelllang'.
2441 FOR_ALL_WINDOWS_IN_TAB(wp, curtab) {
2442 // Only load the wordlists when 'spelllang' is set and there is a
2443 // window for this buffer in which 'spell' is set.
2444 if (*wp->w_s->b_p_spl != NUL) {
2445 if (wp->w_p_spell) {
2446 (void)did_set_spelllang(wp);
2447 break;
2448 }
2449 }
2450 }
2451}
2452
2453
2454// Opposite of offset2bytes().
2455// "pp" points to the bytes and is advanced over it.
2456// Returns the offset.
2457static int bytes2offset(char_u **pp)
2458{
2459 char_u *p = *pp;
2460 int nr;
2461 int c;
2462
2463 c = *p++;
2464 if ((c & 0x80) == 0x00) { // 1 byte
2465 nr = c - 1;
2466 } else if ((c & 0xc0) == 0x80) { // 2 bytes
2467 nr = (c & 0x3f) - 1;
2468 nr = nr * 255 + (*p++ - 1);
2469 } else if ((c & 0xe0) == 0xc0) { // 3 bytes
2470 nr = (c & 0x1f) - 1;
2471 nr = nr * 255 + (*p++ - 1);
2472 nr = nr * 255 + (*p++ - 1);
2473 } else { // 4 bytes
2474 nr = (c & 0x0f) - 1;
2475 nr = nr * 255 + (*p++ - 1);
2476 nr = nr * 255 + (*p++ - 1);
2477 nr = nr * 255 + (*p++ - 1);
2478 }
2479
2480 *pp = p;
2481 return nr;
2482}
2483
2484// Open a spell buffer. This is a nameless buffer that is not in the buffer
2485// list and only contains text lines. Can use a swapfile to reduce memory
2486// use.
2487// Most other fields are invalid! Esp. watch out for string options being
2488// NULL and there is no undo info.
2489buf_T *open_spellbuf(void)
2490{
2491 buf_T *buf = xcalloc(1, sizeof(buf_T));
2492
2493 buf->b_spell = true;
2494 buf->b_p_swf = true; // may create a swap file
2495 if (ml_open(buf) == FAIL) {
2496 ELOG("Error opening a new memline");
2497 }
2498 ml_open_file(buf); // create swap file now
2499
2500 return buf;
2501}
2502
2503// Close the buffer used for spell info.
2504void close_spellbuf(buf_T *buf)
2505{
2506 if (buf != NULL) {
2507 ml_close(buf, TRUE);
2508 xfree(buf);
2509 }
2510}
2511
2512// Init the chartab used for spelling for ASCII.
2513void clear_spell_chartab(spelltab_T *sp)
2514{
2515 int i;
2516
2517 // Init everything to false.
2518 memset(sp->st_isw, false, sizeof(sp->st_isw));
2519 memset(sp->st_isu, false, sizeof(sp->st_isu));
2520
2521 for (i = 0; i < 256; ++i) {
2522 sp->st_fold[i] = i;
2523 sp->st_upper[i] = i;
2524 }
2525
2526 // We include digits. A word shouldn't start with a digit, but handling
2527 // that is done separately.
2528 for (i = '0'; i <= '9'; ++i)
2529 sp->st_isw[i] = true;
2530 for (i = 'A'; i <= 'Z'; ++i) {
2531 sp->st_isw[i] = true;
2532 sp->st_isu[i] = true;
2533 sp->st_fold[i] = i + 0x20;
2534 }
2535 for (i = 'a'; i <= 'z'; ++i) {
2536 sp->st_isw[i] = true;
2537 sp->st_upper[i] = i - 0x20;
2538 }
2539}
2540
2541// Init the chartab used for spelling. Called once while starting up.
2542// The default is to use isalpha(), but the spell file should define the word
2543// characters to make it possible that 'encoding' differs from the current
2544// locale. For utf-8 we don't use isalpha() but our own functions.
2545void init_spell_chartab(void)
2546{
2547 int i;
2548
2549 did_set_spelltab = false;
2550 clear_spell_chartab(&spelltab);
2551 for (i = 128; i < 256; i++) {
2552 int f = utf_fold(i);
2553 int u = mb_toupper(i);
2554
2555 spelltab.st_isu[i] = mb_isupper(i);
2556 spelltab.st_isw[i] = spelltab.st_isu[i] || mb_islower(i);
2557 // The folded/upper-cased value is different between latin1 and
2558 // utf8 for 0xb5, causing E763 for no good reason. Use the latin1
2559 // value for utf-8 to avoid this.
2560 spelltab.st_fold[i] = (f < 256) ? f : i;
2561 spelltab.st_upper[i] = (u < 256) ? u : i;
2562 }
2563}
2564
2565/// Returns true if "p" points to a word character.
2566/// As a special case we see "midword" characters as word character when it is
2567/// followed by a word character. This finds they'there but not 'they there'.
2568/// Thus this only works properly when past the first character of the word.
2569///
2570/// @param wp Buffer used.
2571static bool spell_iswordp(char_u *p, win_T *wp)
2572{
2573 char_u *s;
2574 int l;
2575 int c;
2576
2577 if (has_mbyte) {
2578 l = MB_PTR2LEN(p);
2579 s = p;
2580 if (l == 1) {
2581 // be quick for ASCII
2582 if (wp->w_s->b_spell_ismw[*p])
2583 s = p + 1; // skip a mid-word character
2584 } else {
2585 c = utf_ptr2char(p);
2586 if (c < 256 ? wp->w_s->b_spell_ismw[c]
2587 : (wp->w_s->b_spell_ismw_mb != NULL
2588 && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) {
2589 s = p + l;
2590 }
2591 }
2592
2593 c = utf_ptr2char(s);
2594 if (c > 255) {
2595 return spell_mb_isword_class(mb_get_class(s), wp);
2596 }
2597 return spelltab.st_isw[c];
2598 }
2599
2600 return spelltab.st_isw[wp->w_s->b_spell_ismw[*p] ? p[1] : p[0]];
2601}
2602
2603// Returns true if "p" points to a word character.
2604// Unlike spell_iswordp() this doesn't check for "midword" characters.
2605bool spell_iswordp_nmw(const char_u *p, win_T *wp)
2606{
2607 int c = utf_ptr2char(p);
2608 if (c > 255) {
2609 return spell_mb_isword_class(mb_get_class(p), wp);
2610 }
2611 return spelltab.st_isw[c];
2612}
2613
2614// Returns true if word class indicates a word character.
2615// Only for characters above 255.
2616// Unicode subscript and superscript are not considered word characters.
2617// See also utf_class() in mbyte.c.
2618static bool spell_mb_isword_class(int cl, win_T *wp)
2619{
2620 if (wp->w_s->b_cjk)
2621 // East Asian characters are not considered word characters.
2622 return cl == 2 || cl == 0x2800;
2623 return cl >= 2 && cl != 0x2070 && cl != 0x2080 && cl != 3;
2624}
2625
2626// Returns true if "p" points to a word character.
2627// Wide version of spell_iswordp().
2628static bool spell_iswordp_w(int *p, win_T *wp)
2629{
2630 int *s;
2631
2632 if (*p < 256 ? wp->w_s->b_spell_ismw[*p]
2633 : (wp->w_s->b_spell_ismw_mb != NULL
2634 && vim_strchr(wp->w_s->b_spell_ismw_mb, *p) != NULL))
2635 s = p + 1;
2636 else
2637 s = p;
2638
2639 if (*s > 255) {
2640 return spell_mb_isword_class(utf_class(*s), wp);
2641 }
2642 return spelltab.st_isw[*s];
2643}
2644
2645// Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated.
2646// Uses the character definitions from the .spl file.
2647// When using a multi-byte 'encoding' the length may change!
2648// Returns FAIL when something wrong.
2649int spell_casefold(char_u *str, int len, char_u *buf, int buflen)
2650{
2651 int i;
2652
2653 if (len >= buflen) {
2654 buf[0] = NUL;
2655 return FAIL; // result will not fit
2656 }
2657
2658 if (has_mbyte) {
2659 int outi = 0;
2660 char_u *p;
2661 int c;
2662
2663 // Fold one character at a time.
2664 for (p = str; p < str + len; ) {
2665 if (outi + MB_MAXBYTES > buflen) {
2666 buf[outi] = NUL;
2667 return FAIL;
2668 }
2669 c = mb_cptr2char_adv((const char_u **)&p);
2670 outi += utf_char2bytes(SPELL_TOFOLD(c), buf + outi);
2671 }
2672 buf[outi] = NUL;
2673 } else {
2674 // Be quick for non-multibyte encodings.
2675 for (i = 0; i < len; ++i)
2676 buf[i] = spelltab.st_fold[str[i]];
2677 buf[i] = NUL;
2678 }
2679
2680 return OK;
2681}
2682
2683// values for sps_flags
2684#define SPS_BEST 1
2685#define SPS_FAST 2
2686#define SPS_DOUBLE 4
2687
2688static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
2689static int sps_limit = 9999; // max nr of suggestions given
2690
2691// Check the 'spellsuggest' option. Return FAIL if it's wrong.
2692// Sets "sps_flags" and "sps_limit".
2693int spell_check_sps(void)
2694{
2695 char_u *p;
2696 char_u *s;
2697 char_u buf[MAXPATHL];
2698 int f;
2699
2700 sps_flags = 0;
2701 sps_limit = 9999;
2702
2703 for (p = p_sps; *p != NUL; ) {
2704 copy_option_part(&p, buf, MAXPATHL, ",");
2705
2706 f = 0;
2707 if (ascii_isdigit(*buf)) {
2708 s = buf;
2709 sps_limit = getdigits_int(&s, true, 0);
2710 if (*s != NUL && !ascii_isdigit(*s)) {
2711 f = -1;
2712 }
2713 } else if (STRCMP(buf, "best") == 0) {
2714 f = SPS_BEST;
2715 } else if (STRCMP(buf, "fast") == 0) {
2716 f = SPS_FAST;
2717 } else if (STRCMP(buf, "double") == 0) {
2718 f = SPS_DOUBLE;
2719 } else if (STRNCMP(buf, "expr:", 5) != 0
2720 && STRNCMP(buf, "file:", 5) != 0) {
2721 f = -1;
2722 }
2723
2724 if (f == -1 || (sps_flags != 0 && f != 0)) {
2725 sps_flags = SPS_BEST;
2726 sps_limit = 9999;
2727 return FAIL;
2728 }
2729 if (f != 0)
2730 sps_flags = f;
2731 }
2732
2733 if (sps_flags == 0)
2734 sps_flags = SPS_BEST;
2735
2736 return OK;
2737}
2738
2739// "z=": Find badly spelled word under or after the cursor.
2740// Give suggestions for the properly spelled word.
2741// In Visual mode use the highlighted word as the bad word.
2742// When "count" is non-zero use that suggestion.
2743void spell_suggest(int count)
2744{
2745 char_u *line;
2746 pos_T prev_cursor = curwin->w_cursor;
2747 char_u wcopy[MAXWLEN + 2];
2748 char_u *p;
2749 int c;
2750 suginfo_T sug;
2751 suggest_T *stp;
2752 int mouse_used;
2753 int need_cap;
2754 int limit;
2755 int selected = count;
2756 int badlen = 0;
2757 int msg_scroll_save = msg_scroll;
2758
2759 if (no_spell_checking(curwin))
2760 return;
2761
2762 if (VIsual_active) {
2763 // Use the Visually selected text as the bad word. But reject
2764 // a multi-line selection.
2765 if (curwin->w_cursor.lnum != VIsual.lnum) {
2766 vim_beep(BO_SPELL);
2767 return;
2768 }
2769 badlen = (int)curwin->w_cursor.col - (int)VIsual.col;
2770 if (badlen < 0) {
2771 badlen = -badlen;
2772 } else {
2773 curwin->w_cursor.col = VIsual.col;
2774 }
2775 badlen++;
2776 end_visual_mode();
2777 } else
2778 // Find the start of the badly spelled word.
2779 if (spell_move_to(curwin, FORWARD, true, true, NULL) == 0
2780 || curwin->w_cursor.col > prev_cursor.col) {
2781 // No bad word or it starts after the cursor: use the word under the
2782 // cursor.
2783 curwin->w_cursor = prev_cursor;
2784 line = get_cursor_line_ptr();
2785 p = line + curwin->w_cursor.col;
2786 // Backup to before start of word.
2787 while (p > line && spell_iswordp_nmw(p, curwin)) {
2788 MB_PTR_BACK(line, p);
2789 }
2790 // Forward to start of word.
2791 while (*p != NUL && !spell_iswordp_nmw(p, curwin)) {
2792 MB_PTR_ADV(p);
2793 }
2794
2795 if (!spell_iswordp_nmw(p, curwin)) { // No word found.
2796 beep_flush();
2797 return;
2798 }
2799 curwin->w_cursor.col = (colnr_T)(p - line);
2800 }
2801
2802 // Get the word and its length.
2803
2804 // Figure out if the word should be capitalised.
2805 need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col);
2806
2807 // Make a copy of current line since autocommands may free the line.
2808 line = vim_strsave(get_cursor_line_ptr());
2809
2810 // Get the list of suggestions. Limit to 'lines' - 2 or the number in
2811 // 'spellsuggest', whatever is smaller.
2812 if (sps_limit > (int)Rows - 2)
2813 limit = (int)Rows - 2;
2814 else
2815 limit = sps_limit;
2816 spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit,
2817 true, need_cap, true);
2818
2819 if (GA_EMPTY(&sug.su_ga))
2820 MSG(_("Sorry, no suggestions"));
2821 else if (count > 0) {
2822 if (count > sug.su_ga.ga_len)
2823 smsg(_("Sorry, only %" PRId64 " suggestions"),
2824 (int64_t)sug.su_ga.ga_len);
2825 } else {
2826 XFREE_CLEAR(repl_from);
2827 XFREE_CLEAR(repl_to);
2828
2829 // When 'rightleft' is set the list is drawn right-left.
2830 cmdmsg_rl = curwin->w_p_rl;
2831 if (cmdmsg_rl)
2832 msg_col = Columns - 1;
2833
2834 // List the suggestions.
2835 msg_start();
2836 msg_row = Rows - 1; // for when 'cmdheight' > 1
2837 lines_left = Rows; // avoid more prompt
2838 vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:"),
2839 sug.su_badlen, sug.su_badptr);
2840 if (cmdmsg_rl && STRNCMP(IObuff, "Change", 6) == 0) {
2841 // And now the rabbit from the high hat: Avoid showing the
2842 // untranslated message rightleft.
2843 vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC",
2844 sug.su_badlen, sug.su_badptr);
2845 }
2846 msg_puts((const char *)IObuff);
2847 msg_clr_eos();
2848 msg_putchar('\n');
2849
2850 msg_scroll = TRUE;
2851 for (int i = 0; i < sug.su_ga.ga_len; ++i) {
2852 stp = &SUG(sug.su_ga, i);
2853
2854 // The suggested word may replace only part of the bad word, add
2855 // the not replaced part.
2856 STRLCPY(wcopy, stp->st_word, MAXWLEN + 1);
2857 if (sug.su_badlen > stp->st_orglen)
2858 STRLCPY(wcopy + stp->st_wordlen,
2859 sug.su_badptr + stp->st_orglen,
2860 sug.su_badlen - stp->st_orglen + 1);
2861 vim_snprintf((char *)IObuff, IOSIZE, "%2d", i + 1);
2862 if (cmdmsg_rl) {
2863 rl_mirror(IObuff);
2864 }
2865 msg_puts((const char *)IObuff);
2866
2867 vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"", wcopy);
2868 msg_puts((const char *)IObuff);
2869
2870 // The word may replace more than "su_badlen".
2871 if (sug.su_badlen < stp->st_orglen) {
2872 vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\""),
2873 stp->st_orglen, sug.su_badptr);
2874 msg_puts((const char *)IObuff);
2875 }
2876
2877 if (p_verbose > 0) {
2878 // Add the score.
2879 if (sps_flags & (SPS_DOUBLE | SPS_BEST))
2880 vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)",
2881 stp->st_salscore ? "s " : "",
2882 stp->st_score, stp->st_altscore);
2883 else
2884 vim_snprintf((char *)IObuff, IOSIZE, " (%d)",
2885 stp->st_score);
2886 if (cmdmsg_rl)
2887 // Mirror the numbers, but keep the leading space.
2888 rl_mirror(IObuff + 1);
2889 msg_advance(30);
2890 msg_puts((const char *)IObuff);
2891 }
2892 msg_putchar('\n');
2893 }
2894
2895 cmdmsg_rl = FALSE;
2896 msg_col = 0;
2897 // Ask for choice.
2898 selected = prompt_for_number(&mouse_used);
2899 if (mouse_used)
2900 selected -= lines_left;
2901 lines_left = Rows; // avoid more prompt
2902 // don't delay for 'smd' in normal_cmd()
2903 msg_scroll = msg_scroll_save;
2904 }
2905
2906 if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) {
2907 // Save the from and to text for :spellrepall.
2908 stp = &SUG(sug.su_ga, selected - 1);
2909 if (sug.su_badlen > stp->st_orglen) {
2910 // Replacing less than "su_badlen", append the remainder to
2911 // repl_to.
2912 repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen);
2913 vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s", stp->st_word,
2914 sug.su_badlen - stp->st_orglen,
2915 sug.su_badptr + stp->st_orglen);
2916 repl_to = vim_strsave(IObuff);
2917 } else {
2918 // Replacing su_badlen or more, use the whole word.
2919 repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen);
2920 repl_to = vim_strsave(stp->st_word);
2921 }
2922
2923 // Replace the word.
2924 p = xmalloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1);
2925 c = (int)(sug.su_badptr - line);
2926 memmove(p, line, c);
2927 STRCPY(p + c, stp->st_word);
2928 STRCAT(p, sug.su_badptr + stp->st_orglen);
2929 ml_replace(curwin->w_cursor.lnum, p, false);
2930 curwin->w_cursor.col = c;
2931
2932 // For redo we use a change-word command.
2933 ResetRedobuff();
2934 AppendToRedobuff("ciw");
2935 AppendToRedobuffLit(p + c,
2936 stp->st_wordlen + sug.su_badlen - stp->st_orglen);
2937 AppendCharToRedobuff(ESC);
2938
2939 // After this "p" may be invalid.
2940 changed_bytes(curwin->w_cursor.lnum, c);
2941 } else
2942 curwin->w_cursor = prev_cursor;
2943
2944 spell_find_cleanup(&sug);
2945 xfree(line);
2946}
2947
2948// Check if the word at line "lnum" column "col" is required to start with a
2949// capital. This uses 'spellcapcheck' of the current buffer.
2950static bool check_need_cap(linenr_T lnum, colnr_T col)
2951{
2952 bool need_cap = false;
2953 char_u *line;
2954 char_u *line_copy = NULL;
2955 char_u *p;
2956 colnr_T endcol;
2957 regmatch_T regmatch;
2958
2959 if (curwin->w_s->b_cap_prog == NULL)
2960 return false;
2961
2962 line = get_cursor_line_ptr();
2963 endcol = 0;
2964 if (getwhitecols(line) >= (int)col) {
2965 // At start of line, check if previous line is empty or sentence
2966 // ends there.
2967 if (lnum == 1)
2968 need_cap = true;
2969 else {
2970 line = ml_get(lnum - 1);
2971 if (*skipwhite(line) == NUL)
2972 need_cap = true;
2973 else {
2974 // Append a space in place of the line break.
2975 line_copy = concat_str(line, (char_u *)" ");
2976 line = line_copy;
2977 endcol = (colnr_T)STRLEN(line);
2978 }
2979 }
2980 } else {
2981 endcol = col;
2982 }
2983
2984 if (endcol > 0) {
2985 // Check if sentence ends before the bad word.
2986 regmatch.regprog = curwin->w_s->b_cap_prog;
2987 regmatch.rm_ic = FALSE;
2988 p = line + endcol;
2989 for (;; ) {
2990 MB_PTR_BACK(line, p);
2991 if (p == line || spell_iswordp_nmw(p, curwin)) {
2992 break;
2993 }
2994 if (vim_regexec(&regmatch, p, 0)
2995 && regmatch.endp[0] == line + endcol) {
2996 need_cap = true;
2997 break;
2998 }
2999 }
3000 curwin->w_s->b_cap_prog = regmatch.regprog;
3001 }
3002
3003 xfree(line_copy);
3004
3005 return need_cap;
3006}
3007
3008
3009// ":spellrepall"
3010void ex_spellrepall(exarg_T *eap)
3011{
3012 pos_T pos = curwin->w_cursor;
3013 char_u *frompat;
3014 int addlen;
3015 char_u *line;
3016 char_u *p;
3017 bool save_ws = p_ws;
3018 linenr_T prev_lnum = 0;
3019
3020 if (repl_from == NULL || repl_to == NULL) {
3021 EMSG(_("E752: No previous spell replacement"));
3022 return;
3023 }
3024 addlen = (int)(STRLEN(repl_to) - STRLEN(repl_from));
3025
3026 frompat = xmalloc(STRLEN(repl_from) + 7);
3027 sprintf((char *)frompat, "\\V\\<%s\\>", repl_from);
3028 p_ws = false;
3029
3030 sub_nsubs = 0;
3031 sub_nlines = 0;
3032 curwin->w_cursor.lnum = 0;
3033 while (!got_int) {
3034 if (do_search(NULL, '/', frompat, 1L, SEARCH_KEEP, NULL, NULL) == 0
3035 || u_save_cursor() == FAIL) {
3036 break;
3037 }
3038
3039 // Only replace when the right word isn't there yet. This happens
3040 // when changing "etc" to "etc.".
3041 line = get_cursor_line_ptr();
3042 if (addlen <= 0 || STRNCMP(line + curwin->w_cursor.col,
3043 repl_to, STRLEN(repl_to)) != 0) {
3044 p = xmalloc(STRLEN(line) + addlen + 1);
3045 memmove(p, line, curwin->w_cursor.col);
3046 STRCPY(p + curwin->w_cursor.col, repl_to);
3047 STRCAT(p, line + curwin->w_cursor.col + STRLEN(repl_from));
3048 ml_replace(curwin->w_cursor.lnum, p, false);
3049 changed_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col);
3050
3051 if (curwin->w_cursor.lnum != prev_lnum) {
3052 ++sub_nlines;
3053 prev_lnum = curwin->w_cursor.lnum;
3054 }
3055 ++sub_nsubs;
3056 }
3057 curwin->w_cursor.col += (colnr_T)STRLEN(repl_to);
3058 }
3059
3060 p_ws = save_ws;
3061 curwin->w_cursor = pos;
3062 xfree(frompat);
3063
3064 if (sub_nsubs == 0)
3065 EMSG2(_("E753: Not found: %s"), repl_from);
3066 else
3067 do_sub_msg(false);
3068}
3069
3070// Find spell suggestions for "word". Return them in the growarray "*gap" as
3071// a list of allocated strings.
3072void
3073spell_suggest_list (
3074 garray_T *gap,
3075 char_u *word,
3076 int maxcount, // maximum nr of suggestions
3077 bool need_cap, // 'spellcapcheck' matched
3078 bool interactive
3079)
3080{
3081 suginfo_T sug;
3082 suggest_T *stp;
3083 char_u *wcopy;
3084
3085 spell_find_suggest(word, 0, &sug, maxcount, false, need_cap, interactive);
3086
3087 // Make room in "gap".
3088 ga_init(gap, sizeof(char_u *), sug.su_ga.ga_len + 1);
3089 ga_grow(gap, sug.su_ga.ga_len);
3090 for (int i = 0; i < sug.su_ga.ga_len; ++i) {
3091 stp = &SUG(sug.su_ga, i);
3092
3093 // The suggested word may replace only part of "word", add the not
3094 // replaced part.
3095 wcopy = xmalloc(stp->st_wordlen
3096 + STRLEN(sug.su_badptr + stp->st_orglen) + 1);
3097 STRCPY(wcopy, stp->st_word);
3098 STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen);
3099 ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy;
3100 }
3101
3102 spell_find_cleanup(&sug);
3103}
3104
3105// Find spell suggestions for the word at the start of "badptr".
3106// Return the suggestions in "su->su_ga".
3107// The maximum number of suggestions is "maxcount".
3108// Note: does use info for the current window.
3109// This is based on the mechanisms of Aspell, but completely reimplemented.
3110static void
3111spell_find_suggest (
3112 char_u *badptr,
3113 int badlen, // length of bad word or 0 if unknown
3114 suginfo_T *su,
3115 int maxcount,
3116 bool banbadword, // don't include badword in suggestions
3117 bool need_cap, // word should start with capital
3118 bool interactive
3119)
3120{
3121 hlf_T attr = HLF_COUNT;
3122 char_u buf[MAXPATHL];
3123 char_u *p;
3124 bool do_combine = false;
3125 char_u *sps_copy;
3126 static bool expr_busy = false;
3127 int c;
3128 langp_T *lp;
3129
3130 // Set the info in "*su".
3131 memset(su, 0, sizeof(suginfo_T));
3132 ga_init(&su->su_ga, (int)sizeof(suggest_T), 10);
3133 ga_init(&su->su_sga, (int)sizeof(suggest_T), 10);
3134 if (*badptr == NUL)
3135 return;
3136 hash_init(&su->su_banned);
3137
3138 su->su_badptr = badptr;
3139 if (badlen != 0)
3140 su->su_badlen = badlen;
3141 else {
3142 size_t tmplen = spell_check(curwin, su->su_badptr, &attr, NULL, false);
3143 assert(tmplen <= INT_MAX);
3144 su->su_badlen = (int)tmplen;
3145 }
3146 su->su_maxcount = maxcount;
3147 su->su_maxscore = SCORE_MAXINIT;
3148
3149 if (su->su_badlen >= MAXWLEN)
3150 su->su_badlen = MAXWLEN - 1; // just in case
3151 STRLCPY(su->su_badword, su->su_badptr, su->su_badlen + 1);
3152 (void)spell_casefold(su->su_badptr, su->su_badlen, su->su_fbadword, MAXWLEN);
3153
3154 // TODO(vim): make this work if the case-folded text is longer than the
3155 // original text. Currently an illegal byte causes wrong pointer
3156 // computations.
3157 su->su_fbadword[su->su_badlen] = NUL;
3158
3159 // get caps flags for bad word
3160 su->su_badflags = badword_captype(su->su_badptr,
3161 su->su_badptr + su->su_badlen);
3162 if (need_cap)
3163 su->su_badflags |= WF_ONECAP;
3164
3165 // Find the default language for sound folding. We simply use the first
3166 // one in 'spelllang' that supports sound folding. That's good for when
3167 // using multiple files for one language, it's not that bad when mixing
3168 // languages (e.g., "pl,en").
3169 for (int i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) {
3170 lp = LANGP_ENTRY(curbuf->b_s.b_langp, i);
3171 if (lp->lp_sallang != NULL) {
3172 su->su_sallang = lp->lp_sallang;
3173 break;
3174 }
3175 }
3176
3177 // Soundfold the bad word with the default sound folding, so that we don't
3178 // have to do this many times.
3179 if (su->su_sallang != NULL)
3180 spell_soundfold(su->su_sallang, su->su_fbadword, true,
3181 su->su_sal_badword);
3182
3183 // If the word is not capitalised and spell_check() doesn't consider the
3184 // word to be bad then it might need to be capitalised. Add a suggestion
3185 // for that.
3186 c = PTR2CHAR(su->su_badptr);
3187 if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) {
3188 make_case_word(su->su_badword, buf, WF_ONECAP);
3189 add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE,
3190 0, true, su->su_sallang, false);
3191 }
3192
3193 // Ban the bad word itself. It may appear in another region.
3194 if (banbadword)
3195 add_banned(su, su->su_badword);
3196
3197 // Make a copy of 'spellsuggest', because the expression may change it.
3198 sps_copy = vim_strsave(p_sps);
3199
3200 // Loop over the items in 'spellsuggest'.
3201 for (p = sps_copy; *p != NUL; ) {
3202 copy_option_part(&p, buf, MAXPATHL, ",");
3203
3204 if (STRNCMP(buf, "expr:", 5) == 0) {
3205 // Evaluate an expression. Skip this when called recursively,
3206 // when using spellsuggest() in the expression.
3207 if (!expr_busy) {
3208 expr_busy = true;
3209 spell_suggest_expr(su, buf + 5);
3210 expr_busy = false;
3211 }
3212 } else if (STRNCMP(buf, "file:", 5) == 0)
3213 // Use list of suggestions in a file.
3214 spell_suggest_file(su, buf + 5);
3215 else {
3216 // Use internal method.
3217 spell_suggest_intern(su, interactive);
3218 if (sps_flags & SPS_DOUBLE)
3219 do_combine = true;
3220 }
3221 }
3222
3223 xfree(sps_copy);
3224
3225 if (do_combine)
3226 // Combine the two list of suggestions. This must be done last,
3227 // because sorting changes the order again.
3228 score_combine(su);
3229}
3230
3231// Find suggestions by evaluating expression "expr".
3232static void spell_suggest_expr(suginfo_T *su, char_u *expr)
3233{
3234 int score;
3235 const char *p;
3236
3237 // The work is split up in a few parts to avoid having to export
3238 // suginfo_T.
3239 // First evaluate the expression and get the resulting list.
3240 list_T *const list = eval_spell_expr(su->su_badword, expr);
3241 if (list != NULL) {
3242 // Loop over the items in the list.
3243 TV_LIST_ITER(list, li, {
3244 if (TV_LIST_ITEM_TV(li)->v_type == VAR_LIST) {
3245 // Get the word and the score from the items.
3246 score = get_spellword(TV_LIST_ITEM_TV(li)->vval.v_list, &p);
3247 if (score >= 0 && score <= su->su_maxscore) {
3248 add_suggestion(su, &su->su_ga, (const char_u *)p, su->su_badlen,
3249 score, 0, true, su->su_sallang, false);
3250 }
3251 }
3252 });
3253 tv_list_unref(list);
3254 }
3255
3256 // Remove bogus suggestions, sort and truncate at "maxcount".
3257 check_suggestions(su, &su->su_ga);
3258 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
3259}
3260
3261// Find suggestions in file "fname". Used for "file:" in 'spellsuggest'.
3262static void spell_suggest_file(suginfo_T *su, char_u *fname)
3263{
3264 FILE *fd;
3265 char_u line[MAXWLEN * 2];
3266 char_u *p;
3267 int len;
3268 char_u cword[MAXWLEN];
3269
3270 // Open the file.
3271 fd = os_fopen((char *)fname, "r");
3272 if (fd == NULL) {
3273 EMSG2(_(e_notopen), fname);
3274 return;
3275 }
3276
3277 // Read it line by line.
3278 while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) {
3279 line_breakcheck();
3280
3281 p = vim_strchr(line, '/');
3282 if (p == NULL)
3283 continue; // No Tab found, just skip the line.
3284 *p++ = NUL;
3285 if (STRICMP(su->su_badword, line) == 0) {
3286 // Match! Isolate the good word, until CR or NL.
3287 for (len = 0; p[len] >= ' '; ++len)
3288 ;
3289 p[len] = NUL;
3290
3291 // If the suggestion doesn't have specific case duplicate the case
3292 // of the bad word.
3293 if (captype(p, NULL) == 0) {
3294 make_case_word(p, cword, su->su_badflags);
3295 p = cword;
3296 }
3297
3298 add_suggestion(su, &su->su_ga, p, su->su_badlen,
3299 SCORE_FILE, 0, true, su->su_sallang, false);
3300 }
3301 }
3302
3303 fclose(fd);
3304
3305 // Remove bogus suggestions, sort and truncate at "maxcount".
3306 check_suggestions(su, &su->su_ga);
3307 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
3308}
3309
3310// Find suggestions for the internal method indicated by "sps_flags".
3311static void spell_suggest_intern(suginfo_T *su, bool interactive)
3312{
3313 // Load the .sug file(s) that are available and not done yet.
3314 suggest_load_files();
3315
3316 // 1. Try special cases, such as repeating a word: "the the" -> "the".
3317 //
3318 // Set a maximum score to limit the combination of operations that is
3319 // tried.
3320 suggest_try_special(su);
3321
3322 // 2. Try inserting/deleting/swapping/changing a letter, use REP entries
3323 // from the .aff file and inserting a space (split the word).
3324 suggest_try_change(su);
3325
3326 // For the resulting top-scorers compute the sound-a-like score.
3327 if (sps_flags & SPS_DOUBLE)
3328 score_comp_sal(su);
3329
3330 // 3. Try finding sound-a-like words.
3331 if ((sps_flags & SPS_FAST) == 0) {
3332 if (sps_flags & SPS_BEST)
3333 // Adjust the word score for the suggestions found so far for how
3334 // they sounds like.
3335 rescore_suggestions(su);
3336
3337 // While going through the soundfold tree "su_maxscore" is the score
3338 // for the soundfold word, limits the changes that are being tried,
3339 // and "su_sfmaxscore" the rescored score, which is set by
3340 // cleanup_suggestions().
3341 // First find words with a small edit distance, because this is much
3342 // faster and often already finds the top-N suggestions. If we didn't
3343 // find many suggestions try again with a higher edit distance.
3344 // "sl_sounddone" is used to avoid doing the same word twice.
3345 suggest_try_soundalike_prep();
3346 su->su_maxscore = SCORE_SFMAX1;
3347 su->su_sfmaxscore = SCORE_MAXINIT * 3;
3348 suggest_try_soundalike(su);
3349 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) {
3350 // We didn't find enough matches, try again, allowing more
3351 // changes to the soundfold word.
3352 su->su_maxscore = SCORE_SFMAX2;
3353 suggest_try_soundalike(su);
3354 if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) {
3355 // Still didn't find enough matches, try again, allowing even
3356 // more changes to the soundfold word.
3357 su->su_maxscore = SCORE_SFMAX3;
3358 suggest_try_soundalike(su);
3359 }
3360 }
3361 su->su_maxscore = su->su_sfmaxscore;
3362 suggest_try_soundalike_finish();
3363 }
3364
3365 // When CTRL-C was hit while searching do show the results. Only clear
3366 // got_int when using a command, not for spellsuggest().
3367 os_breakcheck();
3368 if (interactive && got_int) {
3369 (void)vgetc();
3370 got_int = FALSE;
3371 }
3372
3373 if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) {
3374 if (sps_flags & SPS_BEST)
3375 // Adjust the word score for how it sounds like.
3376 rescore_suggestions(su);
3377
3378 // Remove bogus suggestions, sort and truncate at "maxcount".
3379 check_suggestions(su, &su->su_ga);
3380 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
3381 }
3382}
3383
3384// Free the info put in "*su" by spell_find_suggest().
3385static void spell_find_cleanup(suginfo_T *su)
3386{
3387# define FREE_SUG_WORD(sug) xfree(sug->st_word)
3388 // Free the suggestions.
3389 GA_DEEP_CLEAR(&su->su_ga, suggest_T, FREE_SUG_WORD);
3390 GA_DEEP_CLEAR(&su->su_sga, suggest_T, FREE_SUG_WORD);
3391
3392 // Free the banned words.
3393 hash_clear_all(&su->su_banned, 0);
3394}
3395
3396/// Make a copy of "word", with the first letter upper or lower cased, to
3397/// "wcopy[MAXWLEN]". "word" must not be empty.
3398/// The result is NUL terminated.
3399///
3400/// @param[in] word source string to copy
3401/// @param[in,out] wcopy copied string, with case of first letter changed
3402/// @param[in] upper True to upper case, otherwise lower case
3403void onecap_copy(char_u *word, char_u *wcopy, bool upper)
3404{
3405 char_u *p;
3406 int c;
3407 int l;
3408
3409 p = word;
3410 if (has_mbyte) {
3411 c = mb_cptr2char_adv((const char_u **)&p);
3412 } else {
3413 c = *p++;
3414 }
3415 if (upper) {
3416 c = SPELL_TOUPPER(c);
3417 } else {
3418 c = SPELL_TOFOLD(c);
3419 }
3420 l = utf_char2bytes(c, wcopy);
3421 STRLCPY(wcopy + l, p, MAXWLEN - l);
3422}
3423
3424// Make a copy of "word" with all the letters upper cased into
3425// "wcopy[MAXWLEN]". The result is NUL terminated.
3426static void allcap_copy(char_u *word, char_u *wcopy)
3427{
3428 char_u *s;
3429 char_u *d;
3430 int c;
3431
3432 d = wcopy;
3433 for (s = word; *s != NUL; ) {
3434 if (has_mbyte) {
3435 c = mb_cptr2char_adv((const char_u **)&s);
3436 } else {
3437 c = *s++;
3438 }
3439
3440 if (c == 0xdf) {
3441 c = 'S';
3442 if (d - wcopy >= MAXWLEN - 1)
3443 break;
3444 *d++ = c;
3445 } else
3446 c = SPELL_TOUPPER(c);
3447
3448 if (d - wcopy >= MAXWLEN - MB_MAXBYTES) {
3449 break;
3450 }
3451 d += utf_char2bytes(c, d);
3452 }
3453 *d = NUL;
3454}
3455
3456// Try finding suggestions by recognizing specific situations.
3457static void suggest_try_special(suginfo_T *su)
3458{
3459 char_u *p;
3460 size_t len;
3461 int c;
3462 char_u word[MAXWLEN];
3463
3464 // Recognize a word that is repeated: "the the".
3465 p = skiptowhite(su->su_fbadword);
3466 len = p - su->su_fbadword;
3467 p = skipwhite(p);
3468 if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) {
3469 // Include badflags: if the badword is onecap or allcap
3470 // use that for the goodword too: "The the" -> "The".
3471 c = su->su_fbadword[len];
3472 su->su_fbadword[len] = NUL;
3473 make_case_word(su->su_fbadword, word, su->su_badflags);
3474 su->su_fbadword[len] = c;
3475
3476 // Give a soundalike score of 0, compute the score as if deleting one
3477 // character.
3478 add_suggestion(su, &su->su_ga, word, su->su_badlen,
3479 RESCORE(SCORE_REP, 0), 0, true, su->su_sallang, false);
3480 }
3481}
3482
3483// Measure how much time is spent in each state.
3484// Output is dumped in "suggestprof".
3485
3486#ifdef SUGGEST_PROFILE
3487proftime_T current;
3488proftime_T total;
3489proftime_T times[STATE_FINAL + 1];
3490long counts[STATE_FINAL + 1];
3491
3492 static void
3493prof_init(void)
3494{
3495 for (int i = 0; i <= STATE_FINAL; i++) {
3496 profile_zero(&times[i]);
3497 counts[i] = 0;
3498 }
3499 profile_start(&current);
3500 profile_start(&total);
3501}
3502
3503// call before changing state
3504 static void
3505prof_store(state_T state)
3506{
3507 profile_end(&current);
3508 profile_add(&times[state], &current);
3509 counts[state]++;
3510 profile_start(&current);
3511}
3512# define PROF_STORE(state) prof_store(state);
3513
3514 static void
3515prof_report(char *name)
3516{
3517 FILE *fd = fopen("suggestprof", "a");
3518
3519 profile_end(&total);
3520 fprintf(fd, "-----------------------\n");
3521 fprintf(fd, "%s: %s\n", name, profile_msg(&total));
3522 for (int i = 0; i <= STATE_FINAL; i++) {
3523 fprintf(fd, "%d: %s ("%" PRId64)\n", i, profile_msg(&times[i]), counts[i]);
3524 }
3525 fclose(fd);
3526}
3527#else
3528# define PROF_STORE(state)
3529#endif
3530
3531// Try finding suggestions by adding/removing/swapping letters.
3532
3533static void suggest_try_change(suginfo_T *su)
3534{
3535 char_u fword[MAXWLEN]; // copy of the bad word, case-folded
3536 int n;
3537 char_u *p;
3538 langp_T *lp;
3539
3540 // We make a copy of the case-folded bad word, so that we can modify it
3541 // to find matches (esp. REP items). Append some more text, changing
3542 // chars after the bad word may help.
3543 STRCPY(fword, su->su_fbadword);
3544 n = (int)STRLEN(fword);
3545 p = su->su_badptr + su->su_badlen;
3546 (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n);
3547
3548 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
3549 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
3550
3551 // If reloading a spell file fails it's still in the list but
3552 // everything has been cleared.
3553 if (lp->lp_slang->sl_fbyts == NULL)
3554 continue;
3555
3556 // Try it for this language. Will add possible suggestions.
3557 //
3558#ifdef SUGGEST_PROFILE
3559 prof_init();
3560#endif
3561 suggest_trie_walk(su, lp, fword, false);
3562#ifdef SUGGEST_PROFILE
3563 prof_report("try_change");
3564#endif
3565 }
3566}
3567
3568// Check the maximum score, if we go over it we won't try this change.
3569#define TRY_DEEPER(su, stack, depth, add) \
3570 (stack[depth].ts_score + (add) < su->su_maxscore)
3571
3572// Try finding suggestions by adding/removing/swapping letters.
3573//
3574// This uses a state machine. At each node in the tree we try various
3575// operations. When trying if an operation works "depth" is increased and the
3576// stack[] is used to store info. This allows combinations, thus insert one
3577// character, replace one and delete another. The number of changes is
3578// limited by su->su_maxscore.
3579//
3580// After implementing this I noticed an article by Kemal Oflazer that
3581// describes something similar: "Error-tolerant Finite State Recognition with
3582// Applications to Morphological Analysis and Spelling Correction" (1996).
3583// The implementation in the article is simplified and requires a stack of
3584// unknown depth. The implementation here only needs a stack depth equal to
3585// the length of the word.
3586//
3587// This is also used for the sound-folded word, "soundfold" is true then.
3588// The mechanism is the same, but we find a match with a sound-folded word
3589// that comes from one or more original words. Each of these words may be
3590// added, this is done by add_sound_suggest().
3591// Don't use:
3592// the prefix tree or the keep-case tree
3593// "su->su_badlen"
3594// anything to do with upper and lower case
3595// anything to do with word or non-word characters ("spell_iswordp()")
3596// banned words
3597// word flags (rare, region, compounding)
3598// word splitting for now
3599// "similar_chars()"
3600// use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep"
3601static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, bool soundfold)
3602{
3603 char_u tword[MAXWLEN]; // good word collected so far
3604 trystate_T stack[MAXWLEN];
3605 char_u preword[MAXWLEN * 3]; // word found with proper case;
3606 // concatenation of prefix compound
3607 // words and split word. NUL terminated
3608 // when going deeper but not when coming
3609 // back.
3610 char_u compflags[MAXWLEN]; // compound flags, one for each word
3611 trystate_T *sp;
3612 int newscore;
3613 int score;
3614 char_u *byts, *fbyts, *pbyts;
3615 idx_T *idxs, *fidxs, *pidxs;
3616 int depth;
3617 int c, c2, c3;
3618 int n = 0;
3619 int flags;
3620 garray_T *gap;
3621 idx_T arridx;
3622 int len;
3623 char_u *p;
3624 fromto_T *ftp;
3625 int fl = 0, tl;
3626 int repextra = 0; // extra bytes in fword[] from REP item
3627 slang_T *slang = lp->lp_slang;
3628 int fword_ends;
3629 bool goodword_ends;
3630#ifdef DEBUG_TRIEWALK
3631 // Stores the name of the change made at each level.
3632 char_u changename[MAXWLEN][80];
3633#endif
3634 int breakcheckcount = 1000;
3635 bool compound_ok;
3636
3637 // Go through the whole case-fold tree, try changes at each node.
3638 // "tword[]" contains the word collected from nodes in the tree.
3639 // "fword[]" the word we are trying to match with (initially the bad
3640 // word).
3641 depth = 0;
3642 sp = &stack[0];
3643 memset(sp, 0, sizeof(trystate_T)); // -V512
3644 sp->ts_curi = 1;
3645
3646 if (soundfold) {
3647 // Going through the soundfold tree.
3648 byts = fbyts = slang->sl_sbyts;
3649 idxs = fidxs = slang->sl_sidxs;
3650 pbyts = NULL;
3651 pidxs = NULL;
3652 sp->ts_prefixdepth = PFD_NOPREFIX;
3653 sp->ts_state = STATE_START;
3654 } else {
3655 // When there are postponed prefixes we need to use these first. At
3656 // the end of the prefix we continue in the case-fold tree.
3657 fbyts = slang->sl_fbyts;
3658 fidxs = slang->sl_fidxs;
3659 pbyts = slang->sl_pbyts;
3660 pidxs = slang->sl_pidxs;
3661 if (pbyts != NULL) {
3662 byts = pbyts;
3663 idxs = pidxs;
3664 sp->ts_prefixdepth = PFD_PREFIXTREE;
3665 sp->ts_state = STATE_NOPREFIX; // try without prefix first
3666 } else {
3667 byts = fbyts;
3668 idxs = fidxs;
3669 sp->ts_prefixdepth = PFD_NOPREFIX;
3670 sp->ts_state = STATE_START;
3671 }
3672 }
3673
3674 // Loop to find all suggestions. At each round we either:
3675 // - For the current state try one operation, advance "ts_curi",
3676 // increase "depth".
3677 // - When a state is done go to the next, set "ts_state".
3678 // - When all states are tried decrease "depth".
3679 while (depth >= 0 && !got_int) {
3680 sp = &stack[depth];
3681 switch (sp->ts_state) {
3682 case STATE_START:
3683 case STATE_NOPREFIX:
3684 // Start of node: Deal with NUL bytes, which means
3685 // tword[] may end here.
3686 arridx = sp->ts_arridx; // current node in the tree
3687 len = byts[arridx]; // bytes in this node
3688 arridx += sp->ts_curi; // index of current byte
3689
3690 if (sp->ts_prefixdepth == PFD_PREFIXTREE) {
3691 // Skip over the NUL bytes, we use them later.
3692 for (n = 0; n < len && byts[arridx + n] == 0; ++n)
3693 ;
3694 sp->ts_curi += n;
3695
3696 // Always past NUL bytes now.
3697 n = (int)sp->ts_state;
3698 PROF_STORE(sp->ts_state)
3699 sp->ts_state = STATE_ENDNUL;
3700 sp->ts_save_badflags = su->su_badflags;
3701
3702 // At end of a prefix or at start of prefixtree: check for
3703 // following word.
3704 if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) {
3705 // Set su->su_badflags to the caps type at this position.
3706 // Use the caps type until here for the prefix itself.
3707 if (has_mbyte)
3708 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
3709 else
3710 n = sp->ts_fidx;
3711 flags = badword_captype(su->su_badptr, su->su_badptr + n);
3712 su->su_badflags = badword_captype(su->su_badptr + n,
3713 su->su_badptr + su->su_badlen);
3714#ifdef DEBUG_TRIEWALK
3715 sprintf(changename[depth], "prefix");
3716#endif
3717 go_deeper(stack, depth, 0);
3718 ++depth;
3719 sp = &stack[depth];
3720 sp->ts_prefixdepth = depth - 1;
3721 byts = fbyts;
3722 idxs = fidxs;
3723 sp->ts_arridx = 0;
3724
3725 // Move the prefix to preword[] with the right case
3726 // and make find_keepcap_word() works.
3727 tword[sp->ts_twordlen] = NUL;
3728 make_case_word(tword + sp->ts_splitoff,
3729 preword + sp->ts_prewordlen, flags);
3730 sp->ts_prewordlen = (char_u)STRLEN(preword);
3731 sp->ts_splitoff = sp->ts_twordlen;
3732 }
3733 break;
3734 }
3735
3736 if (sp->ts_curi > len || byts[arridx] != 0) {
3737 // Past bytes in node and/or past NUL bytes.
3738 PROF_STORE(sp->ts_state)
3739 sp->ts_state = STATE_ENDNUL;
3740 sp->ts_save_badflags = su->su_badflags;
3741 break;
3742 }
3743
3744 // End of word in tree.
3745 ++sp->ts_curi; // eat one NUL byte
3746
3747 flags = (int)idxs[arridx];
3748
3749 // Skip words with the NOSUGGEST flag.
3750 if (flags & WF_NOSUGGEST)
3751 break;
3752
3753 fword_ends = (fword[sp->ts_fidx] == NUL
3754 || (soundfold
3755 ? ascii_iswhite(fword[sp->ts_fidx])
3756 : !spell_iswordp(fword + sp->ts_fidx, curwin)));
3757 tword[sp->ts_twordlen] = NUL;
3758
3759 if (sp->ts_prefixdepth <= PFD_NOTSPECIAL
3760 && (sp->ts_flags & TSF_PREFIXOK) == 0) {
3761 // There was a prefix before the word. Check that the prefix
3762 // can be used with this word.
3763 // Count the length of the NULs in the prefix. If there are
3764 // none this must be the first try without a prefix.
3765 n = stack[sp->ts_prefixdepth].ts_arridx;
3766 len = pbyts[n++];
3767 for (c = 0; c < len && pbyts[n + c] == 0; ++c)
3768 ;
3769 if (c > 0) {
3770 c = valid_word_prefix(c, n, flags,
3771 tword + sp->ts_splitoff, slang, false);
3772 if (c == 0)
3773 break;
3774
3775 // Use the WF_RARE flag for a rare prefix.
3776 if (c & WF_RAREPFX)
3777 flags |= WF_RARE;
3778
3779 // Tricky: when checking for both prefix and compounding
3780 // we run into the prefix flag first.
3781 // Remember that it's OK, so that we accept the prefix
3782 // when arriving at a compound flag.
3783 sp->ts_flags |= TSF_PREFIXOK;
3784 }
3785 }
3786
3787 // Check NEEDCOMPOUND: can't use word without compounding. Do try
3788 // appending another compound word below.
3789 if (sp->ts_complen == sp->ts_compsplit && fword_ends
3790 && (flags & WF_NEEDCOMP))
3791 goodword_ends = false;
3792 else
3793 goodword_ends = true;
3794
3795 p = NULL;
3796 compound_ok = true;
3797 if (sp->ts_complen > sp->ts_compsplit) {
3798 if (slang->sl_nobreak) {
3799 // There was a word before this word. When there was no
3800 // change in this word (it was correct) add the first word
3801 // as a suggestion. If this word was corrected too, we
3802 // need to check if a correct word follows.
3803 if (sp->ts_fidx - sp->ts_splitfidx
3804 == sp->ts_twordlen - sp->ts_splitoff
3805 && STRNCMP(fword + sp->ts_splitfidx,
3806 tword + sp->ts_splitoff,
3807 sp->ts_fidx - sp->ts_splitfidx) == 0) {
3808 preword[sp->ts_prewordlen] = NUL;
3809 newscore = score_wordcount_adj(slang, sp->ts_score,
3810 preword + sp->ts_prewordlen,
3811 sp->ts_prewordlen > 0);
3812 // Add the suggestion if the score isn't too bad.
3813 if (newscore <= su->su_maxscore)
3814 add_suggestion(su, &su->su_ga, preword,
3815 sp->ts_splitfidx - repextra,
3816 newscore, 0, false,
3817 lp->lp_sallang, false);
3818 break;
3819 }
3820 } else {
3821 // There was a compound word before this word. If this
3822 // word does not support compounding then give up
3823 // (splitting is tried for the word without compound
3824 // flag).
3825 if (((unsigned)flags >> 24) == 0
3826 || sp->ts_twordlen - sp->ts_splitoff
3827 < slang->sl_compminlen)
3828 break;
3829 // For multi-byte chars check character length against
3830 // COMPOUNDMIN.
3831 if (has_mbyte
3832 && slang->sl_compminlen > 0
3833 && mb_charlen(tword + sp->ts_splitoff)
3834 < slang->sl_compminlen)
3835 break;
3836
3837 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
3838 compflags[sp->ts_complen + 1] = NUL;
3839 STRLCPY(preword + sp->ts_prewordlen,
3840 tword + sp->ts_splitoff,
3841 sp->ts_twordlen - sp->ts_splitoff + 1);
3842
3843 // Verify CHECKCOMPOUNDPATTERN rules.
3844 if (match_checkcompoundpattern(preword, sp->ts_prewordlen,
3845 &slang->sl_comppat))
3846 compound_ok = false;
3847
3848 if (compound_ok) {
3849 p = preword;
3850 while (*skiptowhite(p) != NUL)
3851 p = skipwhite(skiptowhite(p));
3852 if (fword_ends && !can_compound(slang, p,
3853 compflags + sp->ts_compsplit))
3854 // Compound is not allowed. But it may still be
3855 // possible if we add another (short) word.
3856 compound_ok = false;
3857 }
3858
3859 // Get pointer to last char of previous word.
3860 p = preword + sp->ts_prewordlen;
3861 MB_PTR_BACK(preword, p);
3862 }
3863 }
3864
3865 // Form the word with proper case in preword.
3866 // If there is a word from a previous split, append.
3867 // For the soundfold tree don't change the case, simply append.
3868 if (soundfold)
3869 STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff);
3870 else if (flags & WF_KEEPCAP)
3871 // Must find the word in the keep-case tree.
3872 find_keepcap_word(slang, tword + sp->ts_splitoff,
3873 preword + sp->ts_prewordlen);
3874 else {
3875 // Include badflags: If the badword is onecap or allcap
3876 // use that for the goodword too. But if the badword is
3877 // allcap and it's only one char long use onecap.
3878 c = su->su_badflags;
3879 if ((c & WF_ALLCAP)
3880 && su->su_badlen == (*mb_ptr2len)(su->su_badptr)
3881 )
3882 c = WF_ONECAP;
3883 c |= flags;
3884
3885 // When appending a compound word after a word character don't
3886 // use Onecap.
3887 if (p != NULL && spell_iswordp_nmw(p, curwin))
3888 c &= ~WF_ONECAP;
3889 make_case_word(tword + sp->ts_splitoff,
3890 preword + sp->ts_prewordlen, c);
3891 }
3892
3893 if (!soundfold) {
3894 // Don't use a banned word. It may appear again as a good
3895 // word, thus remember it.
3896 if (flags & WF_BANNED) {
3897 add_banned(su, preword + sp->ts_prewordlen);
3898 break;
3899 }
3900 if ((sp->ts_complen == sp->ts_compsplit
3901 && WAS_BANNED(su, preword + sp->ts_prewordlen))
3902 || WAS_BANNED(su, preword)) {
3903 if (slang->sl_compprog == NULL)
3904 break;
3905 // the word so far was banned but we may try compounding
3906 goodword_ends = false;
3907 }
3908 }
3909
3910 newscore = 0;
3911 if (!soundfold) { // soundfold words don't have flags
3912 if ((flags & WF_REGION)
3913 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
3914 newscore += SCORE_REGION;
3915 if (flags & WF_RARE)
3916 newscore += SCORE_RARE;
3917
3918 if (!spell_valid_case(su->su_badflags,
3919 captype(preword + sp->ts_prewordlen, NULL)))
3920 newscore += SCORE_ICASE;
3921 }
3922
3923 // TODO: how about splitting in the soundfold tree?
3924 if (fword_ends
3925 && goodword_ends
3926 && sp->ts_fidx >= sp->ts_fidxtry
3927 && compound_ok) {
3928 // The badword also ends: add suggestions.
3929#ifdef DEBUG_TRIEWALK
3930 if (soundfold && STRCMP(preword, "smwrd") == 0) {
3931 int j;
3932
3933 // print the stack of changes that brought us here
3934 smsg("------ %s -------", fword);
3935 for (j = 0; j < depth; ++j)
3936 smsg("%s", changename[j]);
3937 }
3938#endif
3939 if (soundfold) {
3940 // For soundfolded words we need to find the original
3941 // words, the edit distance and then add them.
3942 add_sound_suggest(su, preword, sp->ts_score, lp);
3943 } else if (sp->ts_fidx > 0) {
3944 // Give a penalty when changing non-word char to word
3945 // char, e.g., "thes," -> "these".
3946 p = fword + sp->ts_fidx;
3947 MB_PTR_BACK(fword, p);
3948 if (!spell_iswordp(p, curwin)) {
3949 p = preword + STRLEN(preword);
3950 MB_PTR_BACK(preword, p);
3951 if (spell_iswordp(p, curwin)) {
3952 newscore += SCORE_NONWORD;
3953 }
3954 }
3955
3956 // Give a bonus to words seen before.
3957 score = score_wordcount_adj(slang,
3958 sp->ts_score + newscore,
3959 preword + sp->ts_prewordlen,
3960 sp->ts_prewordlen > 0);
3961
3962 // Add the suggestion if the score isn't too bad.
3963 if (score <= su->su_maxscore) {
3964 add_suggestion(su, &su->su_ga, preword,
3965 sp->ts_fidx - repextra,
3966 score, 0, false, lp->lp_sallang, false);
3967
3968 if (su->su_badflags & WF_MIXCAP) {
3969 // We really don't know if the word should be
3970 // upper or lower case, add both.
3971 c = captype(preword, NULL);
3972 if (c == 0 || c == WF_ALLCAP) {
3973 make_case_word(tword + sp->ts_splitoff,
3974 preword + sp->ts_prewordlen,
3975 c == 0 ? WF_ALLCAP : 0);
3976
3977 add_suggestion(su, &su->su_ga, preword,
3978 sp->ts_fidx - repextra,
3979 score + SCORE_ICASE, 0, false,
3980 lp->lp_sallang, false);
3981 }
3982 }
3983 }
3984 }
3985 }
3986
3987 // Try word split and/or compounding.
3988 if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends)
3989 // Don't split in the middle of a character
3990 && (!has_mbyte || sp->ts_tcharlen == 0)
3991 ) {
3992 bool try_compound;
3993 int try_split;
3994
3995 // If past the end of the bad word don't try a split.
3996 // Otherwise try changing the next word. E.g., find
3997 // suggestions for "the the" where the second "the" is
3998 // different. It's done like a split.
3999 // TODO: word split for soundfold words
4000 try_split = (sp->ts_fidx - repextra < su->su_badlen)
4001 && !soundfold;
4002
4003 // Get here in several situations:
4004 // 1. The word in the tree ends:
4005 // If the word allows compounding try that. Otherwise try
4006 // a split by inserting a space. For both check that a
4007 // valid words starts at fword[sp->ts_fidx].
4008 // For NOBREAK do like compounding to be able to check if
4009 // the next word is valid.
4010 // 2. The badword does end, but it was due to a change (e.g.,
4011 // a swap). No need to split, but do check that the
4012 // following word is valid.
4013 // 3. The badword and the word in the tree end. It may still
4014 // be possible to compound another (short) word.
4015 try_compound = false;
4016 if (!soundfold
4017 && !slang->sl_nocompoundsugs
4018 && slang->sl_compprog != NULL
4019 && ((unsigned)flags >> 24) != 0
4020 && sp->ts_twordlen - sp->ts_splitoff
4021 >= slang->sl_compminlen
4022 && (!has_mbyte
4023 || slang->sl_compminlen == 0
4024 || mb_charlen(tword + sp->ts_splitoff)
4025 >= slang->sl_compminlen)
4026 && (slang->sl_compsylmax < MAXWLEN
4027 || sp->ts_complen + 1 - sp->ts_compsplit
4028 < slang->sl_compmax)
4029 && (can_be_compound(sp, slang,
4030 compflags, ((unsigned)flags >> 24)))) {
4031 try_compound = true;
4032 compflags[sp->ts_complen] = ((unsigned)flags >> 24);
4033 compflags[sp->ts_complen + 1] = NUL;
4034 }
4035
4036 // For NOBREAK we never try splitting, it won't make any word
4037 // valid.
4038 if (slang->sl_nobreak && !slang->sl_nocompoundsugs) {
4039 try_compound = true;
4040 } else if (!fword_ends
4041 && try_compound
4042 && (sp->ts_flags & TSF_DIDSPLIT) == 0) {
4043 // If we could add a compound word, and it's also possible to
4044 // split at this point, do the split first and set
4045 // TSF_DIDSPLIT to avoid doing it again.
4046 try_compound = false;
4047 sp->ts_flags |= TSF_DIDSPLIT;
4048 --sp->ts_curi; // do the same NUL again
4049 compflags[sp->ts_complen] = NUL;
4050 } else {
4051 sp->ts_flags &= ~TSF_DIDSPLIT;
4052 }
4053
4054 if (try_split || try_compound) {
4055 if (!try_compound && (!fword_ends || !goodword_ends)) {
4056 // If we're going to split need to check that the
4057 // words so far are valid for compounding. If there
4058 // is only one word it must not have the NEEDCOMPOUND
4059 // flag.
4060 if (sp->ts_complen == sp->ts_compsplit
4061 && (flags & WF_NEEDCOMP))
4062 break;
4063 p = preword;
4064 while (*skiptowhite(p) != NUL)
4065 p = skipwhite(skiptowhite(p));
4066 if (sp->ts_complen > sp->ts_compsplit
4067 && !can_compound(slang, p,
4068 compflags + sp->ts_compsplit))
4069 break;
4070
4071 if (slang->sl_nosplitsugs)
4072 newscore += SCORE_SPLIT_NO;
4073 else
4074 newscore += SCORE_SPLIT;
4075
4076 // Give a bonus to words seen before.
4077 newscore = score_wordcount_adj(slang, newscore,
4078 preword + sp->ts_prewordlen, true);
4079 }
4080
4081 if (TRY_DEEPER(su, stack, depth, newscore)) {
4082 go_deeper(stack, depth, newscore);
4083#ifdef DEBUG_TRIEWALK
4084 if (!try_compound && !fword_ends)
4085 sprintf(changename[depth], "%.*s-%s: split",
4086 sp->ts_twordlen, tword, fword + sp->ts_fidx);
4087 else
4088 sprintf(changename[depth], "%.*s-%s: compound",
4089 sp->ts_twordlen, tword, fword + sp->ts_fidx);
4090#endif
4091 // Save things to be restored at STATE_SPLITUNDO.
4092 sp->ts_save_badflags = su->su_badflags;
4093 PROF_STORE(sp->ts_state)
4094 sp->ts_state = STATE_SPLITUNDO;
4095
4096 ++depth;
4097 sp = &stack[depth];
4098
4099 // Append a space to preword when splitting.
4100 if (!try_compound && !fword_ends)
4101 STRCAT(preword, " ");
4102 sp->ts_prewordlen = (char_u)STRLEN(preword);
4103 sp->ts_splitoff = sp->ts_twordlen;
4104 sp->ts_splitfidx = sp->ts_fidx;
4105
4106 // If the badword has a non-word character at this
4107 // position skip it. That means replacing the
4108 // non-word character with a space. Always skip a
4109 // character when the word ends. But only when the
4110 // good word can end.
4111 if (((!try_compound && !spell_iswordp_nmw(fword
4112 + sp->ts_fidx,
4113 curwin))
4114 || fword_ends)
4115 && fword[sp->ts_fidx] != NUL
4116 && goodword_ends) {
4117 int l;
4118
4119 l = MB_PTR2LEN(fword + sp->ts_fidx);
4120 if (fword_ends) {
4121 // Copy the skipped character to preword.
4122 memmove(preword + sp->ts_prewordlen,
4123 fword + sp->ts_fidx, l);
4124 sp->ts_prewordlen += l;
4125 preword[sp->ts_prewordlen] = NUL;
4126 } else
4127 sp->ts_score -= SCORE_SPLIT - SCORE_SUBST;
4128 sp->ts_fidx += l;
4129 }
4130
4131 // When compounding include compound flag in
4132 // compflags[] (already set above). When splitting we
4133 // may start compounding over again.
4134 if (try_compound)
4135 ++sp->ts_complen;
4136 else
4137 sp->ts_compsplit = sp->ts_complen;
4138 sp->ts_prefixdepth = PFD_NOPREFIX;
4139
4140 // set su->su_badflags to the caps type at this
4141 // position
4142 if (has_mbyte)
4143 n = nofold_len(fword, sp->ts_fidx, su->su_badptr);
4144 else
4145 n = sp->ts_fidx;
4146 su->su_badflags = badword_captype(su->su_badptr + n,
4147 su->su_badptr + su->su_badlen);
4148
4149 // Restart at top of the tree.
4150 sp->ts_arridx = 0;
4151
4152 // If there are postponed prefixes, try these too.
4153 if (pbyts != NULL) {
4154 byts = pbyts;
4155 idxs = pidxs;
4156 sp->ts_prefixdepth = PFD_PREFIXTREE;
4157 PROF_STORE(sp->ts_state)
4158 sp->ts_state = STATE_NOPREFIX;
4159 }
4160 }
4161 }
4162 }
4163 break;
4164
4165 case STATE_SPLITUNDO:
4166 // Undo the changes done for word split or compound word.
4167 su->su_badflags = sp->ts_save_badflags;
4168
4169 // Continue looking for NUL bytes.
4170 PROF_STORE(sp->ts_state)
4171 sp->ts_state = STATE_START;
4172
4173 // In case we went into the prefix tree.
4174 byts = fbyts;
4175 idxs = fidxs;
4176 break;
4177
4178 case STATE_ENDNUL:
4179 // Past the NUL bytes in the node.
4180 su->su_badflags = sp->ts_save_badflags;
4181 if (fword[sp->ts_fidx] == NUL
4182 && sp->ts_tcharlen == 0
4183 ) {
4184 // The badword ends, can't use STATE_PLAIN.
4185 PROF_STORE(sp->ts_state)
4186 sp->ts_state = STATE_DEL;
4187 break;
4188 }
4189 PROF_STORE(sp->ts_state)
4190 sp->ts_state = STATE_PLAIN;
4191 FALLTHROUGH;
4192
4193 case STATE_PLAIN:
4194 // Go over all possible bytes at this node, add each to tword[]
4195 // and use child node. "ts_curi" is the index.
4196 arridx = sp->ts_arridx;
4197 if (sp->ts_curi > byts[arridx]) {
4198 // Done all bytes at this node, do next state. When still at
4199 // already changed bytes skip the other tricks.
4200 PROF_STORE(sp->ts_state)
4201 if (sp->ts_fidx >= sp->ts_fidxtry) {
4202 sp->ts_state = STATE_DEL;
4203 } else {
4204 sp->ts_state = STATE_FINAL;
4205 }
4206 } else {
4207 arridx += sp->ts_curi++;
4208 c = byts[arridx];
4209
4210 // Normal byte, go one level deeper. If it's not equal to the
4211 // byte in the bad word adjust the score. But don't even try
4212 // when the byte was already changed. And don't try when we
4213 // just deleted this byte, accepting it is always cheaper than
4214 // delete + substitute.
4215 if (c == fword[sp->ts_fidx]
4216 || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE)
4217 )
4218 newscore = 0;
4219 else
4220 newscore = SCORE_SUBST;
4221 if ((newscore == 0
4222 || (sp->ts_fidx >= sp->ts_fidxtry
4223 && ((sp->ts_flags & TSF_DIDDEL) == 0
4224 || c != fword[sp->ts_delidx])))
4225 && TRY_DEEPER(su, stack, depth, newscore)) {
4226 go_deeper(stack, depth, newscore);
4227#ifdef DEBUG_TRIEWALK
4228 if (newscore > 0)
4229 sprintf(changename[depth], "%.*s-%s: subst %c to %c",
4230 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4231 fword[sp->ts_fidx], c);
4232 else
4233 sprintf(changename[depth], "%.*s-%s: accept %c",
4234 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4235 fword[sp->ts_fidx]);
4236#endif
4237 ++depth;
4238 sp = &stack[depth];
4239 ++sp->ts_fidx;
4240 tword[sp->ts_twordlen++] = c;
4241 sp->ts_arridx = idxs[arridx];
4242 if (newscore == SCORE_SUBST)
4243 sp->ts_isdiff = DIFF_YES;
4244 if (has_mbyte) {
4245 // Multi-byte characters are a bit complicated to
4246 // handle: They differ when any of the bytes differ
4247 // and then their length may also differ.
4248 if (sp->ts_tcharlen == 0) {
4249 // First byte.
4250 sp->ts_tcharidx = 0;
4251 sp->ts_tcharlen = MB_BYTE2LEN(c);
4252 sp->ts_fcharstart = sp->ts_fidx - 1;
4253 sp->ts_isdiff = (newscore != 0)
4254 ? DIFF_YES : DIFF_NONE;
4255 } else if (sp->ts_isdiff == DIFF_INSERT)
4256 // When inserting trail bytes don't advance in the
4257 // bad word.
4258 --sp->ts_fidx;
4259 if (++sp->ts_tcharidx == sp->ts_tcharlen) {
4260 // Last byte of character.
4261 if (sp->ts_isdiff == DIFF_YES) {
4262 // Correct ts_fidx for the byte length of the
4263 // character (we didn't check that before).
4264 sp->ts_fidx = sp->ts_fcharstart
4265 + MB_PTR2LEN(fword + sp->ts_fcharstart);
4266
4267 // For changing a composing character adjust
4268 // the score from SCORE_SUBST to
4269 // SCORE_SUBCOMP.
4270 if (enc_utf8
4271 && utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
4272 - sp->ts_tcharlen))
4273 && utf_iscomposing(utf_ptr2char(fword
4274 + sp->ts_fcharstart))) {
4275 sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
4276 } else if (
4277 !soundfold
4278 && slang->sl_has_map
4279 && similar_chars(
4280 slang,
4281 utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen),
4282 utf_ptr2char(fword + sp->ts_fcharstart))) {
4283 // For a similar character adjust score from
4284 // SCORE_SUBST to SCORE_SIMILAR.
4285 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
4286 }
4287 } else if (sp->ts_isdiff == DIFF_INSERT
4288 && sp->ts_twordlen > sp->ts_tcharlen) {
4289 p = tword + sp->ts_twordlen - sp->ts_tcharlen;
4290 c = utf_ptr2char(p);
4291 if (utf_iscomposing(c)) {
4292 // Inserting a composing char doesn't
4293 // count that much.
4294 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
4295 } else {
4296 // If the previous character was the same,
4297 // thus doubling a character, give a bonus
4298 // to the score. Also for the soundfold
4299 // tree (might seem illogical but does
4300 // give better scores).
4301 MB_PTR_BACK(tword, p);
4302 if (c == utf_ptr2char(p)) {
4303 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
4304 }
4305 }
4306 }
4307
4308 // Starting a new char, reset the length.
4309 sp->ts_tcharlen = 0;
4310 }
4311 } else {
4312 // If we found a similar char adjust the score.
4313 // We do this after calling go_deeper() because
4314 // it's slow.
4315 if (newscore != 0
4316 && !soundfold
4317 && slang->sl_has_map
4318 && similar_chars(slang,
4319 c, fword[sp->ts_fidx - 1]))
4320 sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR;
4321 }
4322 }
4323 }
4324 break;
4325
4326 case STATE_DEL:
4327 // When past the first byte of a multi-byte char don't try
4328 // delete/insert/swap a character.
4329 if (has_mbyte && sp->ts_tcharlen > 0) {
4330 PROF_STORE(sp->ts_state)
4331 sp->ts_state = STATE_FINAL;
4332 break;
4333 }
4334 // Try skipping one character in the bad word (delete it).
4335 PROF_STORE(sp->ts_state)
4336 sp->ts_state = STATE_INS_PREP;
4337 sp->ts_curi = 1;
4338 if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*')
4339 // Deleting a vowel at the start of a word counts less, see
4340 // soundalike_score().
4341 newscore = 2 * SCORE_DEL / 3;
4342 else
4343 newscore = SCORE_DEL;
4344 if (fword[sp->ts_fidx] != NUL
4345 && TRY_DEEPER(su, stack, depth, newscore)) {
4346 go_deeper(stack, depth, newscore);
4347#ifdef DEBUG_TRIEWALK
4348 sprintf(changename[depth], "%.*s-%s: delete %c",
4349 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4350 fword[sp->ts_fidx]);
4351#endif
4352 ++depth;
4353
4354 // Remember what character we deleted, so that we can avoid
4355 // inserting it again.
4356 stack[depth].ts_flags |= TSF_DIDDEL;
4357 stack[depth].ts_delidx = sp->ts_fidx;
4358
4359 // Advance over the character in fword[]. Give a bonus to the
4360 // score if the same character is following "nn" -> "n". It's
4361 // a bit illogical for soundfold tree but it does give better
4362 // results.
4363 c = utf_ptr2char(fword + sp->ts_fidx);
4364 stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx);
4365 if (utf_iscomposing(c)) {
4366 stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
4367 } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
4368 stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
4369 }
4370
4371 break;
4372 }
4373 FALLTHROUGH;
4374
4375 case STATE_INS_PREP:
4376 if (sp->ts_flags & TSF_DIDDEL) {
4377 // If we just deleted a byte then inserting won't make sense,
4378 // a substitute is always cheaper.
4379 PROF_STORE(sp->ts_state)
4380 sp->ts_state = STATE_SWAP;
4381 break;
4382 }
4383
4384 // skip over NUL bytes
4385 n = sp->ts_arridx;
4386 for (;; ) {
4387 if (sp->ts_curi > byts[n]) {
4388 // Only NUL bytes at this node, go to next state.
4389 PROF_STORE(sp->ts_state)
4390 sp->ts_state = STATE_SWAP;
4391 break;
4392 }
4393 if (byts[n + sp->ts_curi] != NUL) {
4394 // Found a byte to insert.
4395 PROF_STORE(sp->ts_state)
4396 sp->ts_state = STATE_INS;
4397 break;
4398 }
4399 ++sp->ts_curi;
4400 }
4401 break;
4402
4403 FALLTHROUGH;
4404
4405 case STATE_INS:
4406 // Insert one byte. Repeat this for each possible byte at this
4407 // node.
4408 n = sp->ts_arridx;
4409 if (sp->ts_curi > byts[n]) {
4410 // Done all bytes at this node, go to next state.
4411 PROF_STORE(sp->ts_state)
4412 sp->ts_state = STATE_SWAP;
4413 break;
4414 }
4415
4416 // Do one more byte at this node, but:
4417 // - Skip NUL bytes.
4418 // - Skip the byte if it's equal to the byte in the word,
4419 // accepting that byte is always better.
4420 n += sp->ts_curi++;
4421 c = byts[n];
4422 if (soundfold && sp->ts_twordlen == 0 && c == '*')
4423 // Inserting a vowel at the start of a word counts less,
4424 // see soundalike_score().
4425 newscore = 2 * SCORE_INS / 3;
4426 else
4427 newscore = SCORE_INS;
4428 if (c != fword[sp->ts_fidx]
4429 && TRY_DEEPER(su, stack, depth, newscore)) {
4430 go_deeper(stack, depth, newscore);
4431#ifdef DEBUG_TRIEWALK
4432 sprintf(changename[depth], "%.*s-%s: insert %c",
4433 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4434 c);
4435#endif
4436 ++depth;
4437 sp = &stack[depth];
4438 tword[sp->ts_twordlen++] = c;
4439 sp->ts_arridx = idxs[n];
4440 if (has_mbyte) {
4441 fl = MB_BYTE2LEN(c);
4442 if (fl > 1) {
4443 // There are following bytes for the same character.
4444 // We must find all bytes before trying
4445 // delete/insert/swap/etc.
4446 sp->ts_tcharlen = fl;
4447 sp->ts_tcharidx = 1;
4448 sp->ts_isdiff = DIFF_INSERT;
4449 }
4450 } else
4451 fl = 1;
4452 if (fl == 1) {
4453 // If the previous character was the same, thus doubling a
4454 // character, give a bonus to the score. Also for
4455 // soundfold words (illogical but does give a better
4456 // score).
4457 if (sp->ts_twordlen >= 2
4458 && tword[sp->ts_twordlen - 2] == c)
4459 sp->ts_score -= SCORE_INS - SCORE_INSDUP;
4460 }
4461 }
4462 break;
4463
4464 case STATE_SWAP:
4465 // Swap two bytes in the bad word: "12" -> "21".
4466 // We change "fword" here, it's changed back afterwards at
4467 // STATE_UNSWAP.
4468 p = fword + sp->ts_fidx;
4469 c = *p;
4470 if (c == NUL) {
4471 // End of word, can't swap or replace.
4472 PROF_STORE(sp->ts_state)
4473 sp->ts_state = STATE_FINAL;
4474 break;
4475 }
4476
4477 // Don't swap if the first character is not a word character.
4478 // SWAP3 etc. also don't make sense then.
4479 if (!soundfold && !spell_iswordp(p, curwin)) {
4480 PROF_STORE(sp->ts_state)
4481 sp->ts_state = STATE_REP_INI;
4482 break;
4483 }
4484
4485 n = MB_CPTR2LEN(p);
4486 c = utf_ptr2char(p);
4487 if (p[n] == NUL) {
4488 c2 = NUL;
4489 } else if (!soundfold && !spell_iswordp(p + n, curwin)) {
4490 c2 = c; // don't swap non-word char
4491 } else {
4492 c2 = utf_ptr2char(p + n);
4493 }
4494
4495 // When the second character is NUL we can't swap.
4496 if (c2 == NUL) {
4497 PROF_STORE(sp->ts_state)
4498 sp->ts_state = STATE_REP_INI;
4499 break;
4500 }
4501
4502 // When characters are identical, swap won't do anything.
4503 // Also get here if the second char is not a word character.
4504 if (c == c2) {
4505 PROF_STORE(sp->ts_state)
4506 sp->ts_state = STATE_SWAP3;
4507 break;
4508 }
4509 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP)) {
4510 go_deeper(stack, depth, SCORE_SWAP);
4511#ifdef DEBUG_TRIEWALK
4512 snprintf(changename[depth], sizeof(changename[0]),
4513 "%.*s-%s: swap %c and %c",
4514 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4515 c, c2);
4516#endif
4517 PROF_STORE(sp->ts_state)
4518 sp->ts_state = STATE_UNSWAP;
4519 depth++;
4520 fl = mb_char2len(c2);
4521 memmove(p, p + n, fl);
4522 utf_char2bytes(c, p + fl);
4523 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
4524 } else {
4525 // If this swap doesn't work then SWAP3 won't either.
4526 PROF_STORE(sp->ts_state)
4527 sp->ts_state = STATE_REP_INI;
4528 }
4529 break;
4530
4531 case STATE_UNSWAP:
4532 // Undo the STATE_SWAP swap: "21" -> "12".
4533 p = fword + sp->ts_fidx;
4534 n = MB_PTR2LEN(p);
4535 c = utf_ptr2char(p + n);
4536 memmove(p + MB_PTR2LEN(p + n), p, n);
4537 utf_char2bytes(c, p);
4538
4539 FALLTHROUGH;
4540
4541 case STATE_SWAP3:
4542 // Swap two bytes, skipping one: "123" -> "321". We change
4543 // "fword" here, it's changed back afterwards at STATE_UNSWAP3.
4544 p = fword + sp->ts_fidx;
4545 n = MB_CPTR2LEN(p);
4546 c = utf_ptr2char(p);
4547 fl = MB_CPTR2LEN(p + n);
4548 c2 = utf_ptr2char(p + n);
4549 if (!soundfold && !spell_iswordp(p + n + fl, curwin)) {
4550 c3 = c; // don't swap non-word char
4551 } else {
4552 c3 = utf_ptr2char(p + n + fl);
4553 }
4554
4555 // When characters are identical: "121" then SWAP3 result is
4556 // identical, ROT3L result is same as SWAP: "211", ROT3L result is
4557 // same as SWAP on next char: "112". Thus skip all swapping.
4558 // Also skip when c3 is NUL.
4559 // Also get here when the third character is not a word character.
4560 // Second character may any char: "a.b" -> "b.a"
4561 if (c == c3 || c3 == NUL) {
4562 PROF_STORE(sp->ts_state)
4563 sp->ts_state = STATE_REP_INI;
4564 break;
4565 }
4566 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) {
4567 go_deeper(stack, depth, SCORE_SWAP3);
4568#ifdef DEBUG_TRIEWALK
4569 sprintf(changename[depth], "%.*s-%s: swap3 %c and %c",
4570 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4571 c, c3);
4572#endif
4573 PROF_STORE(sp->ts_state)
4574 sp->ts_state = STATE_UNSWAP3;
4575 depth++;
4576 tl = mb_char2len(c3);
4577 memmove(p, p + n + fl, tl);
4578 utf_char2bytes(c2, p + tl);
4579 utf_char2bytes(c, p + fl + tl);
4580 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl;
4581 } else {
4582 PROF_STORE(sp->ts_state)
4583 sp->ts_state = STATE_REP_INI;
4584 }
4585 break;
4586
4587 case STATE_UNSWAP3:
4588 // Undo STATE_SWAP3: "321" -> "123"
4589 p = fword + sp->ts_fidx;
4590 n = MB_PTR2LEN(p);
4591 c2 = utf_ptr2char(p + n);
4592 fl = MB_PTR2LEN(p + n);
4593 c = utf_ptr2char(p + n + fl);
4594 tl = MB_PTR2LEN(p + n + fl);
4595 memmove(p + fl + tl, p, n);
4596 utf_char2bytes(c, p);
4597 utf_char2bytes(c2, p + tl);
4598 p = p + tl;
4599
4600 if (!soundfold && !spell_iswordp(p, curwin)) {
4601 // Middle char is not a word char, skip the rotate. First and
4602 // third char were already checked at swap and swap3.
4603 PROF_STORE(sp->ts_state)
4604 sp->ts_state = STATE_REP_INI;
4605 break;
4606 }
4607
4608 // Rotate three characters left: "123" -> "231". We change
4609 // "fword" here, it's changed back afterwards at STATE_UNROT3L.
4610 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) {
4611 go_deeper(stack, depth, SCORE_SWAP3);
4612#ifdef DEBUG_TRIEWALK
4613 p = fword + sp->ts_fidx;
4614 sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c",
4615 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4616 p[0], p[1], p[2]);
4617#endif
4618 PROF_STORE(sp->ts_state)
4619 sp->ts_state = STATE_UNROT3L;
4620 ++depth;
4621 p = fword + sp->ts_fidx;
4622 n = MB_CPTR2LEN(p);
4623 c = utf_ptr2char(p);
4624 fl = MB_CPTR2LEN(p + n);
4625 fl += MB_CPTR2LEN(p + n + fl);
4626 memmove(p, p + n, fl);
4627 utf_char2bytes(c, p + fl);
4628 stack[depth].ts_fidxtry = sp->ts_fidx + n + fl;
4629 } else {
4630 PROF_STORE(sp->ts_state)
4631 sp->ts_state = STATE_REP_INI;
4632 }
4633 break;
4634
4635 case STATE_UNROT3L:
4636 // Undo ROT3L: "231" -> "123"
4637 p = fword + sp->ts_fidx;
4638 n = MB_PTR2LEN(p);
4639 n += MB_PTR2LEN(p + n);
4640 c = utf_ptr2char(p + n);
4641 tl = MB_PTR2LEN(p + n);
4642 memmove(p + tl, p, n);
4643 utf_char2bytes(c, p);
4644
4645 // Rotate three bytes right: "123" -> "312". We change "fword"
4646 // here, it's changed back afterwards at STATE_UNROT3R.
4647 if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) {
4648 go_deeper(stack, depth, SCORE_SWAP3);
4649#ifdef DEBUG_TRIEWALK
4650 p = fword + sp->ts_fidx;
4651 sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c",
4652 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4653 p[0], p[1], p[2]);
4654#endif
4655 PROF_STORE(sp->ts_state)
4656 sp->ts_state = STATE_UNROT3R;
4657 ++depth;
4658 p = fword + sp->ts_fidx;
4659 n = MB_CPTR2LEN(p);
4660 n += MB_CPTR2LEN(p + n);
4661 c = utf_ptr2char(p + n);
4662 tl = MB_CPTR2LEN(p + n);
4663 memmove(p + tl, p, n);
4664 utf_char2bytes(c, p);
4665 stack[depth].ts_fidxtry = sp->ts_fidx + n + tl;
4666 } else {
4667 PROF_STORE(sp->ts_state)
4668 sp->ts_state = STATE_REP_INI;
4669 }
4670 break;
4671
4672 case STATE_UNROT3R:
4673 // Undo ROT3R: "312" -> "123"
4674 p = fword + sp->ts_fidx;
4675 c = utf_ptr2char(p);
4676 tl = MB_PTR2LEN(p);
4677 n = MB_PTR2LEN(p + tl);
4678 n += MB_PTR2LEN(p + tl + n);
4679 memmove(p, p + tl, n);
4680 utf_char2bytes(c, p + n);
4681
4682 FALLTHROUGH;
4683
4684 case STATE_REP_INI:
4685 // Check if matching with REP items from the .aff file would work.
4686 // Quickly skip if:
4687 // - there are no REP items and we are not in the soundfold trie
4688 // - the score is going to be too high anyway
4689 // - already applied a REP item or swapped here
4690 if ((lp->lp_replang == NULL && !soundfold)
4691 || sp->ts_score + SCORE_REP >= su->su_maxscore
4692 || sp->ts_fidx < sp->ts_fidxtry) {
4693 PROF_STORE(sp->ts_state)
4694 sp->ts_state = STATE_FINAL;
4695 break;
4696 }
4697
4698 // Use the first byte to quickly find the first entry that may
4699 // match. If the index is -1 there is none.
4700 if (soundfold)
4701 sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]];
4702 else
4703 sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]];
4704
4705 if (sp->ts_curi < 0) {
4706 PROF_STORE(sp->ts_state)
4707 sp->ts_state = STATE_FINAL;
4708 break;
4709 }
4710
4711 PROF_STORE(sp->ts_state)
4712 sp->ts_state = STATE_REP;
4713 FALLTHROUGH;
4714
4715 case STATE_REP:
4716 // Try matching with REP items from the .aff file. For each match
4717 // replace the characters and check if the resulting word is
4718 // valid.
4719 p = fword + sp->ts_fidx;
4720
4721 if (soundfold)
4722 gap = &slang->sl_repsal;
4723 else
4724 gap = &lp->lp_replang->sl_rep;
4725 while (sp->ts_curi < gap->ga_len) {
4726 ftp = (fromto_T *)gap->ga_data + sp->ts_curi++;
4727 if (*ftp->ft_from != *p) {
4728 // past possible matching entries
4729 sp->ts_curi = gap->ga_len;
4730 break;
4731 }
4732 if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0
4733 && TRY_DEEPER(su, stack, depth, SCORE_REP)) {
4734 go_deeper(stack, depth, SCORE_REP);
4735#ifdef DEBUG_TRIEWALK
4736 sprintf(changename[depth], "%.*s-%s: replace %s with %s",
4737 sp->ts_twordlen, tword, fword + sp->ts_fidx,
4738 ftp->ft_from, ftp->ft_to);
4739#endif
4740 // Need to undo this afterwards.
4741 PROF_STORE(sp->ts_state)
4742 sp->ts_state = STATE_REP_UNDO;
4743
4744 // Change the "from" to the "to" string.
4745 ++depth;
4746 fl = (int)STRLEN(ftp->ft_from);
4747 tl = (int)STRLEN(ftp->ft_to);
4748 if (fl != tl) {
4749 STRMOVE(p + tl, p + fl);
4750 repextra += tl - fl;
4751 }
4752 memmove(p, ftp->ft_to, tl);
4753 stack[depth].ts_fidxtry = sp->ts_fidx + tl;
4754 stack[depth].ts_tcharlen = 0;
4755 break;
4756 }
4757 }
4758
4759 if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP)
4760 // No (more) matches.
4761 PROF_STORE(sp->ts_state)
4762 sp->ts_state = STATE_FINAL;
4763
4764 break;
4765
4766 case STATE_REP_UNDO:
4767 // Undo a REP replacement and continue with the next one.
4768 if (soundfold)
4769 gap = &slang->sl_repsal;
4770 else
4771 gap = &lp->lp_replang->sl_rep;
4772 ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1;
4773 fl = (int)STRLEN(ftp->ft_from);
4774 tl = (int)STRLEN(ftp->ft_to);
4775 p = fword + sp->ts_fidx;
4776 if (fl != tl) {
4777 STRMOVE(p + fl, p + tl);
4778 repextra -= tl - fl;
4779 }
4780 memmove(p, ftp->ft_from, fl);
4781 PROF_STORE(sp->ts_state)
4782 sp->ts_state = STATE_REP;
4783 break;
4784
4785 default:
4786 // Did all possible states at this level, go up one level.
4787 --depth;
4788
4789 if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) {
4790 // Continue in or go back to the prefix tree.
4791 byts = pbyts;
4792 idxs = pidxs;
4793 }
4794
4795 // Don't check for CTRL-C too often, it takes time.
4796 if (--breakcheckcount == 0) {
4797 os_breakcheck();
4798 breakcheckcount = 1000;
4799 }
4800 }
4801 }
4802}
4803
4804
4805// Go one level deeper in the tree.
4806static void go_deeper(trystate_T *stack, int depth, int score_add)
4807{
4808 stack[depth + 1] = stack[depth];
4809 stack[depth + 1].ts_state = STATE_START;
4810 stack[depth + 1].ts_score = stack[depth].ts_score + score_add;
4811 stack[depth + 1].ts_curi = 1; // start just after length byte
4812 stack[depth + 1].ts_flags = 0;
4813}
4814
4815// Case-folding may change the number of bytes: Count nr of chars in
4816// fword[flen] and return the byte length of that many chars in "word".
4817static int nofold_len(char_u *fword, int flen, char_u *word)
4818{
4819 char_u *p;
4820 int i = 0;
4821
4822 for (p = fword; p < fword + flen; MB_PTR_ADV(p)) {
4823 i++;
4824 }
4825 for (p = word; i > 0; MB_PTR_ADV(p)) {
4826 i--;
4827 }
4828 return (int)(p - word);
4829}
4830
4831// "fword" is a good word with case folded. Find the matching keep-case
4832// words and put it in "kword".
4833// Theoretically there could be several keep-case words that result in the
4834// same case-folded word, but we only find one...
4835static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword)
4836{
4837 char_u uword[MAXWLEN]; // "fword" in upper-case
4838 int depth;
4839 idx_T tryidx;
4840
4841 // The following arrays are used at each depth in the tree.
4842 idx_T arridx[MAXWLEN];
4843 int round[MAXWLEN];
4844 int fwordidx[MAXWLEN];
4845 int uwordidx[MAXWLEN];
4846 int kwordlen[MAXWLEN];
4847
4848 int flen, ulen;
4849 int l;
4850 int len;
4851 int c;
4852 idx_T lo, hi, m;
4853 char_u *p;
4854 char_u *byts = slang->sl_kbyts; // array with bytes of the words
4855 idx_T *idxs = slang->sl_kidxs; // array with indexes
4856
4857 if (byts == NULL) {
4858 // array is empty: "cannot happen"
4859 *kword = NUL;
4860 return;
4861 }
4862
4863 // Make an all-cap version of "fword".
4864 allcap_copy(fword, uword);
4865
4866 // Each character needs to be tried both case-folded and upper-case.
4867 // All this gets very complicated if we keep in mind that changing case
4868 // may change the byte length of a multi-byte character...
4869 depth = 0;
4870 arridx[0] = 0;
4871 round[0] = 0;
4872 fwordidx[0] = 0;
4873 uwordidx[0] = 0;
4874 kwordlen[0] = 0;
4875 while (depth >= 0) {
4876 if (fword[fwordidx[depth]] == NUL) {
4877 // We are at the end of "fword". If the tree allows a word to end
4878 // here we have found a match.
4879 if (byts[arridx[depth] + 1] == 0) {
4880 kword[kwordlen[depth]] = NUL;
4881 return;
4882 }
4883
4884 // kword is getting too long, continue one level up
4885 --depth;
4886 } else if (++round[depth] > 2) {
4887 // tried both fold-case and upper-case character, continue one
4888 // level up
4889 --depth;
4890 } else {
4891 // round[depth] == 1: Try using the folded-case character.
4892 // round[depth] == 2: Try using the upper-case character.
4893 if (has_mbyte) {
4894 flen = MB_CPTR2LEN(fword + fwordidx[depth]);
4895 ulen = MB_CPTR2LEN(uword + uwordidx[depth]);
4896 } else {
4897 ulen = flen = 1;
4898 }
4899 if (round[depth] == 1) {
4900 p = fword + fwordidx[depth];
4901 l = flen;
4902 } else {
4903 p = uword + uwordidx[depth];
4904 l = ulen;
4905 }
4906
4907 for (tryidx = arridx[depth]; l > 0; --l) {
4908 // Perform a binary search in the list of accepted bytes.
4909 len = byts[tryidx++];
4910 c = *p++;
4911 lo = tryidx;
4912 hi = tryidx + len - 1;
4913 while (lo < hi) {
4914 m = (lo + hi) / 2;
4915 if (byts[m] > c)
4916 hi = m - 1;
4917 else if (byts[m] < c)
4918 lo = m + 1;
4919 else {
4920 lo = hi = m;
4921 break;
4922 }
4923 }
4924
4925 // Stop if there is no matching byte.
4926 if (hi < lo || byts[lo] != c)
4927 break;
4928
4929 // Continue at the child (if there is one).
4930 tryidx = idxs[lo];
4931 }
4932
4933 if (l == 0) {
4934 // Found the matching char. Copy it to "kword" and go a
4935 // level deeper.
4936 if (round[depth] == 1) {
4937 STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth],
4938 flen);
4939 kwordlen[depth + 1] = kwordlen[depth] + flen;
4940 } else {
4941 STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth],
4942 ulen);
4943 kwordlen[depth + 1] = kwordlen[depth] + ulen;
4944 }
4945 fwordidx[depth + 1] = fwordidx[depth] + flen;
4946 uwordidx[depth + 1] = uwordidx[depth] + ulen;
4947
4948 ++depth;
4949 arridx[depth] = tryidx;
4950 round[depth] = 0;
4951 }
4952 }
4953 }
4954
4955 // Didn't find it: "cannot happen".
4956 *kword = NUL;
4957}
4958
4959// Compute the sound-a-like score for suggestions in su->su_ga and add them to
4960// su->su_sga.
4961static void score_comp_sal(suginfo_T *su)
4962{
4963 langp_T *lp;
4964 char_u badsound[MAXWLEN];
4965 int i;
4966 suggest_T *stp;
4967 suggest_T *sstp;
4968 int score;
4969
4970 ga_grow(&su->su_sga, su->su_ga.ga_len);
4971
4972 // Use the sound-folding of the first language that supports it.
4973 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
4974 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
4975 if (!GA_EMPTY(&lp->lp_slang->sl_sal)) {
4976 // soundfold the bad word
4977 spell_soundfold(lp->lp_slang, su->su_fbadword, true, badsound);
4978
4979 for (i = 0; i < su->su_ga.ga_len; ++i) {
4980 stp = &SUG(su->su_ga, i);
4981
4982 // Case-fold the suggested word, sound-fold it and compute the
4983 // sound-a-like score.
4984 score = stp_sal_score(stp, su, lp->lp_slang, badsound);
4985 if (score < SCORE_MAXMAX) {
4986 // Add the suggestion.
4987 sstp = &SUG(su->su_sga, su->su_sga.ga_len);
4988 sstp->st_word = vim_strsave(stp->st_word);
4989 sstp->st_wordlen = stp->st_wordlen;
4990 sstp->st_score = score;
4991 sstp->st_altscore = 0;
4992 sstp->st_orglen = stp->st_orglen;
4993 ++su->su_sga.ga_len;
4994 }
4995 }
4996 break;
4997 }
4998 }
4999}
5000
5001// Combine the list of suggestions in su->su_ga and su->su_sga.
5002// They are entwined.
5003static void score_combine(suginfo_T *su)
5004{
5005 garray_T ga;
5006 garray_T *gap;
5007 langp_T *lp;
5008 suggest_T *stp;
5009 char_u *p;
5010 char_u badsound[MAXWLEN];
5011 int round;
5012 slang_T *slang = NULL;
5013
5014 // Add the alternate score to su_ga.
5015 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
5016 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
5017 if (!GA_EMPTY(&lp->lp_slang->sl_sal)) {
5018 // soundfold the bad word
5019 slang = lp->lp_slang;
5020 spell_soundfold(slang, su->su_fbadword, true, badsound);
5021
5022 for (int i = 0; i < su->su_ga.ga_len; ++i) {
5023 stp = &SUG(su->su_ga, i);
5024 stp->st_altscore = stp_sal_score(stp, su, slang, badsound);
5025 if (stp->st_altscore == SCORE_MAXMAX)
5026 stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4;
5027 else
5028 stp->st_score = (stp->st_score * 3
5029 + stp->st_altscore) / 4;
5030 stp->st_salscore = false;
5031 }
5032 break;
5033 }
5034 }
5035
5036 if (slang == NULL) { // Using "double" without sound folding.
5037 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore,
5038 su->su_maxcount);
5039 return;
5040 }
5041
5042 // Add the alternate score to su_sga.
5043 for (int i = 0; i < su->su_sga.ga_len; ++i) {
5044 stp = &SUG(su->su_sga, i);
5045 stp->st_altscore = spell_edit_score(slang,
5046 su->su_badword, stp->st_word);
5047 if (stp->st_score == SCORE_MAXMAX)
5048 stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8;
5049 else
5050 stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8;
5051 stp->st_salscore = true;
5052 }
5053
5054 // Remove bad suggestions, sort the suggestions and truncate at "maxcount"
5055 // for both lists.
5056 check_suggestions(su, &su->su_ga);
5057 (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount);
5058 check_suggestions(su, &su->su_sga);
5059 (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount);
5060
5061 ga_init(&ga, (int)sizeof(suginfo_T), 1);
5062 ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len);
5063
5064 stp = &SUG(ga, 0);
5065 for (int i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) {
5066 // round 1: get a suggestion from su_ga
5067 // round 2: get a suggestion from su_sga
5068 for (round = 1; round <= 2; ++round) {
5069 gap = round == 1 ? &su->su_ga : &su->su_sga;
5070 if (i < gap->ga_len) {
5071 // Don't add a word if it's already there.
5072 p = SUG(*gap, i).st_word;
5073 int j;
5074 for (j = 0; j < ga.ga_len; ++j)
5075 if (STRCMP(stp[j].st_word, p) == 0)
5076 break;
5077 if (j == ga.ga_len)
5078 stp[ga.ga_len++] = SUG(*gap, i);
5079 else
5080 xfree(p);
5081 }
5082 }
5083 }
5084
5085 ga_clear(&su->su_ga);
5086 ga_clear(&su->su_sga);
5087
5088 // Truncate the list to the number of suggestions that will be displayed.
5089 if (ga.ga_len > su->su_maxcount) {
5090 for (int i = su->su_maxcount; i < ga.ga_len; ++i) {
5091 xfree(stp[i].st_word);
5092 }
5093 ga.ga_len = su->su_maxcount;
5094 }
5095
5096 su->su_ga = ga;
5097}
5098
5099// For the goodword in "stp" compute the soundalike score compared to the
5100// badword.
5101static int
5102stp_sal_score (
5103 suggest_T *stp,
5104 suginfo_T *su,
5105 slang_T *slang,
5106 char_u *badsound // sound-folded badword
5107)
5108{
5109 char_u *p;
5110 char_u *pbad;
5111 char_u *pgood;
5112 char_u badsound2[MAXWLEN];
5113 char_u fword[MAXWLEN];
5114 char_u goodsound[MAXWLEN];
5115 char_u goodword[MAXWLEN];
5116 int lendiff;
5117
5118 lendiff = su->su_badlen - stp->st_orglen;
5119 if (lendiff >= 0)
5120 pbad = badsound;
5121 else {
5122 // soundfold the bad word with more characters following
5123 (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN);
5124
5125 // When joining two words the sound often changes a lot. E.g., "t he"
5126 // sounds like "t h" while "the" sounds like "@". Avoid that by
5127 // removing the space. Don't do it when the good word also contains a
5128 // space.
5129 if (ascii_iswhite(su->su_badptr[su->su_badlen])
5130 && *skiptowhite(stp->st_word) == NUL)
5131 for (p = fword; *(p = skiptowhite(p)) != NUL; )
5132 STRMOVE(p, p + 1);
5133
5134 spell_soundfold(slang, fword, true, badsound2);
5135 pbad = badsound2;
5136 }
5137
5138 if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) {
5139 // Add part of the bad word to the good word, so that we soundfold
5140 // what replaces the bad word.
5141 STRCPY(goodword, stp->st_word);
5142 STRLCPY(goodword + stp->st_wordlen,
5143 su->su_badptr + su->su_badlen - lendiff, lendiff + 1);
5144 pgood = goodword;
5145 } else
5146 pgood = stp->st_word;
5147
5148 // Sound-fold the word and compute the score for the difference.
5149 spell_soundfold(slang, pgood, false, goodsound);
5150
5151 return soundalike_score(goodsound, pbad);
5152}
5153
5154static sftword_T dumsft;
5155#define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft)))
5156#define HI2SFT(hi) HIKEY2SFT((hi)->hi_key)
5157
5158// Prepare for calling suggest_try_soundalike().
5159static void suggest_try_soundalike_prep(void)
5160{
5161 langp_T *lp;
5162 slang_T *slang;
5163
5164 // Do this for all languages that support sound folding and for which a
5165 // .sug file has been loaded.
5166 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
5167 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
5168 slang = lp->lp_slang;
5169 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL)
5170 // prepare the hashtable used by add_sound_suggest()
5171 hash_init(&slang->sl_sounddone);
5172 }
5173}
5174
5175// Find suggestions by comparing the word in a sound-a-like form.
5176// Note: This doesn't support postponed prefixes.
5177static void suggest_try_soundalike(suginfo_T *su)
5178{
5179 char_u salword[MAXWLEN];
5180 langp_T *lp;
5181 slang_T *slang;
5182
5183 // Do this for all languages that support sound folding and for which a
5184 // .sug file has been loaded.
5185 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
5186 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
5187 slang = lp->lp_slang;
5188 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) {
5189 // soundfold the bad word
5190 spell_soundfold(slang, su->su_fbadword, true, salword);
5191
5192 // try all kinds of inserts/deletes/swaps/etc.
5193 // TODO: also soundfold the next words, so that we can try joining
5194 // and splitting
5195#ifdef SUGGEST_PROFILE
5196 prof_init();
5197#endif
5198 suggest_trie_walk(su, lp, salword, true);
5199#ifdef SUGGEST_PROFILE
5200 prof_report("soundalike");
5201#endif
5202 }
5203 }
5204}
5205
5206// Finish up after calling suggest_try_soundalike().
5207static void suggest_try_soundalike_finish(void)
5208{
5209 langp_T *lp;
5210 slang_T *slang;
5211 int todo;
5212 hashitem_T *hi;
5213
5214 // Do this for all languages that support sound folding and for which a
5215 // .sug file has been loaded.
5216 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
5217 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
5218 slang = lp->lp_slang;
5219 if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) {
5220 // Free the info about handled words.
5221 todo = (int)slang->sl_sounddone.ht_used;
5222 for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi)
5223 if (!HASHITEM_EMPTY(hi)) {
5224 xfree(HI2SFT(hi));
5225 --todo;
5226 }
5227
5228 // Clear the hashtable, it may also be used by another region.
5229 hash_clear(&slang->sl_sounddone);
5230 hash_init(&slang->sl_sounddone);
5231 }
5232 }
5233}
5234
5235// A match with a soundfolded word is found. Add the good word(s) that
5236// produce this soundfolded word.
5237static void
5238add_sound_suggest (
5239 suginfo_T *su,
5240 char_u *goodword,
5241 int score, // soundfold score
5242 langp_T *lp
5243)
5244{
5245 slang_T *slang = lp->lp_slang; // language for sound folding
5246 int sfwordnr;
5247 char_u *nrline;
5248 int orgnr;
5249 char_u theword[MAXWLEN];
5250 int i;
5251 int wlen;
5252 char_u *byts;
5253 idx_T *idxs;
5254 int n;
5255 int wordcount;
5256 int wc;
5257 int goodscore;
5258 hash_T hash;
5259 hashitem_T *hi;
5260 sftword_T *sft;
5261 int bc, gc;
5262 int limit;
5263
5264 // It's very well possible that the same soundfold word is found several
5265 // times with different scores. Since the following is quite slow only do
5266 // the words that have a better score than before. Use a hashtable to
5267 // remember the words that have been done.
5268 hash = hash_hash(goodword);
5269 const size_t goodword_len = STRLEN(goodword);
5270 hi = hash_lookup(&slang->sl_sounddone, (const char *)goodword, goodword_len,
5271 hash);
5272 if (HASHITEM_EMPTY(hi)) {
5273 sft = xmalloc(sizeof(sftword_T) + goodword_len);
5274 sft->sft_score = score;
5275 memcpy(sft->sft_word, goodword, goodword_len + 1);
5276 hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash);
5277 } else {
5278 sft = HI2SFT(hi);
5279 if (score >= sft->sft_score)
5280 return;
5281 sft->sft_score = score;
5282 }
5283
5284 // Find the word nr in the soundfold tree.
5285 sfwordnr = soundfold_find(slang, goodword);
5286 if (sfwordnr < 0) {
5287 internal_error("add_sound_suggest()");
5288 return;
5289 }
5290
5291 // Go over the list of good words that produce this soundfold word
5292 nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)sfwordnr + 1, false);
5293 orgnr = 0;
5294 while (*nrline != NUL) {
5295 // The wordnr was stored in a minimal nr of bytes as an offset to the
5296 // previous wordnr.
5297 orgnr += bytes2offset(&nrline);
5298
5299 byts = slang->sl_fbyts;
5300 idxs = slang->sl_fidxs;
5301
5302 // Lookup the word "orgnr" one of the two tries.
5303 n = 0;
5304 wordcount = 0;
5305 for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) {
5306 i = 1;
5307 if (wordcount == orgnr && byts[n + 1] == NUL)
5308 break; // found end of word
5309
5310 if (byts[n + 1] == NUL)
5311 ++wordcount;
5312
5313 // skip over the NUL bytes
5314 for (; byts[n + i] == NUL; ++i)
5315 if (i > byts[n]) { // safety check
5316 STRCPY(theword + wlen, "BAD");
5317 wlen += 3;
5318 goto badword;
5319 }
5320
5321 // One of the siblings must have the word.
5322 for (; i < byts[n]; ++i) {
5323 wc = idxs[idxs[n + i]]; // nr of words under this byte
5324 if (wordcount + wc > orgnr)
5325 break;
5326 wordcount += wc;
5327 }
5328
5329 theword[wlen] = byts[n + i];
5330 n = idxs[n + i];
5331 }
5332badword:
5333 theword[wlen] = NUL;
5334
5335 // Go over the possible flags and regions.
5336 for (; i <= byts[n] && byts[n + i] == NUL; ++i) {
5337 char_u cword[MAXWLEN];
5338 char_u *p;
5339 int flags = (int)idxs[n + i];
5340
5341 // Skip words with the NOSUGGEST flag
5342 if (flags & WF_NOSUGGEST)
5343 continue;
5344
5345 if (flags & WF_KEEPCAP) {
5346 // Must find the word in the keep-case tree.
5347 find_keepcap_word(slang, theword, cword);
5348 p = cword;
5349 } else {
5350 flags |= su->su_badflags;
5351 if ((flags & WF_CAPMASK) != 0) {
5352 // Need to fix case according to "flags".
5353 make_case_word(theword, cword, flags);
5354 p = cword;
5355 } else
5356 p = theword;
5357 }
5358
5359 // Add the suggestion.
5360 if (sps_flags & SPS_DOUBLE) {
5361 // Add the suggestion if the score isn't too bad.
5362 if (score <= su->su_maxscore)
5363 add_suggestion(su, &su->su_sga, p, su->su_badlen,
5364 score, 0, false, slang, false);
5365 } else {
5366 // Add a penalty for words in another region.
5367 if ((flags & WF_REGION)
5368 && (((unsigned)flags >> 16) & lp->lp_region) == 0)
5369 goodscore = SCORE_REGION;
5370 else
5371 goodscore = 0;
5372
5373 // Add a small penalty for changing the first letter from
5374 // lower to upper case. Helps for "tath" -> "Kath", which is
5375 // less common than "tath" -> "path". Don't do it when the
5376 // letter is the same, that has already been counted.
5377 gc = PTR2CHAR(p);
5378 if (SPELL_ISUPPER(gc)) {
5379 bc = PTR2CHAR(su->su_badword);
5380 if (!SPELL_ISUPPER(bc)
5381 && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc))
5382 goodscore += SCORE_ICASE / 2;
5383 }
5384
5385 // Compute the score for the good word. This only does letter
5386 // insert/delete/swap/replace. REP items are not considered,
5387 // which may make the score a bit higher.
5388 // Use a limit for the score to make it work faster. Use
5389 // MAXSCORE(), because RESCORE() will change the score.
5390 // If the limit is very high then the iterative method is
5391 // inefficient, using an array is quicker.
5392 limit = MAXSCORE(su->su_sfmaxscore - goodscore, score);
5393 if (limit > SCORE_LIMITMAX)
5394 goodscore += spell_edit_score(slang, su->su_badword, p);
5395 else
5396 goodscore += spell_edit_score_limit(slang, su->su_badword,
5397 p, limit);
5398
5399 // When going over the limit don't bother to do the rest.
5400 if (goodscore < SCORE_MAXMAX) {
5401 // Give a bonus to words seen before.
5402 goodscore = score_wordcount_adj(slang, goodscore, p, false);
5403
5404 // Add the suggestion if the score isn't too bad.
5405 goodscore = RESCORE(goodscore, score);
5406 if (goodscore <= su->su_sfmaxscore)
5407 add_suggestion(su, &su->su_ga, p, su->su_badlen,
5408 goodscore, score, true, slang, true);
5409 }
5410 }
5411 }
5412 }
5413}
5414
5415// Find word "word" in fold-case tree for "slang" and return the word number.
5416static int soundfold_find(slang_T *slang, char_u *word)
5417{
5418 idx_T arridx = 0;
5419 int len;
5420 int wlen = 0;
5421 int c;
5422 char_u *ptr = word;
5423 char_u *byts;
5424 idx_T *idxs;
5425 int wordnr = 0;
5426
5427 byts = slang->sl_sbyts;
5428 idxs = slang->sl_sidxs;
5429
5430 for (;; ) {
5431 // First byte is the number of possible bytes.
5432 len = byts[arridx++];
5433
5434 // If the first possible byte is a zero the word could end here.
5435 // If the word ends we found the word. If not skip the NUL bytes.
5436 c = ptr[wlen];
5437 if (byts[arridx] == NUL) {
5438 if (c == NUL)
5439 break;
5440
5441 // Skip over the zeros, there can be several.
5442 while (len > 0 && byts[arridx] == NUL) {
5443 ++arridx;
5444 --len;
5445 }
5446 if (len == 0)
5447 return -1; // no children, word should have ended here
5448 ++wordnr;
5449 }
5450
5451 // If the word ends we didn't find it.
5452 if (c == NUL)
5453 return -1;
5454
5455 // Perform a binary search in the list of accepted bytes.
5456 if (c == TAB) // <Tab> is handled like <Space>
5457 c = ' ';
5458 while (byts[arridx] < c) {
5459 // The word count is in the first idxs[] entry of the child.
5460 wordnr += idxs[idxs[arridx]];
5461 ++arridx;
5462 if (--len == 0) // end of the bytes, didn't find it
5463 return -1;
5464 }
5465 if (byts[arridx] != c) // didn't find the byte
5466 return -1;
5467
5468 // Continue at the child (if there is one).
5469 arridx = idxs[arridx];
5470 ++wlen;
5471
5472 // One space in the good word may stand for several spaces in the
5473 // checked word.
5474 if (c == ' ')
5475 while (ptr[wlen] == ' ' || ptr[wlen] == TAB)
5476 ++wlen;
5477 }
5478
5479 return wordnr;
5480}
5481
5482// Copy "fword" to "cword", fixing case according to "flags".
5483static void make_case_word(char_u *fword, char_u *cword, int flags)
5484{
5485 if (flags & WF_ALLCAP)
5486 // Make it all upper-case
5487 allcap_copy(fword, cword);
5488 else if (flags & WF_ONECAP)
5489 // Make the first letter upper-case
5490 onecap_copy(fword, cword, true);
5491 else
5492 // Use goodword as-is.
5493 STRCPY(cword, fword);
5494}
5495
5496// Returns true if "c1" and "c2" are similar characters according to the MAP
5497// lines in the .aff file.
5498static bool similar_chars(slang_T *slang, int c1, int c2)
5499{
5500 int m1, m2;
5501 char_u buf[MB_MAXBYTES + 1];
5502 hashitem_T *hi;
5503
5504 if (c1 >= 256) {
5505 buf[utf_char2bytes(c1, buf)] = 0;
5506 hi = hash_find(&slang->sl_map_hash, buf);
5507 if (HASHITEM_EMPTY(hi)) {
5508 m1 = 0;
5509 } else {
5510 m1 = utf_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
5511 }
5512 } else {
5513 m1 = slang->sl_map_array[c1];
5514 }
5515 if (m1 == 0) {
5516 return false;
5517 }
5518
5519 if (c2 >= 256) {
5520 buf[utf_char2bytes(c2, buf)] = 0;
5521 hi = hash_find(&slang->sl_map_hash, buf);
5522 if (HASHITEM_EMPTY(hi)) {
5523 m2 = 0;
5524 } else {
5525 m2 = utf_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1);
5526 }
5527 } else {
5528 m2 = slang->sl_map_array[c2];
5529 }
5530
5531 return m1 == m2;
5532}
5533
5534// Adds a suggestion to the list of suggestions.
5535// For a suggestion that is already in the list the lowest score is remembered.
5536static void
5537add_suggestion (
5538 suginfo_T *su,
5539 garray_T *gap, // either su_ga or su_sga
5540 const char_u *goodword,
5541 int badlenarg, // len of bad word replaced with "goodword"
5542 int score,
5543 int altscore,
5544 bool had_bonus, // value for st_had_bonus
5545 slang_T *slang, // language for sound folding
5546 bool maxsf // su_maxscore applies to soundfold score,
5547 // su_sfmaxscore to the total score.
5548)
5549{
5550 int goodlen; // len of goodword changed
5551 int badlen; // len of bad word changed
5552 suggest_T *stp;
5553 suggest_T new_sug;
5554
5555 // Minimize "badlen" for consistency. Avoids that changing "the the" to
5556 // "thee the" is added next to changing the first "the" the "thee".
5557 const char_u *pgood = goodword + STRLEN(goodword);
5558 char_u *pbad = su->su_badptr + badlenarg;
5559 for (;; ) {
5560 goodlen = (int)(pgood - goodword);
5561 badlen = (int)(pbad - su->su_badptr);
5562 if (goodlen <= 0 || badlen <= 0)
5563 break;
5564 MB_PTR_BACK(goodword, pgood);
5565 MB_PTR_BACK(su->su_badptr, pbad);
5566 if (utf_ptr2char(pgood) != utf_ptr2char(pbad)) {
5567 break;
5568 }
5569 }
5570
5571 if (badlen == 0 && goodlen == 0)
5572 // goodword doesn't change anything; may happen for "the the" changing
5573 // the first "the" to itself.
5574 return;
5575
5576 int i;
5577 if (GA_EMPTY(gap)) {
5578 i = -1;
5579 } else {
5580 // Check if the word is already there. Also check the length that is
5581 // being replaced "thes," -> "these" is a different suggestion from
5582 // "thes" -> "these".
5583 stp = &SUG(*gap, 0);
5584 for (i = gap->ga_len; --i >= 0; ++stp) {
5585 if (stp->st_wordlen == goodlen
5586 && stp->st_orglen == badlen
5587 && STRNCMP(stp->st_word, goodword, goodlen) == 0) {
5588 // Found it. Remember the word with the lowest score.
5589 if (stp->st_slang == NULL)
5590 stp->st_slang = slang;
5591
5592 new_sug.st_score = score;
5593 new_sug.st_altscore = altscore;
5594 new_sug.st_had_bonus = had_bonus;
5595
5596 if (stp->st_had_bonus != had_bonus) {
5597 // Only one of the two had the soundalike score computed.
5598 // Need to do that for the other one now, otherwise the
5599 // scores can't be compared. This happens because
5600 // suggest_try_change() doesn't compute the soundalike
5601 // word to keep it fast, while some special methods set
5602 // the soundalike score to zero.
5603 if (had_bonus)
5604 rescore_one(su, stp);
5605 else {
5606 new_sug.st_word = stp->st_word;
5607 new_sug.st_wordlen = stp->st_wordlen;
5608 new_sug.st_slang = stp->st_slang;
5609 new_sug.st_orglen = badlen;
5610 rescore_one(su, &new_sug);
5611 }
5612 }
5613
5614 if (stp->st_score > new_sug.st_score) {
5615 stp->st_score = new_sug.st_score;
5616 stp->st_altscore = new_sug.st_altscore;
5617 stp->st_had_bonus = new_sug.st_had_bonus;
5618 }
5619 break;
5620 }
5621 }
5622 }
5623
5624 if (i < 0) {
5625 // Add a suggestion.
5626 stp = GA_APPEND_VIA_PTR(suggest_T, gap);
5627 stp->st_word = vim_strnsave(goodword, goodlen);
5628 stp->st_wordlen = goodlen;
5629 stp->st_score = score;
5630 stp->st_altscore = altscore;
5631 stp->st_had_bonus = had_bonus;
5632 stp->st_orglen = badlen;
5633 stp->st_slang = slang;
5634
5635 // If we have too many suggestions now, sort the list and keep
5636 // the best suggestions.
5637 if (gap->ga_len > SUG_MAX_COUNT(su)) {
5638 if (maxsf)
5639 su->su_sfmaxscore = cleanup_suggestions(gap,
5640 su->su_sfmaxscore, SUG_CLEAN_COUNT(su));
5641 else
5642 su->su_maxscore = cleanup_suggestions(gap,
5643 su->su_maxscore, SUG_CLEAN_COUNT(su));
5644 }
5645 }
5646}
5647
5648// Suggestions may in fact be flagged as errors. Esp. for banned words and
5649// for split words, such as "the the". Remove these from the list here.
5650static void
5651check_suggestions (
5652 suginfo_T *su,
5653 garray_T *gap // either su_ga or su_sga
5654)
5655{
5656 suggest_T *stp;
5657 char_u longword[MAXWLEN + 1];
5658 int len;
5659 hlf_T attr;
5660
5661 stp = &SUG(*gap, 0);
5662 for (int i = gap->ga_len - 1; i >= 0; --i) {
5663 // Need to append what follows to check for "the the".
5664 STRLCPY(longword, stp[i].st_word, MAXWLEN + 1);
5665 len = stp[i].st_wordlen;
5666 STRLCPY(longword + len, su->su_badptr + stp[i].st_orglen,
5667 MAXWLEN - len + 1);
5668 attr = HLF_COUNT;
5669 (void)spell_check(curwin, longword, &attr, NULL, false);
5670 if (attr != HLF_COUNT) {
5671 // Remove this entry.
5672 xfree(stp[i].st_word);
5673 --gap->ga_len;
5674 if (i < gap->ga_len)
5675 memmove(stp + i, stp + i + 1,
5676 sizeof(suggest_T) * (gap->ga_len - i));
5677 }
5678 }
5679}
5680
5681
5682// Add a word to be banned.
5683static void add_banned(suginfo_T *su, char_u *word)
5684{
5685 char_u *s;
5686 hash_T hash;
5687 hashitem_T *hi;
5688
5689 hash = hash_hash(word);
5690 const size_t word_len = STRLEN(word);
5691 hi = hash_lookup(&su->su_banned, (const char *)word, word_len, hash);
5692 if (HASHITEM_EMPTY(hi)) {
5693 s = xmemdupz(word, word_len);
5694 hash_add_item(&su->su_banned, hi, s, hash);
5695 }
5696}
5697
5698// Recompute the score for all suggestions if sound-folding is possible. This
5699// is slow, thus only done for the final results.
5700static void rescore_suggestions(suginfo_T *su)
5701{
5702 if (su->su_sallang != NULL) {
5703 for (int i = 0; i < su->su_ga.ga_len; ++i) {
5704 rescore_one(su, &SUG(su->su_ga, i));
5705 }
5706 }
5707}
5708
5709// Recompute the score for one suggestion if sound-folding is possible.
5710static void rescore_one(suginfo_T *su, suggest_T *stp)
5711{
5712 slang_T *slang = stp->st_slang;
5713 char_u sal_badword[MAXWLEN];
5714 char_u *p;
5715
5716 // Only rescore suggestions that have no sal score yet and do have a
5717 // language.
5718 if (slang != NULL && !GA_EMPTY(&slang->sl_sal) && !stp->st_had_bonus) {
5719 if (slang == su->su_sallang)
5720 p = su->su_sal_badword;
5721 else {
5722 spell_soundfold(slang, su->su_fbadword, true, sal_badword);
5723 p = sal_badword;
5724 }
5725
5726 stp->st_altscore = stp_sal_score(stp, su, slang, p);
5727 if (stp->st_altscore == SCORE_MAXMAX)
5728 stp->st_altscore = SCORE_BIG;
5729 stp->st_score = RESCORE(stp->st_score, stp->st_altscore);
5730 stp->st_had_bonus = true;
5731 }
5732}
5733
5734
5735// Function given to qsort() to sort the suggestions on st_score.
5736// First on "st_score", then "st_altscore" then alphabetically.
5737static int sug_compare(const void *s1, const void *s2)
5738{
5739 suggest_T *p1 = (suggest_T *)s1;
5740 suggest_T *p2 = (suggest_T *)s2;
5741 int n = p1->st_score - p2->st_score;
5742
5743 if (n == 0) {
5744 n = p1->st_altscore - p2->st_altscore;
5745 if (n == 0)
5746 n = STRICMP(p1->st_word, p2->st_word);
5747 }
5748 return n;
5749}
5750
5751// Cleanup the suggestions:
5752// - Sort on score.
5753// - Remove words that won't be displayed.
5754// Returns the maximum score in the list or "maxscore" unmodified.
5755static int
5756cleanup_suggestions (
5757 garray_T *gap,
5758 int maxscore,
5759 int keep // nr of suggestions to keep
5760)
5761{
5762 suggest_T *stp = &SUG(*gap, 0);
5763
5764 // Sort the list.
5765 qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare);
5766
5767 // Truncate the list to the number of suggestions that will be displayed.
5768 if (gap->ga_len > keep) {
5769 for (int i = keep; i < gap->ga_len; ++i) {
5770 xfree(stp[i].st_word);
5771 }
5772 gap->ga_len = keep;
5773 return stp[keep - 1].st_score;
5774 }
5775 return maxscore;
5776}
5777
5778/// Soundfold a string, for soundfold()
5779///
5780/// @param[in] word Word to soundfold.
5781///
5782/// @return [allocated] soundfolded string or NULL in case of error. May return
5783/// copy of the input string if soundfolding is not
5784/// supported by any of the languages in &spellang.
5785char *eval_soundfold(const char *const word)
5786 FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_MALLOC FUNC_ATTR_NONNULL_ALL
5787{
5788 if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) {
5789 // Use the sound-folding of the first language that supports it.
5790 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) {
5791 langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
5792 if (!GA_EMPTY(&lp->lp_slang->sl_sal)) {
5793 // soundfold the word
5794 char_u sound[MAXWLEN];
5795 spell_soundfold(lp->lp_slang, (char_u *)word, false, sound);
5796 return xstrdup((const char *)sound);
5797 }
5798 }
5799 }
5800
5801 // No language with sound folding, return word as-is.
5802 return xstrdup(word);
5803}
5804
5805/// Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
5806///
5807/// There are many ways to turn a word into a sound-a-like representation. The
5808/// oldest is Soundex (1918!). A nice overview can be found in "Approximate
5809/// swedish name matching - survey and test of different algorithms" by Klas
5810/// Erikson.
5811///
5812/// We support two methods:
5813/// 1. SOFOFROM/SOFOTO do a simple character mapping.
5814/// 2. SAL items define a more advanced sound-folding (and much slower).
5815///
5816/// @param[in] slang
5817/// @param[in] inword word to soundfold
5818/// @param[in] folded whether inword is already case-folded
5819/// @param[in,out] res destination for soundfolded word
5820void spell_soundfold(slang_T *slang, char_u *inword, bool folded, char_u *res)
5821{
5822 char_u fword[MAXWLEN];
5823 char_u *word;
5824
5825 if (slang->sl_sofo)
5826 // SOFOFROM and SOFOTO used
5827 spell_soundfold_sofo(slang, inword, res);
5828 else {
5829 // SAL items used. Requires the word to be case-folded.
5830 if (folded)
5831 word = inword;
5832 else {
5833 (void)spell_casefold(inword, (int)STRLEN(inword), fword, MAXWLEN);
5834 word = fword;
5835 }
5836
5837 if (has_mbyte)
5838 spell_soundfold_wsal(slang, word, res);
5839 else
5840 spell_soundfold_sal(slang, word, res);
5841 }
5842}
5843
5844// Perform sound folding of "inword" into "res" according to SOFOFROM and
5845// SOFOTO lines.
5846static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res)
5847{
5848 char_u *s;
5849 int ri = 0;
5850 int c;
5851
5852 if (has_mbyte) {
5853 int prevc = 0;
5854 int *ip;
5855
5856 // The sl_sal_first[] table contains the translation for chars up to
5857 // 255, sl_sal the rest.
5858 for (s = inword; *s != NUL; ) {
5859 c = mb_cptr2char_adv((const char_u **)&s);
5860 if (enc_utf8 ? utf_class(c) == 0 : ascii_iswhite(c)) {
5861 c = ' ';
5862 } else if (c < 256) {
5863 c = slang->sl_sal_first[c];
5864 } else {
5865 ip = ((int **)slang->sl_sal.ga_data)[c & 0xff];
5866 if (ip == NULL) // empty list, can't match
5867 c = NUL;
5868 else
5869 for (;; ) { // find "c" in the list
5870 if (*ip == 0) { // not found
5871 c = NUL;
5872 break;
5873 }
5874 if (*ip == c) { // match!
5875 c = ip[1];
5876 break;
5877 }
5878 ip += 2;
5879 }
5880 }
5881
5882 if (c != NUL && c != prevc) {
5883 ri += utf_char2bytes(c, res + ri);
5884 if (ri + MB_MAXBYTES > MAXWLEN) {
5885 break;
5886 }
5887 prevc = c;
5888 }
5889 }
5890 } else {
5891 // The sl_sal_first[] table contains the translation.
5892 for (s = inword; (c = *s) != NUL; ++s) {
5893 if (ascii_iswhite(c))
5894 c = ' ';
5895 else
5896 c = slang->sl_sal_first[c];
5897 if (c != NUL && (ri == 0 || res[ri - 1] != c))
5898 res[ri++] = c;
5899 }
5900 }
5901
5902 res[ri] = NUL;
5903}
5904
5905static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res)
5906{
5907 salitem_T *smp;
5908 char_u word[MAXWLEN];
5909 char_u *s = inword;
5910 char_u *t;
5911 char_u *pf;
5912 int i, j, z;
5913 int reslen;
5914 int n, k = 0;
5915 int z0;
5916 int k0;
5917 int n0;
5918 int c;
5919 int pri;
5920 int p0 = -333;
5921 int c0;
5922
5923 // Remove accents, if wanted. We actually remove all non-word characters.
5924 // But keep white space. We need a copy, the word may be changed here.
5925 if (slang->sl_rem_accents) {
5926 t = word;
5927 while (*s != NUL) {
5928 if (ascii_iswhite(*s)) {
5929 *t++ = ' ';
5930 s = skipwhite(s);
5931 } else {
5932 if (spell_iswordp_nmw(s, curwin))
5933 *t++ = *s;
5934 ++s;
5935 }
5936 }
5937 *t = NUL;
5938 } else
5939 STRLCPY(word, s, MAXWLEN);
5940
5941 smp = (salitem_T *)slang->sl_sal.ga_data;
5942
5943 // This comes from Aspell phonet.cpp. Converted from C++ to C.
5944 // Changed to keep spaces.
5945 i = reslen = z = 0;
5946 while ((c = word[i]) != NUL) {
5947 // Start with the first rule that has the character in the word.
5948 n = slang->sl_sal_first[c];
5949 z0 = 0;
5950
5951 if (n >= 0) {
5952 // check all rules for the same letter
5953 for (; (s = smp[n].sm_lead)[0] == c; ++n) {
5954 // Quickly skip entries that don't match the word. Most
5955 // entries are less then three chars, optimize for that.
5956 k = smp[n].sm_leadlen;
5957 if (k > 1) {
5958 if (word[i + 1] != s[1])
5959 continue;
5960 if (k > 2) {
5961 for (j = 2; j < k; ++j)
5962 if (word[i + j] != s[j])
5963 break;
5964 if (j < k)
5965 continue;
5966 }
5967 }
5968
5969 if ((pf = smp[n].sm_oneof) != NULL) {
5970 // Check for match with one of the chars in "sm_oneof".
5971 while (*pf != NUL && *pf != word[i + k])
5972 ++pf;
5973 if (*pf == NUL)
5974 continue;
5975 ++k;
5976 }
5977 s = smp[n].sm_rules;
5978 pri = 5; // default priority
5979
5980 p0 = *s;
5981 k0 = k;
5982 while (*s == '-' && k > 1) {
5983 k--;
5984 s++;
5985 }
5986 if (*s == '<')
5987 s++;
5988 if (ascii_isdigit(*s)) {
5989 // determine priority
5990 pri = *s - '0';
5991 s++;
5992 }
5993 if (*s == '^' && *(s + 1) == '^')
5994 s++;
5995
5996 if (*s == NUL
5997 || (*s == '^'
5998 && (i == 0 || !(word[i - 1] == ' '
5999 || spell_iswordp(word + i - 1, curwin)))
6000 && (*(s + 1) != '$'
6001 || (!spell_iswordp(word + i + k0, curwin))))
6002 || (*s == '$' && i > 0
6003 && spell_iswordp(word + i - 1, curwin)
6004 && (!spell_iswordp(word + i + k0, curwin)))) {
6005 // search for followup rules, if:
6006 // followup and k > 1 and NO '-' in searchstring
6007 c0 = word[i + k - 1];
6008 n0 = slang->sl_sal_first[c0];
6009
6010 if (slang->sl_followup && k > 1 && n0 >= 0
6011 && p0 != '-' && word[i + k] != NUL) {
6012 // test follow-up rule for "word[i + k]"
6013 for (; (s = smp[n0].sm_lead)[0] == c0; ++n0) {
6014 // Quickly skip entries that don't match the word.
6015 k0 = smp[n0].sm_leadlen;
6016 if (k0 > 1) {
6017 if (word[i + k] != s[1])
6018 continue;
6019 if (k0 > 2) {
6020 pf = word + i + k + 1;
6021 for (j = 2; j < k0; ++j)
6022 if (*pf++ != s[j])
6023 break;
6024 if (j < k0)
6025 continue;
6026 }
6027 }
6028 k0 += k - 1;
6029
6030 if ((pf = smp[n0].sm_oneof) != NULL) {
6031 // Check for match with one of the chars in
6032 // "sm_oneof".
6033 while (*pf != NUL && *pf != word[i + k0])
6034 ++pf;
6035 if (*pf == NUL)
6036 continue;
6037 ++k0;
6038 }
6039
6040 p0 = 5;
6041 s = smp[n0].sm_rules;
6042 while (*s == '-') {
6043 // "k0" gets NOT reduced because
6044 // "if (k0 == k)"
6045 s++;
6046 }
6047 if (*s == '<')
6048 s++;
6049 if (ascii_isdigit(*s)) {
6050 p0 = *s - '0';
6051 s++;
6052 }
6053
6054 if (*s == NUL
6055 // *s == '^' cuts
6056 || (*s == '$'
6057 && !spell_iswordp(word + i + k0,
6058 curwin))) {
6059 if (k0 == k)
6060 // this is just a piece of the string
6061 continue;
6062
6063 if (p0 < pri)
6064 // priority too low
6065 continue;
6066 // rule fits; stop search
6067 break;
6068 }
6069 }
6070
6071 if (p0 >= pri && smp[n0].sm_lead[0] == c0)
6072 continue;
6073 }
6074
6075 // replace string
6076 s = smp[n].sm_to;
6077 if (s == NULL)
6078 s = (char_u *)"";
6079 pf = smp[n].sm_rules;
6080 p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0;
6081 if (p0 == 1 && z == 0) {
6082 // rule with '<' is used
6083 if (reslen > 0 && *s != NUL && (res[reslen - 1] == c
6084 || res[reslen - 1] == *s))
6085 reslen--;
6086 z0 = 1;
6087 z = 1;
6088 k0 = 0;
6089 while (*s != NUL && word[i + k0] != NUL) {
6090 word[i + k0] = *s;
6091 k0++;
6092 s++;
6093 }
6094 if (k > k0)
6095 STRMOVE(word + i + k0, word + i + k);
6096
6097 // new "actual letter"
6098 c = word[i];
6099 } else {
6100 // no '<' rule used
6101 i += k - 1;
6102 z = 0;
6103 while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) {
6104 if (reslen == 0 || res[reslen - 1] != *s)
6105 res[reslen++] = *s;
6106 s++;
6107 }
6108 // new "actual letter"
6109 c = *s;
6110 if (strstr((char *)pf, "^^") != NULL) {
6111 if (c != NUL)
6112 res[reslen++] = c;
6113 STRMOVE(word, word + i + 1);
6114 i = 0;
6115 z0 = 1;
6116 }
6117 }
6118 break;
6119 }
6120 }
6121 } else if (ascii_iswhite(c)) {
6122 c = ' ';
6123 k = 1;
6124 }
6125
6126 if (z0 == 0) {
6127 if (k && !p0 && reslen < MAXWLEN && c != NUL
6128 && (!slang->sl_collapse || reslen == 0
6129 || res[reslen - 1] != c))
6130 // condense only double letters
6131 res[reslen++] = c;
6132
6133 i++;
6134 z = 0;
6135 k = 0;
6136 }
6137 }
6138
6139 res[reslen] = NUL;
6140}
6141
6142// Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]".
6143// Multi-byte version of spell_soundfold().
6144static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res)
6145{
6146 salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data;
6147 int word[MAXWLEN];
6148 int wres[MAXWLEN];
6149 int l;
6150 int *ws;
6151 int *pf;
6152 int i, j, z;
6153 int reslen;
6154 int n, k = 0;
6155 int z0;
6156 int k0;
6157 int n0;
6158 int c;
6159 int pri;
6160 int p0 = -333;
6161 int c0;
6162 bool did_white = false;
6163 int wordlen;
6164
6165
6166 // Convert the multi-byte string to a wide-character string.
6167 // Remove accents, if wanted. We actually remove all non-word characters.
6168 // But keep white space.
6169 wordlen = 0;
6170 for (const char_u *s = inword; *s != NUL; ) {
6171 const char_u *t = s;
6172 c = mb_cptr2char_adv((const char_u **)&s);
6173 if (slang->sl_rem_accents) {
6174 if (enc_utf8 ? utf_class(c) == 0 : ascii_iswhite(c)) {
6175 if (did_white)
6176 continue;
6177 c = ' ';
6178 did_white = true;
6179 } else {
6180 did_white = false;
6181 if (!spell_iswordp_nmw(t, curwin)) {
6182 continue;
6183 }
6184 }
6185 }
6186 word[wordlen++] = c;
6187 }
6188 word[wordlen] = NUL;
6189
6190 // This algorithm comes from Aspell phonet.cpp.
6191 // Converted from C++ to C. Added support for multi-byte chars.
6192 // Changed to keep spaces.
6193 i = reslen = z = 0;
6194 while ((c = word[i]) != NUL) {
6195 // Start with the first rule that has the character in the word.
6196 n = slang->sl_sal_first[c & 0xff];
6197 z0 = 0;
6198
6199 if (n >= 0) {
6200 // Check all rules for the same index byte.
6201 // If c is 0x300 need extra check for the end of the array, as
6202 // (c & 0xff) is NUL.
6203 for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff)
6204 && ws[0] != NUL; ++n) {
6205 // Quickly skip entries that don't match the word. Most
6206 // entries are less then three chars, optimize for that.
6207 if (c != ws[0])
6208 continue;
6209 k = smp[n].sm_leadlen;
6210 if (k > 1) {
6211 if (word[i + 1] != ws[1])
6212 continue;
6213 if (k > 2) {
6214 for (j = 2; j < k; ++j)
6215 if (word[i + j] != ws[j])
6216 break;
6217 if (j < k)
6218 continue;
6219 }
6220 }
6221
6222 if ((pf = smp[n].sm_oneof_w) != NULL) {
6223 // Check for match with one of the chars in "sm_oneof".
6224 while (*pf != NUL && *pf != word[i + k])
6225 ++pf;
6226 if (*pf == NUL)
6227 continue;
6228 ++k;
6229 }
6230 char_u *s = smp[n].sm_rules;
6231 pri = 5; // default priority
6232
6233 p0 = *s;
6234 k0 = k;
6235 while (*s == '-' && k > 1) {
6236 k--;
6237 s++;
6238 }
6239 if (*s == '<')
6240 s++;
6241 if (ascii_isdigit(*s)) {
6242 // determine priority
6243 pri = *s - '0';
6244 s++;
6245 }
6246 if (*s == '^' && *(s + 1) == '^')
6247 s++;
6248
6249 if (*s == NUL
6250 || (*s == '^'
6251 && (i == 0 || !(word[i - 1] == ' '
6252 || spell_iswordp_w(word + i - 1, curwin)))
6253 && (*(s + 1) != '$'
6254 || (!spell_iswordp_w(word + i + k0, curwin))))
6255 || (*s == '$' && i > 0
6256 && spell_iswordp_w(word + i - 1, curwin)
6257 && (!spell_iswordp_w(word + i + k0, curwin)))) {
6258 // search for followup rules, if:
6259 // followup and k > 1 and NO '-' in searchstring
6260 c0 = word[i + k - 1];
6261 n0 = slang->sl_sal_first[c0 & 0xff];
6262
6263 if (slang->sl_followup && k > 1 && n0 >= 0
6264 && p0 != '-' && word[i + k] != NUL) {
6265 // Test follow-up rule for "word[i + k]"; loop over
6266 // all entries with the same index byte.
6267 for (; ((ws = smp[n0].sm_lead_w)[0] & 0xff)
6268 == (c0 & 0xff); ++n0) {
6269 // Quickly skip entries that don't match the word.
6270 if (c0 != ws[0])
6271 continue;
6272 k0 = smp[n0].sm_leadlen;
6273 if (k0 > 1) {
6274 if (word[i + k] != ws[1])
6275 continue;
6276 if (k0 > 2) {
6277 pf = word + i + k + 1;
6278 for (j = 2; j < k0; ++j)
6279 if (*pf++ != ws[j])
6280 break;
6281 if (j < k0)
6282 continue;
6283 }
6284 }
6285 k0 += k - 1;
6286
6287 if ((pf = smp[n0].sm_oneof_w) != NULL) {
6288 // Check for match with one of the chars in
6289 // "sm_oneof".
6290 while (*pf != NUL && *pf != word[i + k0])
6291 ++pf;
6292 if (*pf == NUL)
6293 continue;
6294 ++k0;
6295 }
6296
6297 p0 = 5;
6298 s = smp[n0].sm_rules;
6299 while (*s == '-') {
6300 // "k0" gets NOT reduced because
6301 // "if (k0 == k)"
6302 s++;
6303 }
6304 if (*s == '<')
6305 s++;
6306 if (ascii_isdigit(*s)) {
6307 p0 = *s - '0';
6308 s++;
6309 }
6310
6311 if (*s == NUL
6312 // *s == '^' cuts
6313 || (*s == '$'
6314 && !spell_iswordp_w(word + i + k0,
6315 curwin))) {
6316 if (k0 == k)
6317 // this is just a piece of the string
6318 continue;
6319
6320 if (p0 < pri)
6321 // priority too low
6322 continue;
6323 // rule fits; stop search
6324 break;
6325 }
6326 }
6327
6328 if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff)
6329 == (c0 & 0xff))
6330 continue;
6331 }
6332
6333 // replace string
6334 ws = smp[n].sm_to_w;
6335 s = smp[n].sm_rules;
6336 p0 = (vim_strchr(s, '<') != NULL) ? 1 : 0;
6337 if (p0 == 1 && z == 0) {
6338 // rule with '<' is used
6339 if (reslen > 0 && ws != NULL && *ws != NUL
6340 && (wres[reslen - 1] == c
6341 || wres[reslen - 1] == *ws))
6342 reslen--;
6343 z0 = 1;
6344 z = 1;
6345 k0 = 0;
6346 if (ws != NULL)
6347 while (*ws != NUL && word[i + k0] != NUL) {
6348 word[i + k0] = *ws;
6349 k0++;
6350 ws++;
6351 }
6352 if (k > k0)
6353 memmove(word + i + k0, word + i + k,
6354 sizeof(int) * (wordlen - (i + k) + 1));
6355
6356 // new "actual letter"
6357 c = word[i];
6358 } else {
6359 // no '<' rule used
6360 i += k - 1;
6361 z = 0;
6362 if (ws != NULL)
6363 while (*ws != NUL && ws[1] != NUL
6364 && reslen < MAXWLEN) {
6365 if (reslen == 0 || wres[reslen - 1] != *ws)
6366 wres[reslen++] = *ws;
6367 ws++;
6368 }
6369 // new "actual letter"
6370 if (ws == NULL)
6371 c = NUL;
6372 else
6373 c = *ws;
6374 if (strstr((char *)s, "^^") != NULL) {
6375 if (c != NUL)
6376 wres[reslen++] = c;
6377 memmove(word, word + i + 1,
6378 sizeof(int) * (wordlen - (i + 1) + 1));
6379 i = 0;
6380 z0 = 1;
6381 }
6382 }
6383 break;
6384 }
6385 }
6386 } else if (ascii_iswhite(c)) {
6387 c = ' ';
6388 k = 1;
6389 }
6390
6391 if (z0 == 0) {
6392 if (k && !p0 && reslen < MAXWLEN && c != NUL
6393 && (!slang->sl_collapse || reslen == 0
6394 || wres[reslen - 1] != c))
6395 // condense only double letters
6396 wres[reslen++] = c;
6397
6398 i++;
6399 z = 0;
6400 k = 0;
6401 }
6402 }
6403
6404 // Convert wide characters in "wres" to a multi-byte string in "res".
6405 l = 0;
6406 for (n = 0; n < reslen; n++) {
6407 l += utf_char2bytes(wres[n], res + l);
6408 if (l + MB_MAXBYTES > MAXWLEN) {
6409 break;
6410 }
6411 }
6412 res[l] = NUL;
6413}
6414
6415// Compute a score for two sound-a-like words.
6416// This permits up to two inserts/deletes/swaps/etc. to keep things fast.
6417// Instead of a generic loop we write out the code. That keeps it fast by
6418// avoiding checks that will not be possible.
6419static int
6420soundalike_score (
6421 char_u *goodstart, // sound-folded good word
6422 char_u *badstart // sound-folded bad word
6423)
6424{
6425 char_u *goodsound = goodstart;
6426 char_u *badsound = badstart;
6427 int goodlen;
6428 int badlen;
6429 int n;
6430 char_u *pl, *ps;
6431 char_u *pl2, *ps2;
6432 int score = 0;
6433
6434 // Adding/inserting "*" at the start (word starts with vowel) shouldn't be
6435 // counted so much, vowels in the middle of the word aren't counted at all.
6436 if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) {
6437 if ((badsound[0] == NUL && goodsound[1] == NUL)
6438 || (goodsound[0] == NUL && badsound[1] == NUL))
6439 // changing word with vowel to word without a sound
6440 return SCORE_DEL;
6441 if (badsound[0] == NUL || goodsound[0] == NUL)
6442 // more than two changes
6443 return SCORE_MAXMAX;
6444
6445 if (badsound[1] == goodsound[1]
6446 || (badsound[1] != NUL
6447 && goodsound[1] != NUL
6448 && badsound[2] == goodsound[2])) {
6449 // handle like a substitute
6450 } else {
6451 score = 2 * SCORE_DEL / 3;
6452 if (*badsound == '*')
6453 ++badsound;
6454 else
6455 ++goodsound;
6456 }
6457 }
6458
6459 goodlen = (int)STRLEN(goodsound);
6460 badlen = (int)STRLEN(badsound);
6461
6462 // Return quickly if the lengths are too different to be fixed by two
6463 // changes.
6464 n = goodlen - badlen;
6465 if (n < -2 || n > 2)
6466 return SCORE_MAXMAX;
6467
6468 if (n > 0) {
6469 pl = goodsound; // goodsound is longest
6470 ps = badsound;
6471 } else {
6472 pl = badsound; // badsound is longest
6473 ps = goodsound;
6474 }
6475
6476 // Skip over the identical part.
6477 while (*pl == *ps && *pl != NUL) {
6478 ++pl;
6479 ++ps;
6480 }
6481
6482 switch (n) {
6483 case -2:
6484 case 2:
6485 // Must delete two characters from "pl".
6486 ++pl; // first delete
6487 while (*pl == *ps) {
6488 ++pl;
6489 ++ps;
6490 }
6491 // strings must be equal after second delete
6492 if (STRCMP(pl + 1, ps) == 0)
6493 return score + SCORE_DEL * 2;
6494
6495 // Failed to compare.
6496 break;
6497
6498 case -1:
6499 case 1:
6500 // Minimal one delete from "pl" required.
6501
6502 // 1: delete
6503 pl2 = pl + 1;
6504 ps2 = ps;
6505 while (*pl2 == *ps2) {
6506 if (*pl2 == NUL) // reached the end
6507 return score + SCORE_DEL;
6508 ++pl2;
6509 ++ps2;
6510 }
6511
6512 // 2: delete then swap, then rest must be equal
6513 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
6514 && STRCMP(pl2 + 2, ps2 + 2) == 0)
6515 return score + SCORE_DEL + SCORE_SWAP;
6516
6517 // 3: delete then substitute, then the rest must be equal
6518 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
6519 return score + SCORE_DEL + SCORE_SUBST;
6520
6521 // 4: first swap then delete
6522 if (pl[0] == ps[1] && pl[1] == ps[0]) {
6523 pl2 = pl + 2; // swap, skip two chars
6524 ps2 = ps + 2;
6525 while (*pl2 == *ps2) {
6526 ++pl2;
6527 ++ps2;
6528 }
6529 // delete a char and then strings must be equal
6530 if (STRCMP(pl2 + 1, ps2) == 0)
6531 return score + SCORE_SWAP + SCORE_DEL;
6532 }
6533
6534 // 5: first substitute then delete
6535 pl2 = pl + 1; // substitute, skip one char
6536 ps2 = ps + 1;
6537 while (*pl2 == *ps2) {
6538 ++pl2;
6539 ++ps2;
6540 }
6541 // delete a char and then strings must be equal
6542 if (STRCMP(pl2 + 1, ps2) == 0)
6543 return score + SCORE_SUBST + SCORE_DEL;
6544
6545 // Failed to compare.
6546 break;
6547
6548 case 0:
6549 // Lengths are equal, thus changes must result in same length: An
6550 // insert is only possible in combination with a delete.
6551 // 1: check if for identical strings
6552 if (*pl == NUL)
6553 return score;
6554
6555 // 2: swap
6556 if (pl[0] == ps[1] && pl[1] == ps[0]) {
6557 pl2 = pl + 2; // swap, skip two chars
6558 ps2 = ps + 2;
6559 while (*pl2 == *ps2) {
6560 if (*pl2 == NUL) // reached the end
6561 return score + SCORE_SWAP;
6562 ++pl2;
6563 ++ps2;
6564 }
6565 // 3: swap and swap again
6566 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
6567 && STRCMP(pl2 + 2, ps2 + 2) == 0)
6568 return score + SCORE_SWAP + SCORE_SWAP;
6569
6570 // 4: swap and substitute
6571 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
6572 return score + SCORE_SWAP + SCORE_SUBST;
6573 }
6574
6575 // 5: substitute
6576 pl2 = pl + 1;
6577 ps2 = ps + 1;
6578 while (*pl2 == *ps2) {
6579 if (*pl2 == NUL) // reached the end
6580 return score + SCORE_SUBST;
6581 ++pl2;
6582 ++ps2;
6583 }
6584
6585 // 6: substitute and swap
6586 if (pl2[0] == ps2[1] && pl2[1] == ps2[0]
6587 && STRCMP(pl2 + 2, ps2 + 2) == 0)
6588 return score + SCORE_SUBST + SCORE_SWAP;
6589
6590 // 7: substitute and substitute
6591 if (STRCMP(pl2 + 1, ps2 + 1) == 0)
6592 return score + SCORE_SUBST + SCORE_SUBST;
6593
6594 // 8: insert then delete
6595 pl2 = pl;
6596 ps2 = ps + 1;
6597 while (*pl2 == *ps2) {
6598 ++pl2;
6599 ++ps2;
6600 }
6601 if (STRCMP(pl2 + 1, ps2) == 0)
6602 return score + SCORE_INS + SCORE_DEL;
6603
6604 // 9: delete then insert
6605 pl2 = pl + 1;
6606 ps2 = ps;
6607 while (*pl2 == *ps2) {
6608 ++pl2;
6609 ++ps2;
6610 }
6611 if (STRCMP(pl2, ps2 + 1) == 0)
6612 return score + SCORE_INS + SCORE_DEL;
6613
6614 // Failed to compare.
6615 break;
6616 }
6617
6618 return SCORE_MAXMAX;
6619}
6620
6621// Compute the "edit distance" to turn "badword" into "goodword". The less
6622// deletes/inserts/substitutes/swaps are required the lower the score.
6623//
6624// The algorithm is described by Du and Chang, 1992.
6625// The implementation of the algorithm comes from Aspell editdist.cpp,
6626// edit_distance(). It has been converted from C++ to C and modified to
6627// support multi-byte characters.
6628static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword)
6629{
6630 int *cnt;
6631 int j, i;
6632 int t;
6633 int bc, gc;
6634 int pbc, pgc;
6635 int wbadword[MAXWLEN];
6636 int wgoodword[MAXWLEN];
6637 const bool l_has_mbyte = has_mbyte;
6638
6639 // Lengths with NUL.
6640 int badlen;
6641 int goodlen;
6642 if (l_has_mbyte) {
6643 // Get the characters from the multi-byte strings and put them in an
6644 // int array for easy access.
6645 badlen = 0;
6646 for (const char_u *p = badword; *p != NUL; ) {
6647 wbadword[badlen++] = mb_cptr2char_adv(&p);
6648 }
6649 wbadword[badlen++] = 0;
6650 goodlen = 0;
6651 for (const char_u *p = goodword; *p != NUL; ) {
6652 wgoodword[goodlen++] = mb_cptr2char_adv(&p);
6653 }
6654 wgoodword[goodlen++] = 0;
6655 } else {
6656 badlen = (int)STRLEN(badword) + 1;
6657 goodlen = (int)STRLEN(goodword) + 1;
6658 }
6659
6660 // We use "cnt" as an array: CNT(badword_idx, goodword_idx).
6661#define CNT(a, b) cnt[(a) + (b) * (badlen + 1)]
6662 cnt = xmalloc(sizeof(int) * (badlen + 1) * (goodlen + 1));
6663
6664 CNT(0, 0) = 0;
6665 for (j = 1; j <= goodlen; ++j)
6666 CNT(0, j) = CNT(0, j - 1) + SCORE_INS;
6667
6668 for (i = 1; i <= badlen; ++i) {
6669 CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL;
6670 for (j = 1; j <= goodlen; ++j) {
6671 if (l_has_mbyte) {
6672 bc = wbadword[i - 1];
6673 gc = wgoodword[j - 1];
6674 } else {
6675 bc = badword[i - 1];
6676 gc = goodword[j - 1];
6677 }
6678 if (bc == gc)
6679 CNT(i, j) = CNT(i - 1, j - 1);
6680 else {
6681 // Use a better score when there is only a case difference.
6682 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
6683 CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1);
6684 else {
6685 // For a similar character use SCORE_SIMILAR.
6686 if (slang != NULL
6687 && slang->sl_has_map
6688 && similar_chars(slang, gc, bc))
6689 CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1);
6690 else
6691 CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1);
6692 }
6693
6694 if (i > 1 && j > 1) {
6695 if (l_has_mbyte) {
6696 pbc = wbadword[i - 2];
6697 pgc = wgoodword[j - 2];
6698 } else {
6699 pbc = badword[i - 2];
6700 pgc = goodword[j - 2];
6701 }
6702 if (bc == pgc && pbc == gc) {
6703 t = SCORE_SWAP + CNT(i - 2, j - 2);
6704 if (t < CNT(i, j))
6705 CNT(i, j) = t;
6706 }
6707 }
6708 t = SCORE_DEL + CNT(i - 1, j);
6709 if (t < CNT(i, j))
6710 CNT(i, j) = t;
6711 t = SCORE_INS + CNT(i, j - 1);
6712 if (t < CNT(i, j))
6713 CNT(i, j) = t;
6714 }
6715 }
6716 }
6717
6718 i = CNT(badlen - 1, goodlen - 1);
6719 xfree(cnt);
6720 return i;
6721}
6722
6723// Like spell_edit_score(), but with a limit on the score to make it faster.
6724// May return SCORE_MAXMAX when the score is higher than "limit".
6725//
6726// This uses a stack for the edits still to be tried.
6727// The idea comes from Aspell leditdist.cpp. Rewritten in C and added support
6728// for multi-byte characters.
6729static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit)
6730{
6731 limitscore_T stack[10]; // allow for over 3 * 2 edits
6732 int stackidx;
6733 int bi, gi;
6734 int bi2, gi2;
6735 int bc, gc;
6736 int score;
6737 int score_off;
6738 int minscore;
6739 int round;
6740
6741 // Multi-byte characters require a bit more work, use a different function
6742 // to avoid testing "has_mbyte" quite often.
6743 if (has_mbyte)
6744 return spell_edit_score_limit_w(slang, badword, goodword, limit);
6745
6746 // The idea is to go from start to end over the words. So long as
6747 // characters are equal just continue, this always gives the lowest score.
6748 // When there is a difference try several alternatives. Each alternative
6749 // increases "score" for the edit distance. Some of the alternatives are
6750 // pushed unto a stack and tried later, some are tried right away. At the
6751 // end of the word the score for one alternative is known. The lowest
6752 // possible score is stored in "minscore".
6753 stackidx = 0;
6754 bi = 0;
6755 gi = 0;
6756 score = 0;
6757 minscore = limit + 1;
6758
6759 for (;; ) {
6760 // Skip over an equal part, score remains the same.
6761 for (;; ) {
6762 bc = badword[bi];
6763 gc = goodword[gi];
6764 if (bc != gc) // stop at a char that's different
6765 break;
6766 if (bc == NUL) { // both words end
6767 if (score < minscore)
6768 minscore = score;
6769 goto pop; // do next alternative
6770 }
6771 ++bi;
6772 ++gi;
6773 }
6774
6775 if (gc == NUL) { // goodword ends, delete badword chars
6776 do {
6777 if ((score += SCORE_DEL) >= minscore)
6778 goto pop; // do next alternative
6779 } while (badword[++bi] != NUL);
6780 minscore = score;
6781 } else if (bc == NUL) { // badword ends, insert badword chars
6782 do {
6783 if ((score += SCORE_INS) >= minscore)
6784 goto pop; // do next alternative
6785 } while (goodword[++gi] != NUL);
6786 minscore = score;
6787 } else { // both words continue
6788 // If not close to the limit, perform a change. Only try changes
6789 // that may lead to a lower score than "minscore".
6790 // round 0: try deleting a char from badword
6791 // round 1: try inserting a char in badword
6792 for (round = 0; round <= 1; ++round) {
6793 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
6794 if (score_off < minscore) {
6795 if (score_off + SCORE_EDIT_MIN >= minscore) {
6796 // Near the limit, rest of the words must match. We
6797 // can check that right now, no need to push an item
6798 // onto the stack.
6799 bi2 = bi + 1 - round;
6800 gi2 = gi + round;
6801 while (goodword[gi2] == badword[bi2]) {
6802 if (goodword[gi2] == NUL) {
6803 minscore = score_off;
6804 break;
6805 }
6806 ++bi2;
6807 ++gi2;
6808 }
6809 } else {
6810 // try deleting/inserting a character later
6811 stack[stackidx].badi = bi + 1 - round;
6812 stack[stackidx].goodi = gi + round;
6813 stack[stackidx].score = score_off;
6814 ++stackidx;
6815 }
6816 }
6817 }
6818
6819 if (score + SCORE_SWAP < minscore) {
6820 // If swapping two characters makes a match then the
6821 // substitution is more expensive, thus there is no need to
6822 // try both.
6823 if (gc == badword[bi + 1] && bc == goodword[gi + 1]) {
6824 // Swap two characters, that is: skip them.
6825 gi += 2;
6826 bi += 2;
6827 score += SCORE_SWAP;
6828 continue;
6829 }
6830 }
6831
6832 // Substitute one character for another which is the same
6833 // thing as deleting a character from both goodword and badword.
6834 // Use a better score when there is only a case difference.
6835 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
6836 score += SCORE_ICASE;
6837 else {
6838 // For a similar character use SCORE_SIMILAR.
6839 if (slang != NULL
6840 && slang->sl_has_map
6841 && similar_chars(slang, gc, bc))
6842 score += SCORE_SIMILAR;
6843 else
6844 score += SCORE_SUBST;
6845 }
6846
6847 if (score < minscore) {
6848 // Do the substitution.
6849 ++gi;
6850 ++bi;
6851 continue;
6852 }
6853 }
6854pop:
6855 // Get here to try the next alternative, pop it from the stack.
6856 if (stackidx == 0) // stack is empty, finished
6857 break;
6858
6859 // pop an item from the stack
6860 --stackidx;
6861 gi = stack[stackidx].goodi;
6862 bi = stack[stackidx].badi;
6863 score = stack[stackidx].score;
6864 }
6865
6866 // When the score goes over "limit" it may actually be much higher.
6867 // Return a very large number to avoid going below the limit when giving a
6868 // bonus.
6869 if (minscore > limit)
6870 return SCORE_MAXMAX;
6871 return minscore;
6872}
6873
6874// Multi-byte version of spell_edit_score_limit().
6875// Keep it in sync with the above!
6876static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit)
6877{
6878 limitscore_T stack[10]; // allow for over 3 * 2 edits
6879 int stackidx;
6880 int bi, gi;
6881 int bi2, gi2;
6882 int bc, gc;
6883 int score;
6884 int score_off;
6885 int minscore;
6886 int round;
6887 int wbadword[MAXWLEN];
6888 int wgoodword[MAXWLEN];
6889
6890 // Get the characters from the multi-byte strings and put them in an
6891 // int array for easy access.
6892 bi = 0;
6893 for (const char_u *p = badword; *p != NUL; ) {
6894 wbadword[bi++] = mb_cptr2char_adv(&p);
6895 }
6896 wbadword[bi++] = 0;
6897 gi = 0;
6898 for (const char_u *p = goodword; *p != NUL; ) {
6899 wgoodword[gi++] = mb_cptr2char_adv(&p);
6900 }
6901 wgoodword[gi++] = 0;
6902
6903 // The idea is to go from start to end over the words. So long as
6904 // characters are equal just continue, this always gives the lowest score.
6905 // When there is a difference try several alternatives. Each alternative
6906 // increases "score" for the edit distance. Some of the alternatives are
6907 // pushed unto a stack and tried later, some are tried right away. At the
6908 // end of the word the score for one alternative is known. The lowest
6909 // possible score is stored in "minscore".
6910 stackidx = 0;
6911 bi = 0;
6912 gi = 0;
6913 score = 0;
6914 minscore = limit + 1;
6915
6916 for (;; ) {
6917 // Skip over an equal part, score remains the same.
6918 for (;; ) {
6919 bc = wbadword[bi];
6920 gc = wgoodword[gi];
6921
6922 if (bc != gc) // stop at a char that's different
6923 break;
6924 if (bc == NUL) { // both words end
6925 if (score < minscore)
6926 minscore = score;
6927 goto pop; // do next alternative
6928 }
6929 ++bi;
6930 ++gi;
6931 }
6932
6933 if (gc == NUL) { // goodword ends, delete badword chars
6934 do {
6935 if ((score += SCORE_DEL) >= minscore)
6936 goto pop; // do next alternative
6937 } while (wbadword[++bi] != NUL);
6938 minscore = score;
6939 } else if (bc == NUL) { // badword ends, insert badword chars
6940 do {
6941 if ((score += SCORE_INS) >= minscore)
6942 goto pop; // do next alternative
6943 } while (wgoodword[++gi] != NUL);
6944 minscore = score;
6945 } else { // both words continue
6946 // If not close to the limit, perform a change. Only try changes
6947 // that may lead to a lower score than "minscore".
6948 // round 0: try deleting a char from badword
6949 // round 1: try inserting a char in badword
6950 for (round = 0; round <= 1; ++round) {
6951 score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
6952 if (score_off < minscore) {
6953 if (score_off + SCORE_EDIT_MIN >= minscore) {
6954 // Near the limit, rest of the words must match. We
6955 // can check that right now, no need to push an item
6956 // onto the stack.
6957 bi2 = bi + 1 - round;
6958 gi2 = gi + round;
6959 while (wgoodword[gi2] == wbadword[bi2]) {
6960 if (wgoodword[gi2] == NUL) {
6961 minscore = score_off;
6962 break;
6963 }
6964 ++bi2;
6965 ++gi2;
6966 }
6967 } else {
6968 // try deleting a character from badword later
6969 stack[stackidx].badi = bi + 1 - round;
6970 stack[stackidx].goodi = gi + round;
6971 stack[stackidx].score = score_off;
6972 ++stackidx;
6973 }
6974 }
6975 }
6976
6977 if (score + SCORE_SWAP < minscore) {
6978 // If swapping two characters makes a match then the
6979 // substitution is more expensive, thus there is no need to
6980 // try both.
6981 if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) {
6982 // Swap two characters, that is: skip them.
6983 gi += 2;
6984 bi += 2;
6985 score += SCORE_SWAP;
6986 continue;
6987 }
6988 }
6989
6990 // Substitute one character for another which is the same
6991 // thing as deleting a character from both goodword and badword.
6992 // Use a better score when there is only a case difference.
6993 if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
6994 score += SCORE_ICASE;
6995 else {
6996 // For a similar character use SCORE_SIMILAR.
6997 if (slang != NULL
6998 && slang->sl_has_map
6999 && similar_chars(slang, gc, bc))
7000 score += SCORE_SIMILAR;
7001 else
7002 score += SCORE_SUBST;
7003 }
7004
7005 if (score < minscore) {
7006 // Do the substitution.
7007 ++gi;
7008 ++bi;
7009 continue;
7010 }
7011 }
7012pop:
7013 // Get here to try the next alternative, pop it from the stack.
7014 if (stackidx == 0) // stack is empty, finished
7015 break;
7016
7017 // pop an item from the stack
7018 --stackidx;
7019 gi = stack[stackidx].goodi;
7020 bi = stack[stackidx].badi;
7021 score = stack[stackidx].score;
7022 }
7023
7024 // When the score goes over "limit" it may actually be much higher.
7025 // Return a very large number to avoid going below the limit when giving a
7026 // bonus.
7027 if (minscore > limit)
7028 return SCORE_MAXMAX;
7029 return minscore;
7030}
7031
7032// ":spellinfo"
7033void ex_spellinfo(exarg_T *eap)
7034{
7035 if (no_spell_checking(curwin)) {
7036 return;
7037 }
7038
7039 msg_start();
7040 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; lpi++) {
7041 langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
7042 msg_puts("file: ");
7043 msg_puts((const char *)lp->lp_slang->sl_fname);
7044 msg_putchar('\n');
7045 const char *const p = (const char *)lp->lp_slang->sl_info;
7046 if (p != NULL) {
7047 msg_puts(p);
7048 msg_putchar('\n');
7049 }
7050 }
7051 msg_end();
7052}
7053
7054#define DUMPFLAG_KEEPCASE 1 // round 2: keep-case tree
7055#define DUMPFLAG_COUNT 2 // include word count
7056#define DUMPFLAG_ICASE 4 // ignore case when finding matches
7057#define DUMPFLAG_ONECAP 8 // pattern starts with capital
7058#define DUMPFLAG_ALLCAP 16 // pattern is all capitals
7059
7060// ":spelldump"
7061void ex_spelldump(exarg_T *eap)
7062{
7063 char_u *spl;
7064 long dummy;
7065
7066 if (no_spell_checking(curwin)) {
7067 return;
7068 }
7069 get_option_value((char_u *)"spl", &dummy, &spl, OPT_LOCAL);
7070
7071 // Create a new empty buffer in a new window.
7072 do_cmdline_cmd("new");
7073
7074 // enable spelling locally in the new window
7075 set_option_value("spell", true, "", OPT_LOCAL);
7076 set_option_value("spl", dummy, (char *)spl, OPT_LOCAL);
7077 xfree(spl);
7078
7079 if (!BUFEMPTY()) {
7080 return;
7081 }
7082
7083 spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0);
7084
7085 // Delete the empty line that we started with.
7086 if (curbuf->b_ml.ml_line_count > 1) {
7087 ml_delete(curbuf->b_ml.ml_line_count, false);
7088 }
7089 redraw_later(NOT_VALID);
7090}
7091
7092// Go through all possible words and:
7093// 1. When "pat" is NULL: dump a list of all words in the current buffer.
7094// "ic" and "dir" are not used.
7095// 2. When "pat" is not NULL: add matching words to insert mode completion.
7096void
7097spell_dump_compl (
7098 char_u *pat, // leading part of the word
7099 int ic, // ignore case
7100 int *dir, // direction for adding matches
7101 int dumpflags_arg // DUMPFLAG_*
7102)
7103{
7104 langp_T *lp;
7105 slang_T *slang;
7106 idx_T arridx[MAXWLEN];
7107 int curi[MAXWLEN];
7108 char_u word[MAXWLEN];
7109 int c;
7110 char_u *byts;
7111 idx_T *idxs;
7112 linenr_T lnum = 0;
7113 int round;
7114 int depth;
7115 int n;
7116 int flags;
7117 char_u *region_names = NULL; // region names being used
7118 bool do_region = true; // dump region names and numbers
7119 char_u *p;
7120 int dumpflags = dumpflags_arg;
7121 int patlen;
7122
7123 // When ignoring case or when the pattern starts with capital pass this on
7124 // to dump_word().
7125 if (pat != NULL) {
7126 if (ic)
7127 dumpflags |= DUMPFLAG_ICASE;
7128 else {
7129 n = captype(pat, NULL);
7130 if (n == WF_ONECAP)
7131 dumpflags |= DUMPFLAG_ONECAP;
7132 else if (n == WF_ALLCAP
7133 && (int)STRLEN(pat) > mb_ptr2len(pat)
7134 )
7135 dumpflags |= DUMPFLAG_ALLCAP;
7136 }
7137 }
7138
7139 // Find out if we can support regions: All languages must support the same
7140 // regions or none at all.
7141 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
7142 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
7143 p = lp->lp_slang->sl_regions;
7144 if (p[0] != 0) {
7145 if (region_names == NULL) // first language with regions
7146 region_names = p;
7147 else if (STRCMP(region_names, p) != 0) {
7148 do_region = false; // region names are different
7149 break;
7150 }
7151 }
7152 }
7153
7154 if (do_region && region_names != NULL) {
7155 if (pat == NULL) {
7156 vim_snprintf((char *)IObuff, IOSIZE, "/regions=%s", region_names);
7157 ml_append(lnum++, IObuff, (colnr_T)0, false);
7158 }
7159 } else
7160 do_region = false;
7161
7162 // Loop over all files loaded for the entries in 'spelllang'.
7163 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
7164 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
7165 slang = lp->lp_slang;
7166 if (slang->sl_fbyts == NULL) // reloading failed
7167 continue;
7168
7169 if (pat == NULL) {
7170 vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname);
7171 ml_append(lnum++, IObuff, (colnr_T)0, false);
7172 }
7173
7174 // When matching with a pattern and there are no prefixes only use
7175 // parts of the tree that match "pat".
7176 if (pat != NULL && slang->sl_pbyts == NULL)
7177 patlen = (int)STRLEN(pat);
7178 else
7179 patlen = -1;
7180
7181 // round 1: case-folded tree
7182 // round 2: keep-case tree
7183 for (round = 1; round <= 2; ++round) {
7184 if (round == 1) {
7185 dumpflags &= ~DUMPFLAG_KEEPCASE;
7186 byts = slang->sl_fbyts;
7187 idxs = slang->sl_fidxs;
7188 } else {
7189 dumpflags |= DUMPFLAG_KEEPCASE;
7190 byts = slang->sl_kbyts;
7191 idxs = slang->sl_kidxs;
7192 }
7193 if (byts == NULL)
7194 continue; // array is empty
7195
7196 depth = 0;
7197 arridx[0] = 0;
7198 curi[0] = 1;
7199 while (depth >= 0 && !got_int
7200 && (pat == NULL || !compl_interrupted)) {
7201 if (curi[depth] > byts[arridx[depth]]) {
7202 // Done all bytes at this node, go up one level.
7203 --depth;
7204 line_breakcheck();
7205 ins_compl_check_keys(50, false);
7206 } else {
7207 // Do one more byte at this node.
7208 n = arridx[depth] + curi[depth];
7209 ++curi[depth];
7210 c = byts[n];
7211 if (c == 0) {
7212 // End of word, deal with the word.
7213 // Don't use keep-case words in the fold-case tree,
7214 // they will appear in the keep-case tree.
7215 // Only use the word when the region matches.
7216 flags = (int)idxs[n];
7217 if ((round == 2 || (flags & WF_KEEPCAP) == 0)
7218 && (flags & WF_NEEDCOMP) == 0
7219 && (do_region
7220 || (flags & WF_REGION) == 0
7221 || (((unsigned)flags >> 16)
7222 & lp->lp_region) != 0)) {
7223 word[depth] = NUL;
7224 if (!do_region)
7225 flags &= ~WF_REGION;
7226
7227 // Dump the basic word if there is no prefix or
7228 // when it's the first one.
7229 c = (unsigned)flags >> 24;
7230 if (c == 0 || curi[depth] == 2) {
7231 dump_word(slang, word, pat, dir,
7232 dumpflags, flags, lnum);
7233 if (pat == NULL)
7234 ++lnum;
7235 }
7236
7237 // Apply the prefix, if there is one.
7238 if (c != 0)
7239 lnum = dump_prefixes(slang, word, pat, dir,
7240 dumpflags, flags, lnum);
7241 }
7242 } else {
7243 // Normal char, go one level deeper.
7244 word[depth++] = c;
7245 arridx[depth] = idxs[n];
7246 curi[depth] = 1;
7247
7248 // Check if this characters matches with the pattern.
7249 // If not skip the whole tree below it.
7250 // Always ignore case here, dump_word() will check
7251 // proper case later. This isn't exactly right when
7252 // length changes for multi-byte characters with
7253 // ignore case...
7254 assert(depth >= 0);
7255 if (depth <= patlen
7256 && mb_strnicmp(word, pat, (size_t)depth) != 0)
7257 --depth;
7258 }
7259 }
7260 }
7261 }
7262 }
7263}
7264
7265// Dumps one word: apply case modifications and append a line to the buffer.
7266// When "lnum" is zero add insert mode completion.
7267static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int dumpflags, int wordflags, linenr_T lnum)
7268{
7269 bool keepcap = false;
7270 char_u *p;
7271 char_u *tw;
7272 char_u cword[MAXWLEN];
7273 char_u badword[MAXWLEN + 10];
7274 int i;
7275 int flags = wordflags;
7276
7277 if (dumpflags & DUMPFLAG_ONECAP)
7278 flags |= WF_ONECAP;
7279 if (dumpflags & DUMPFLAG_ALLCAP)
7280 flags |= WF_ALLCAP;
7281
7282 if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) {
7283 // Need to fix case according to "flags".
7284 make_case_word(word, cword, flags);
7285 p = cword;
7286 } else {
7287 p = word;
7288 if ((dumpflags & DUMPFLAG_KEEPCASE)
7289 && ((captype(word, NULL) & WF_KEEPCAP) == 0
7290 || (flags & WF_FIXCAP) != 0))
7291 keepcap = true;
7292 }
7293 tw = p;
7294
7295 if (pat == NULL) {
7296 // Add flags and regions after a slash.
7297 if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) {
7298 STRCPY(badword, p);
7299 STRCAT(badword, "/");
7300 if (keepcap) {
7301 STRCAT(badword, "=");
7302 }
7303 if (flags & WF_BANNED) {
7304 STRCAT(badword, "!");
7305 } else if (flags & WF_RARE) {
7306 STRCAT(badword, "?");
7307 }
7308 if (flags & WF_REGION) {
7309 for (i = 0; i < 7; i++) {
7310 if (flags & (0x10000 << i)) {
7311 const size_t badword_len = STRLEN(badword);
7312 snprintf((char *)badword + badword_len,
7313 sizeof(badword) - badword_len,
7314 "%d", i + 1);
7315 }
7316 }
7317 }
7318 p = badword;
7319 }
7320
7321 if (dumpflags & DUMPFLAG_COUNT) {
7322 hashitem_T *hi;
7323
7324 // Include the word count for ":spelldump!".
7325 hi = hash_find(&slang->sl_wordcount, tw);
7326 if (!HASHITEM_EMPTY(hi)) {
7327 vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d",
7328 tw, HI2WC(hi)->wc_count);
7329 p = IObuff;
7330 }
7331 }
7332
7333 ml_append(lnum, p, (colnr_T)0, false);
7334 } else if (((dumpflags & DUMPFLAG_ICASE)
7335 ? mb_strnicmp(p, pat, STRLEN(pat)) == 0
7336 : STRNCMP(p, pat, STRLEN(pat)) == 0)
7337 && ins_compl_add_infercase(p, (int)STRLEN(p),
7338 p_ic, NULL, *dir, false) == OK) {
7339 // if dir was BACKWARD then honor it just once
7340 *dir = FORWARD;
7341 }
7342}
7343
7344// For ":spelldump": Find matching prefixes for "word". Prepend each to
7345// "word" and append a line to the buffer.
7346// When "lnum" is zero add insert mode completion.
7347// Return the updated line number.
7348static linenr_T
7349dump_prefixes (
7350 slang_T *slang,
7351 char_u *word, // case-folded word
7352 char_u *pat,
7353 int *dir,
7354 int dumpflags,
7355 int flags, // flags with prefix ID
7356 linenr_T startlnum
7357)
7358{
7359 idx_T arridx[MAXWLEN];
7360 int curi[MAXWLEN];
7361 char_u prefix[MAXWLEN];
7362 char_u word_up[MAXWLEN];
7363 bool has_word_up = false;
7364 int c;
7365 char_u *byts;
7366 idx_T *idxs;
7367 linenr_T lnum = startlnum;
7368 int depth;
7369 int n;
7370 int len;
7371 int i;
7372
7373 // If the word starts with a lower-case letter make the word with an
7374 // upper-case letter in word_up[].
7375 c = PTR2CHAR(word);
7376 if (SPELL_TOUPPER(c) != c) {
7377 onecap_copy(word, word_up, true);
7378 has_word_up = true;
7379 }
7380
7381 byts = slang->sl_pbyts;
7382 idxs = slang->sl_pidxs;
7383 if (byts != NULL) { // array not is empty
7384 // Loop over all prefixes, building them byte-by-byte in prefix[].
7385 // When at the end of a prefix check that it supports "flags".
7386 depth = 0;
7387 arridx[0] = 0;
7388 curi[0] = 1;
7389 while (depth >= 0 && !got_int) {
7390 n = arridx[depth];
7391 len = byts[n];
7392 if (curi[depth] > len) {
7393 // Done all bytes at this node, go up one level.
7394 --depth;
7395 line_breakcheck();
7396 } else {
7397 // Do one more byte at this node.
7398 n += curi[depth];
7399 ++curi[depth];
7400 c = byts[n];
7401 if (c == 0) {
7402 // End of prefix, find out how many IDs there are.
7403 for (i = 1; i < len; ++i)
7404 if (byts[n + i] != 0)
7405 break;
7406 curi[depth] += i - 1;
7407
7408 c = valid_word_prefix(i, n, flags, word, slang, false);
7409 if (c != 0) {
7410 STRLCPY(prefix + depth, word, MAXWLEN - depth);
7411 dump_word(slang, prefix, pat, dir, dumpflags,
7412 (c & WF_RAREPFX) ? (flags | WF_RARE)
7413 : flags, lnum);
7414 if (lnum != 0)
7415 ++lnum;
7416 }
7417
7418 // Check for prefix that matches the word when the
7419 // first letter is upper-case, but only if the prefix has
7420 // a condition.
7421 if (has_word_up) {
7422 c = valid_word_prefix(i, n, flags, word_up, slang,
7423 true);
7424 if (c != 0) {
7425 STRLCPY(prefix + depth, word_up, MAXWLEN - depth);
7426 dump_word(slang, prefix, pat, dir, dumpflags,
7427 (c & WF_RAREPFX) ? (flags | WF_RARE)
7428 : flags, lnum);
7429 if (lnum != 0)
7430 ++lnum;
7431 }
7432 }
7433 } else {
7434 // Normal char, go one level deeper.
7435 prefix[depth++] = c;
7436 arridx[depth] = idxs[n];
7437 curi[depth] = 1;
7438 }
7439 }
7440 }
7441 }
7442
7443 return lnum;
7444}
7445
7446// Move "p" to the end of word "start".
7447// Uses the spell-checking word characters.
7448char_u *spell_to_word_end(char_u *start, win_T *win)
7449{
7450 char_u *p = start;
7451
7452 while (*p != NUL && spell_iswordp(p, win)) {
7453 MB_PTR_ADV(p);
7454 }
7455 return p;
7456}
7457
7458// For Insert mode completion CTRL-X s:
7459// Find start of the word in front of column "startcol".
7460// We don't check if it is badly spelled, with completion we can only change
7461// the word in front of the cursor.
7462// Returns the column number of the word.
7463int spell_word_start(int startcol)
7464{
7465 char_u *line;
7466 char_u *p;
7467 int col = 0;
7468
7469 if (no_spell_checking(curwin)) {
7470 return startcol;
7471 }
7472
7473 // Find a word character before "startcol".
7474 line = get_cursor_line_ptr();
7475 for (p = line + startcol; p > line; ) {
7476 MB_PTR_BACK(line, p);
7477 if (spell_iswordp_nmw(p, curwin)) {
7478 break;
7479 }
7480 }
7481
7482 // Go back to start of the word.
7483 while (p > line) {
7484 col = (int)(p - line);
7485 MB_PTR_BACK(line, p);
7486 if (!spell_iswordp(p, curwin)) {
7487 break;
7488 }
7489 col = 0;
7490 }
7491
7492 return col;
7493}
7494
7495// Need to check for 'spellcapcheck' now, the word is removed before
7496// expand_spelling() is called. Therefore the ugly global variable.
7497static bool spell_expand_need_cap;
7498
7499void spell_expand_check_cap(colnr_T col)
7500{
7501 spell_expand_need_cap = check_need_cap(curwin->w_cursor.lnum, col);
7502}
7503
7504// Get list of spelling suggestions.
7505// Used for Insert mode completion CTRL-X ?.
7506// Returns the number of matches. The matches are in "matchp[]", array of
7507// allocated strings.
7508int expand_spelling(linenr_T lnum, char_u *pat, char_u ***matchp)
7509{
7510 garray_T ga;
7511
7512 spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, true);
7513 *matchp = ga.ga_data;
7514 return ga.ga_len;
7515}
7516