1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | // spell.c: code for spell checking |
5 | // |
6 | // See spellfile.c for the Vim spell file format. |
7 | // |
8 | // The spell checking mechanism uses a tree (aka trie). Each node in the tree |
9 | // has a list of bytes that can appear (siblings). For each byte there is a |
10 | // pointer to the node with the byte that follows in the word (child). |
11 | // |
12 | // A NUL byte is used where the word may end. The bytes are sorted, so that |
13 | // binary searching can be used and the NUL bytes are at the start. The |
14 | // number of possible bytes is stored before the list of bytes. |
15 | // |
16 | // The tree uses two arrays: "byts" stores the characters, "idxs" stores |
17 | // either the next index or flags. The tree starts at index 0. For example, |
18 | // to lookup "vi" this sequence is followed: |
19 | // i = 0 |
20 | // len = byts[i] |
21 | // n = where "v" appears in byts[i + 1] to byts[i + len] |
22 | // i = idxs[n] |
23 | // len = byts[i] |
24 | // n = where "i" appears in byts[i + 1] to byts[i + len] |
25 | // i = idxs[n] |
26 | // len = byts[i] |
27 | // find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi". |
28 | // |
29 | // There are two word trees: one with case-folded words and one with words in |
30 | // original case. The second one is only used for keep-case words and is |
31 | // usually small. |
32 | // |
33 | // There is one additional tree for when not all prefixes are applied when |
34 | // generating the .spl file. This tree stores all the possible prefixes, as |
35 | // if they were words. At each word (prefix) end the prefix nr is stored, the |
36 | // following word must support this prefix nr. And the condition nr is |
37 | // stored, used to lookup the condition that the word must match with. |
38 | // |
39 | // Thanks to Olaf Seibert for providing an example implementation of this tree |
40 | // and the compression mechanism. |
41 | // LZ trie ideas: |
42 | // http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf |
43 | // More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html |
44 | // |
45 | // Matching involves checking the caps type: Onecap ALLCAP KeepCap. |
46 | // |
47 | // Why doesn't Vim use aspell/ispell/myspell/etc.? |
48 | // See ":help develop-spell". |
49 | |
50 | // Use SPELL_PRINTTREE for debugging: dump the word tree after adding a word. |
51 | // Only use it for small word lists! |
52 | |
53 | // Use SPELL_COMPRESS_ALLWAYS for debugging: compress the word tree after |
54 | // adding a word. Only use it for small word lists! |
55 | |
56 | // Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk() for a |
57 | // specific word. |
58 | |
59 | // Use this to adjust the score after finding suggestions, based on the |
60 | // suggested word sounding like the bad word. This is much faster than doing |
61 | // it for every possible suggestion. |
62 | // Disadvantage: When "the" is typed as "hte" it sounds quite different ("@" |
63 | // vs "ht") and goes down in the list. |
64 | // Used when 'spellsuggest' is set to "best". |
// Weighted average: three parts word score, one part sound score.
// Both arguments are parenthesized so that expression arguments such as
// "a + b" keep their intended grouping after expansion.
#define RESCORE(word_score, sound_score) \
  ((3 * (word_score) + (sound_score)) / 4)
66 | |
67 | // Do the opposite: based on a maximum end score and a known sound score, |
68 | // compute the maximum word score that can be used. |
// Inverse of the 3:1 weighting used by RESCORE().
// Both arguments are parenthesized so that expression arguments such as
// "a + b" keep their intended grouping after expansion.
#define MAXSCORE(word_score, sound_score) \
  ((4 * (word_score) - (sound_score)) / 3)
70 | |
71 | #include <assert.h> |
72 | #include <inttypes.h> |
73 | #include <limits.h> |
74 | #include <stdbool.h> |
75 | #include <string.h> |
76 | #include <stdlib.h> |
77 | #include <wctype.h> |
78 | |
79 | /* for offsetof() */ |
80 | #include <stddef.h> |
81 | |
82 | #include "nvim/vim.h" |
83 | #include "nvim/ascii.h" |
84 | #include "nvim/spell.h" |
85 | #include "nvim/buffer.h" |
86 | #include "nvim/change.h" |
87 | #include "nvim/charset.h" |
88 | #include "nvim/cursor.h" |
89 | #include "nvim/edit.h" |
90 | #include "nvim/eval.h" |
91 | #include "nvim/ex_cmds.h" |
92 | #include "nvim/ex_cmds2.h" |
93 | #include "nvim/ex_docmd.h" |
94 | #include "nvim/fileio.h" |
95 | #include "nvim/func_attr.h" |
96 | #include "nvim/getchar.h" |
97 | #include "nvim/hashtab.h" |
98 | #include "nvim/mark.h" |
99 | #include "nvim/mbyte.h" |
100 | #include "nvim/memline.h" |
101 | #include "nvim/memory.h" |
102 | #include "nvim/message.h" |
103 | #include "nvim/misc1.h" |
104 | #include "nvim/garray.h" |
105 | #include "nvim/normal.h" |
106 | #include "nvim/option.h" |
107 | #include "nvim/os_unix.h" |
108 | #include "nvim/path.h" |
109 | #include "nvim/regexp.h" |
110 | #include "nvim/screen.h" |
111 | #include "nvim/search.h" |
112 | #include "nvim/spellfile.h" |
113 | #include "nvim/strings.h" |
114 | #include "nvim/syntax.h" |
115 | #include "nvim/undo.h" |
116 | #include "nvim/os/os.h" |
117 | #include "nvim/os/input.h" |
118 | |
119 | // only used for su_badflags |
120 | #define WF_MIXCAP 0x20 // mix of upper and lower case: macaRONI |
121 | |
122 | #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP) |
123 | |
124 | // Result values. Lower number is accepted over higher one. |
125 | #define SP_BANNED -1 |
126 | #define SP_RARE 0 |
127 | #define SP_OK 1 |
128 | #define SP_LOCAL 2 |
129 | #define SP_BAD 3 |
130 | |
131 | // First language that is loaded, start of the linked list of loaded |
132 | // languages. |
133 | slang_T *first_lang = NULL; |
134 | |
135 | // file used for "zG" and "zW" |
136 | char_u *int_wordlist = NULL; |
137 | |
// Counter for how often a word was seen.
// The word itself is stored directly behind the counter; "wc_word" is
// declared with one element but the entry is allocated longer.
typedef struct wordcount_S {
  uint16_t wc_count;        // nr of times word was seen
  char_u wc_word[1];        // word, actually longer
} wordcount_T;

// Byte offset of "wc_word" inside wordcount_T.
#define WC_KEY_OFF offsetof(wordcount_T, wc_word)
// Get the wordcount_T from a hash item: "hi_key" is taken to point at
// "wc_word", so stepping back WC_KEY_OFF bytes gives the start of the entry.
#define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF))
// Equal to the largest value a uint16_t (the type of "wc_count") can hold.
#define MAXWORDCOUNT 0xffff
146 | |
// Information used when looking for suggestions.
// Bundles the two suggestion lists ("su_ga" and "su_sga"), the score limits
// for adding to them, and several forms of the bad word (as typed,
// case-folded and sound-folded).
typedef struct suginfo_S {
  garray_T su_ga;                   // suggestions, contains "suggest_T"
  int su_maxcount;                  // max. number of suggestions displayed
  int su_maxscore;                  // maximum score for adding to su_ga
  int su_sfmaxscore;                // idem, for when doing soundfold words
  garray_T su_sga;                  // like su_ga, sound-folded scoring
  char_u *su_badptr;                // start of bad word in line
  int su_badlen;                    // length of detected bad word in line
  int su_badflags;                  // caps flags for bad word
  char_u su_badword[MAXWLEN];       // bad word truncated at su_badlen
  char_u su_fbadword[MAXWLEN];      // su_badword case-folded
  char_u su_sal_badword[MAXWLEN];   // su_badword soundfolded
  hashtab_T su_banned;              // table with banned words
  slang_T *su_sallang;              // default language for sound folding
} suginfo_T;
163 | |
// One word suggestion.  Used in "su_ga" of suginfo_T.
typedef struct {
  char_u *st_word;          // suggested word, allocated string
  int st_wordlen;           // STRLEN(st_word)
  int st_orglen;            // length of replaced text
  int st_score;             // lower is better
  int st_altscore;          // used when st_score compares equal
  bool st_salscore;         // st_score is for soundalike
  bool st_had_bonus;        // bonus already included in score
  slang_T *st_slang;        // language used for sound folding
} suggest_T;

// Access entry "i" of growarray "ga", viewing its ga_data as an array of
// suggest_T.
#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
177 | |
178 | // True if a word appears in the list of banned words. |
// "su" is parenthesized so that any pointer expression can be passed, not
// only a plain identifier.
#define WAS_BANNED(su, word) \
  (!HASHITEM_EMPTY(hash_find(&(su)->su_banned, word)))
180 | |
181 | // Number of suggestions kept when cleaning up. We need to keep more than |
182 | // what is displayed, because when rescore_suggestions() is called the score |
183 | // may change and wrong suggestions may be removed later. |
184 | #define SUG_CLEAN_COUNT(su) ((su)->su_maxcount < \ |
185 | 130 ? 150 : (su)->su_maxcount + 20) |
186 | |
187 | // Threshold for sorting and cleaning up suggestions. Don't want to keep lots |
188 | // of suggestions that are not going to be displayed. |
189 | #define SUG_MAX_COUNT(su) (SUG_CLEAN_COUNT(su) + 50) |
190 | |
// score for various changes
#define SCORE_SPLIT     149     // split bad word
#define SCORE_SPLIT_NO  249     // split bad word with NOSPLITSUGS
#define SCORE_ICASE     52      // slightly different case
#define SCORE_REGION    200     // word is for different region
#define SCORE_RARE      180     // rare word
#define SCORE_SWAP      75      // swap two characters
#define SCORE_SWAP3     110     // swap two characters in three
#define SCORE_REP       65      // REP replacement
#define SCORE_SUBST     93      // substitute a character
#define SCORE_SIMILAR   33      // substitute a similar character
#define SCORE_SUBCOMP   33      // substitute a composing character
#define SCORE_DEL       94      // delete a character
#define SCORE_DELDUP    66      // delete a duplicated character
#define SCORE_DELCOMP   28      // delete a composing character
#define SCORE_INS       96      // insert a character
#define SCORE_INSDUP    67      // insert a duplicate character
#define SCORE_INSCOMP   30      // insert a composing character
#define SCORE_NONWORD   103     // change non-word to word char

#define SCORE_FILE      30      // suggestion from a file
#define SCORE_MAXINIT   350     // Initial maximum score: higher == slower.
                                // 350 allows for about three changes.

#define SCORE_COMMON1   30      // subtracted for words seen before
#define SCORE_COMMON2   40      // subtracted for words often seen
#define SCORE_COMMON3   50      // subtracted for words very often seen
#define SCORE_THRES2    10      // word count threshold for COMMON2
#define SCORE_THRES3    100     // word count threshold for COMMON3

// When trying changed soundfold words it becomes slow when trying more than
// two changes.  With less than two changes it's slightly faster but we miss a
// few good suggestions.  In rare cases we need to try three or four changes.
#define SCORE_SFMAX1    200     // maximum score for first try
#define SCORE_SFMAX2    300     // maximum score for second try
#define SCORE_SFMAX3    400     // maximum score for third try

// The replacement list is parenthesized so that SCORE_BIG behaves as a single
// value in any expression (e.g. "x / SCORE_BIG").
#define SCORE_BIG       (SCORE_INS * 3)  // big difference
#define SCORE_MAXMAX    999999  // accept any score
#define SCORE_LIMITMAX  350     // for spell_edit_score_limit()

// for spell_edit_score_limit() we need to know the minimum value of
// SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS
#define SCORE_EDIT_MIN  SCORE_SIMILAR
235 | |
236 | // Structure to store info for word matching. |
237 | typedef struct matchinf_S { |
238 | langp_T *mi_lp; // info for language and region |
239 | |
240 | // pointers to original text to be checked |
241 | char_u *mi_word; // start of word being checked |
242 | char_u *mi_end; // end of matching word so far |
243 | char_u *mi_fend; // next char to be added to mi_fword |
244 | char_u *mi_cend; // char after what was used for |
245 | // mi_capflags |
246 | |
247 | // case-folded text |
248 | char_u mi_fword[MAXWLEN + 1]; // mi_word case-folded |
249 | int mi_fwordlen; // nr of valid bytes in mi_fword |
250 | |
251 | // for when checking word after a prefix |
252 | int mi_prefarridx; // index in sl_pidxs with list of |
253 | // affixID/condition |
254 | int mi_prefcnt; // number of entries at mi_prefarridx |
255 | int mi_prefixlen; // byte length of prefix |
256 | int mi_cprefixlen; // byte length of prefix in original |
257 | // case |
258 | |
259 | // for when checking a compound word |
260 | int mi_compoff; // start of following word offset |
261 | char_u mi_compflags[MAXWLEN]; // flags for compound words used |
262 | int mi_complen; // nr of compound words used |
263 | int ; // nr of COMPOUNDROOT words |
264 | |
265 | // others |
266 | int mi_result; // result so far: SP_BAD, SP_OK, etc. |
267 | int mi_capflags; // WF_ONECAP WF_ALLCAP WF_KEEPCAP |
268 | win_T *mi_win; // buffer being checked |
269 | |
270 | // for NOBREAK |
271 | int mi_result2; // "mi_resul" without following word |
272 | char_u *mi_end2; // "mi_end" without following word |
273 | } matchinf_T; |
274 | |
// Structure used for the cookie argument of do_in_runtimepath().
typedef struct spelload_S {
  char_u sl_lang[MAXWLEN + 1];      // language name
  slang_T *sl_slang;                // resulting slang_T struct
  int sl_nobreak;                   // NOBREAK language found
} spelload_T;
281 | |
// Size of the "sy_chars" buffer in syl_item_T.
#define SY_MAXLEN   30
// One sequence of characters together with its length.
// NOTE(review): presumably one syllable item of a language; the users of
// this type are outside this part of the file -- confirm.
typedef struct syl_item_S {
  char_u sy_chars[SY_MAXLEN];       // the sequence of chars
  int sy_len;                       // NOTE(review): presumably the number of
                                    // bytes used in sy_chars -- confirm
} syl_item_T;
287 | |
288 | spelltab_T spelltab; |
289 | int did_set_spelltab; |
290 | |
// structure used to store soundfolded words that add_sound_suggest() has
// handled already.
// The word is stored directly behind the score; "sft_word" is declared with
// one element but the entry is allocated longer.
typedef struct {
  short sft_score;          // lowest score used
  char_u sft_word[1];       // soundfolded word, actually longer
} sftword_T;
297 | |
// Triple of two indexes and a score.
// NOTE(review): the code using this type is outside this part of the file.
// Presumably "badi" and "goodi" are positions in the bad and the good word
// and "score" is the score reached at that point -- confirm against the
// user of this type.
typedef struct {
  int badi;
  int goodi;
  int score;
} limitscore_T;
303 | |
304 | |
305 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
306 | # include "spell.c.generated.h" |
307 | #endif |
308 | |
309 | // values for ts_isdiff |
310 | #define DIFF_NONE 0 // no different byte (yet) |
311 | #define DIFF_YES 1 // different byte found |
312 | #define DIFF_INSERT 2 // inserting character |
313 | |
314 | // values for ts_flags |
315 | #define TSF_PREFIXOK 1 // already checked that prefix is OK |
316 | #define TSF_DIDSPLIT 2 // tried split at this point |
317 | #define TSF_DIDDEL 4 // did a delete, "ts_delidx" has index |
318 | |
319 | // special values ts_prefixdepth |
320 | #define PFD_NOPREFIX 0xff // not using prefixes |
321 | #define PFD_PREFIXTREE 0xfe // walking through the prefix tree |
322 | #define PFD_NOTSPECIAL 0xfd // highest value that's not special |
323 | |
324 | // mode values for find_word |
325 | #define FIND_FOLDWORD 0 // find word case-folded |
326 | #define FIND_KEEPWORD 1 // find keep-case word |
327 | #define FIND_PREFIX 2 // find word after prefix |
328 | #define FIND_COMPOUND 3 // find case-folded compound word |
329 | #define FIND_KEEPCOMPOUND 4 // find keep-case compound word |
330 | |
331 | char *e_format = N_("E759: Format error in spell file" ); |
332 | |
333 | // Remember what "z?" replaced. |
334 | static char_u *repl_from = NULL; |
335 | static char_u *repl_to = NULL; |
336 | |
337 | // Main spell-checking function. |
338 | // "ptr" points to a character that could be the start of a word. |
339 | // "*attrp" is set to the highlight index for a badly spelled word. For a |
340 | // non-word or when it's OK it remains unchanged. |
341 | // This must only be called when 'spelllang' is not empty. |
342 | // |
343 | // "capcol" is used to check for a Capitalised word after the end of a |
344 | // sentence. If it's zero then perform the check. Return the column where to |
345 | // check next, or -1 when no sentence end was found. If it's NULL then don't |
346 | // worry. |
347 | // |
348 | // Returns the length of the word in bytes, also when it's OK, so that the |
349 | // caller can skip over the word. |
350 | size_t spell_check( |
351 | win_T *wp, // current window |
352 | char_u *ptr, |
353 | hlf_T *attrp, |
354 | int *capcol, // column to check for Capital |
355 | bool docount // count good words |
356 | ) |
357 | { |
358 | matchinf_T mi; // Most things are put in "mi" so that it can |
359 | // be passed to functions quickly. |
360 | size_t nrlen = 0; // found a number first |
361 | int c; |
362 | size_t wrongcaplen = 0; |
363 | int lpi; |
364 | bool count_word = docount; |
365 | |
366 | // A word never starts at a space or a control character. Return quickly |
367 | // then, skipping over the character. |
368 | if (*ptr <= ' ') { |
369 | return 1; |
370 | } |
371 | |
372 | // Return here when loading language files failed. |
373 | if (GA_EMPTY(&wp->w_s->b_langp)) { |
374 | return 1; |
375 | } |
376 | |
377 | memset(&mi, 0, sizeof(matchinf_T)); |
378 | |
379 | // A number is always OK. Also skip hexadecimal numbers 0xFF99 and |
380 | // 0X99FF. But always do check spelling to find "3GPP" and "11 |
381 | // julifeest". |
382 | if (*ptr >= '0' && *ptr <= '9') { |
383 | if (*ptr == '0' && (ptr[1] == 'b' || ptr[1] == 'B')) { |
384 | mi.mi_end = (char_u*) skipbin((char*) ptr + 2); |
385 | } else if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X')) { |
386 | mi.mi_end = skiphex(ptr + 2); |
387 | } else { |
388 | mi.mi_end = skipdigits(ptr); |
389 | } |
390 | nrlen = (size_t)(mi.mi_end - ptr); |
391 | } |
392 | |
393 | // Find the normal end of the word (until the next non-word character). |
394 | mi.mi_word = ptr; |
395 | mi.mi_fend = ptr; |
396 | if (spell_iswordp(mi.mi_fend, wp)) { |
397 | do { |
398 | MB_PTR_ADV(mi.mi_fend); |
399 | } while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp)); |
400 | |
401 | if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL) { |
402 | // Check word starting with capital letter. |
403 | c = PTR2CHAR(ptr); |
404 | if (!SPELL_ISUPPER(c)) { |
405 | wrongcaplen = (size_t)(mi.mi_fend - ptr); |
406 | } |
407 | } |
408 | } |
409 | if (capcol != NULL) { |
410 | *capcol = -1; |
411 | } |
412 | |
413 | // We always use the characters up to the next non-word character, |
414 | // also for bad words. |
415 | mi.mi_end = mi.mi_fend; |
416 | |
417 | // Check caps type later. |
418 | mi.mi_capflags = 0; |
419 | mi.mi_cend = NULL; |
420 | mi.mi_win = wp; |
421 | |
422 | // case-fold the word with one non-word character, so that we can check |
423 | // for the word end. |
424 | if (*mi.mi_fend != NUL) { |
425 | MB_PTR_ADV(mi.mi_fend); |
426 | } |
427 | |
428 | (void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword, MAXWLEN + 1); |
429 | mi.mi_fwordlen = (int)STRLEN(mi.mi_fword); |
430 | |
431 | // The word is bad unless we recognize it. |
432 | mi.mi_result = SP_BAD; |
433 | mi.mi_result2 = SP_BAD; |
434 | |
435 | // Loop over the languages specified in 'spelllang'. |
436 | // We check them all, because a word may be matched longer in another |
437 | // language. |
438 | for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi) { |
439 | mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi); |
440 | |
441 | // If reloading fails the language is still in the list but everything |
442 | // has been cleared. |
443 | if (mi.mi_lp->lp_slang->sl_fidxs == NULL) { |
444 | continue; |
445 | } |
446 | |
447 | // Check for a matching word in case-folded words. |
448 | find_word(&mi, FIND_FOLDWORD); |
449 | |
450 | // Check for a matching word in keep-case words. |
451 | find_word(&mi, FIND_KEEPWORD); |
452 | |
453 | // Check for matching prefixes. |
454 | find_prefix(&mi, FIND_FOLDWORD); |
455 | |
456 | // For a NOBREAK language, may want to use a word without a following |
457 | // word as a backup. |
458 | if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD |
459 | && mi.mi_result2 != SP_BAD) { |
460 | mi.mi_result = mi.mi_result2; |
461 | mi.mi_end = mi.mi_end2; |
462 | } |
463 | |
464 | // Count the word in the first language where it's found to be OK. |
465 | if (count_word && mi.mi_result == SP_OK) { |
466 | count_common_word(mi.mi_lp->lp_slang, ptr, |
467 | (int)(mi.mi_end - ptr), 1); |
468 | count_word = false; |
469 | } |
470 | } |
471 | |
472 | if (mi.mi_result != SP_OK) { |
473 | // If we found a number skip over it. Allows for "42nd". Do flag |
474 | // rare and local words, e.g., "3GPP". |
475 | if (nrlen > 0) { |
476 | if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) { |
477 | return nrlen; |
478 | } |
479 | } else if (!spell_iswordp_nmw(ptr, wp)) { |
480 | // When we are at a non-word character there is no error, just |
481 | // skip over the character (try looking for a word after it). |
482 | if (capcol != NULL && wp->w_s->b_cap_prog != NULL) { |
483 | regmatch_T regmatch; |
484 | |
485 | // Check for end of sentence. |
486 | regmatch.regprog = wp->w_s->b_cap_prog; |
487 | regmatch.rm_ic = false; |
488 | int r = vim_regexec(®match, ptr, 0); |
489 | wp->w_s->b_cap_prog = regmatch.regprog; |
490 | if (r) { |
491 | *capcol = (int)(regmatch.endp[0] - ptr); |
492 | } |
493 | } |
494 | |
495 | if (has_mbyte) { |
496 | return (size_t)(*mb_ptr2len)(ptr); |
497 | } |
498 | return 1; |
499 | } else if (mi.mi_end == ptr) { |
500 | // Always include at least one character. Required for when there |
501 | // is a mixup in "midword". |
502 | MB_PTR_ADV(mi.mi_end); |
503 | } else if (mi.mi_result == SP_BAD |
504 | && LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak) { |
505 | char_u *p, *fp; |
506 | int save_result = mi.mi_result; |
507 | |
508 | // First language in 'spelllang' is NOBREAK. Find first position |
509 | // at which any word would be valid. |
510 | mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0); |
511 | if (mi.mi_lp->lp_slang->sl_fidxs != NULL) { |
512 | p = mi.mi_word; |
513 | fp = mi.mi_fword; |
514 | for (;;) { |
515 | MB_PTR_ADV(p); |
516 | MB_PTR_ADV(fp); |
517 | if (p >= mi.mi_end) { |
518 | break; |
519 | } |
520 | mi.mi_compoff = (int)(fp - mi.mi_fword); |
521 | find_word(&mi, FIND_COMPOUND); |
522 | if (mi.mi_result != SP_BAD) { |
523 | mi.mi_end = p; |
524 | break; |
525 | } |
526 | } |
527 | mi.mi_result = save_result; |
528 | } |
529 | } |
530 | |
531 | if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED) { |
532 | *attrp = HLF_SPB; |
533 | } else if (mi.mi_result == SP_RARE) { |
534 | *attrp = HLF_SPR; |
535 | } else { |
536 | *attrp = HLF_SPL; |
537 | } |
538 | } |
539 | |
540 | if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE)) { |
541 | // Report SpellCap only when the word isn't badly spelled. |
542 | *attrp = HLF_SPC; |
543 | return wrongcaplen; |
544 | } |
545 | |
546 | return (size_t)(mi.mi_end - ptr); |
547 | } |
548 | |
549 | // Check if the word at "mip->mi_word" is in the tree. |
550 | // When "mode" is FIND_FOLDWORD check in fold-case word tree. |
551 | // When "mode" is FIND_KEEPWORD check in keep-case word tree. |
552 | // When "mode" is FIND_PREFIX check for word after prefix in fold-case word |
553 | // tree. |
554 | // |
555 | // For a match mip->mi_result is updated. |
556 | static void find_word(matchinf_T *mip, int mode) |
557 | { |
558 | int wlen = 0; |
559 | int flen; |
560 | char_u *ptr; |
561 | slang_T *slang = mip->mi_lp->lp_slang; |
562 | char_u *byts; |
563 | idx_T *idxs; |
564 | |
565 | if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) { |
566 | // Check for word with matching case in keep-case tree. |
567 | ptr = mip->mi_word; |
568 | flen = 9999; // no case folding, always enough bytes |
569 | byts = slang->sl_kbyts; |
570 | idxs = slang->sl_kidxs; |
571 | |
572 | if (mode == FIND_KEEPCOMPOUND) |
573 | // Skip over the previously found word(s). |
574 | wlen += mip->mi_compoff; |
575 | } else { |
576 | // Check for case-folded in case-folded tree. |
577 | ptr = mip->mi_fword; |
578 | flen = mip->mi_fwordlen; // available case-folded bytes |
579 | byts = slang->sl_fbyts; |
580 | idxs = slang->sl_fidxs; |
581 | |
582 | if (mode == FIND_PREFIX) { |
583 | // Skip over the prefix. |
584 | wlen = mip->mi_prefixlen; |
585 | flen -= mip->mi_prefixlen; |
586 | } else if (mode == FIND_COMPOUND) { |
587 | // Skip over the previously found word(s). |
588 | wlen = mip->mi_compoff; |
589 | flen -= mip->mi_compoff; |
590 | } |
591 | |
592 | } |
593 | |
594 | if (byts == NULL) |
595 | return; // array is empty |
596 | |
597 | idx_T arridx = 0; |
598 | int endlen[MAXWLEN]; // length at possible word endings |
599 | idx_T endidx[MAXWLEN]; // possible word endings |
600 | int endidxcnt = 0; |
601 | int len; |
602 | int c; |
603 | |
604 | // Repeat advancing in the tree until: |
605 | // - there is a byte that doesn't match, |
606 | // - we reach the end of the tree, |
607 | // - or we reach the end of the line. |
608 | for (;; ) { |
609 | if (flen <= 0 && *mip->mi_fend != NUL) |
610 | flen = fold_more(mip); |
611 | |
612 | len = byts[arridx++]; |
613 | |
614 | // If the first possible byte is a zero the word could end here. |
615 | // Remember this index, we first check for the longest word. |
616 | if (byts[arridx] == 0) { |
617 | if (endidxcnt == MAXWLEN) { |
618 | // Must be a corrupted spell file. |
619 | EMSG(_(e_format)); |
620 | return; |
621 | } |
622 | endlen[endidxcnt] = wlen; |
623 | endidx[endidxcnt++] = arridx++; |
624 | --len; |
625 | |
626 | // Skip over the zeros, there can be several flag/region |
627 | // combinations. |
628 | while (len > 0 && byts[arridx] == 0) { |
629 | ++arridx; |
630 | --len; |
631 | } |
632 | if (len == 0) |
633 | break; // no children, word must end here |
634 | } |
635 | |
636 | // Stop looking at end of the line. |
637 | if (ptr[wlen] == NUL) |
638 | break; |
639 | |
640 | // Perform a binary search in the list of accepted bytes. |
641 | c = ptr[wlen]; |
642 | if (c == TAB) // <Tab> is handled like <Space> |
643 | c = ' '; |
644 | idx_T lo = arridx; |
645 | idx_T hi = arridx + len - 1; |
646 | while (lo < hi) { |
647 | idx_T m = (lo + hi) / 2; |
648 | if (byts[m] > c) |
649 | hi = m - 1; |
650 | else if (byts[m] < c) |
651 | lo = m + 1; |
652 | else { |
653 | lo = hi = m; |
654 | break; |
655 | } |
656 | } |
657 | |
658 | // Stop if there is no matching byte. |
659 | if (hi < lo || byts[lo] != c) |
660 | break; |
661 | |
662 | // Continue at the child (if there is one). |
663 | arridx = idxs[lo]; |
664 | ++wlen; |
665 | --flen; |
666 | |
667 | // One space in the good word may stand for several spaces in the |
668 | // checked word. |
669 | if (c == ' ') { |
670 | for (;; ) { |
671 | if (flen <= 0 && *mip->mi_fend != NUL) |
672 | flen = fold_more(mip); |
673 | if (ptr[wlen] != ' ' && ptr[wlen] != TAB) |
674 | break; |
675 | ++wlen; |
676 | --flen; |
677 | } |
678 | } |
679 | } |
680 | |
681 | char_u *p; |
682 | bool word_ends; |
683 | |
684 | // Verify that one of the possible endings is valid. Try the longest |
685 | // first. |
686 | while (endidxcnt > 0) { |
687 | --endidxcnt; |
688 | arridx = endidx[endidxcnt]; |
689 | wlen = endlen[endidxcnt]; |
690 | |
691 | if (utf_head_off(ptr, ptr + wlen) > 0) { |
692 | continue; // not at first byte of character |
693 | } |
694 | if (spell_iswordp(ptr + wlen, mip->mi_win)) { |
695 | if (slang->sl_compprog == NULL && !slang->sl_nobreak) |
696 | continue; // next char is a word character |
697 | word_ends = false; |
698 | } else |
699 | word_ends = true; |
700 | // The prefix flag is before compound flags. Once a valid prefix flag |
701 | // has been found we try compound flags. |
702 | bool prefix_found = false; |
703 | |
704 | if (mode != FIND_KEEPWORD && has_mbyte) { |
705 | // Compute byte length in original word, length may change |
706 | // when folding case. This can be slow, take a shortcut when the |
707 | // case-folded word is equal to the keep-case word. |
708 | p = mip->mi_word; |
709 | if (STRNCMP(ptr, p, wlen) != 0) { |
710 | for (char_u *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) { |
711 | MB_PTR_ADV(p); |
712 | } |
713 | wlen = (int)(p - mip->mi_word); |
714 | } |
715 | } |
716 | |
717 | // Check flags and region. For FIND_PREFIX check the condition and |
718 | // prefix ID. |
719 | // Repeat this if there are more flags/region alternatives until there |
720 | // is a match. |
721 | for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; |
722 | --len, ++arridx) { |
723 | uint32_t flags = idxs[arridx]; |
724 | |
725 | // For the fold-case tree check that the case of the checked word |
726 | // matches with what the word in the tree requires. |
727 | // For keep-case tree the case is always right. For prefixes we |
728 | // don't bother to check. |
729 | if (mode == FIND_FOLDWORD) { |
730 | if (mip->mi_cend != mip->mi_word + wlen) { |
731 | // mi_capflags was set for a different word length, need |
732 | // to do it again. |
733 | mip->mi_cend = mip->mi_word + wlen; |
734 | mip->mi_capflags = captype(mip->mi_word, mip->mi_cend); |
735 | } |
736 | |
737 | if (mip->mi_capflags == WF_KEEPCAP |
738 | || !spell_valid_case(mip->mi_capflags, flags)) |
739 | continue; |
740 | } |
741 | // When mode is FIND_PREFIX the word must support the prefix: |
742 | // check the prefix ID and the condition. Do that for the list at |
743 | // mip->mi_prefarridx that find_prefix() filled. |
744 | else if (mode == FIND_PREFIX && !prefix_found) { |
745 | c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx, |
746 | flags, |
747 | mip->mi_word + mip->mi_cprefixlen, slang, |
748 | false); |
749 | if (c == 0) |
750 | continue; |
751 | |
752 | // Use the WF_RARE flag for a rare prefix. |
753 | if (c & WF_RAREPFX) |
754 | flags |= WF_RARE; |
755 | prefix_found = true; |
756 | } |
757 | |
758 | if (slang->sl_nobreak) { |
759 | if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND) |
760 | && (flags & WF_BANNED) == 0) { |
761 | // NOBREAK: found a valid following word. That's all we |
762 | // need to know, so return. |
763 | mip->mi_result = SP_OK; |
764 | break; |
765 | } |
766 | } else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND |
767 | || !word_ends)) { |
768 | // If there is no compound flag or the word is shorter than |
769 | // COMPOUNDMIN reject it quickly. |
770 | // Makes you wonder why someone puts a compound flag on a word |
771 | // that's too short... Myspell compatibility requires this |
772 | // anyway. |
773 | if (((unsigned)flags >> 24) == 0 |
774 | || wlen - mip->mi_compoff < slang->sl_compminlen) |
775 | continue; |
776 | // For multi-byte chars check character length against |
777 | // COMPOUNDMIN. |
778 | if (has_mbyte |
779 | && slang->sl_compminlen > 0 |
780 | && mb_charlen_len(mip->mi_word + mip->mi_compoff, |
781 | wlen - mip->mi_compoff) < slang->sl_compminlen) |
782 | continue; |
783 | |
784 | // Limit the number of compound words to COMPOUNDWORDMAX if no |
785 | // maximum for syllables is specified. |
786 | if (!word_ends && mip->mi_complen + mip->mi_compextra + 2 |
787 | > slang->sl_compmax |
788 | && slang->sl_compsylmax == MAXWLEN) |
789 | continue; |
790 | |
791 | // Don't allow compounding on a side where an affix was added, |
792 | // unless COMPOUNDPERMITFLAG was used. |
793 | if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF)) |
794 | continue; |
795 | if (!word_ends && (flags & WF_NOCOMPAFT)) |
796 | continue; |
797 | |
798 | // Quickly check if compounding is possible with this flag. |
799 | if (!byte_in_str(mip->mi_complen == 0 |
800 | ? slang->sl_compstartflags |
801 | : slang->sl_compallflags, |
802 | ((unsigned)flags >> 24))) |
803 | continue; |
804 | |
805 | // If there is a match with a CHECKCOMPOUNDPATTERN rule |
806 | // discard the compound word. |
807 | if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat)) |
808 | continue; |
809 | |
810 | if (mode == FIND_COMPOUND) { |
811 | int capflags; |
812 | |
813 | // Need to check the caps type of the appended compound |
814 | // word. |
815 | if (has_mbyte && STRNCMP(ptr, mip->mi_word, |
816 | mip->mi_compoff) != 0) { |
817 | // case folding may have changed the length |
818 | p = mip->mi_word; |
819 | for (char_u *s = ptr; s < ptr + mip->mi_compoff; MB_PTR_ADV(s)) { |
820 | MB_PTR_ADV(p); |
821 | } |
822 | } else { |
823 | p = mip->mi_word + mip->mi_compoff; |
824 | } |
825 | capflags = captype(p, mip->mi_word + wlen); |
826 | if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP |
827 | && (flags & WF_FIXCAP) != 0)) |
828 | continue; |
829 | |
830 | if (capflags != WF_ALLCAP) { |
831 | // When the character before the word is a word |
832 | // character we do not accept a Onecap word. We do |
833 | // accept a no-caps word, even when the dictionary |
834 | // word specifies ONECAP. |
835 | MB_PTR_BACK(mip->mi_word, p); |
836 | if (spell_iswordp_nmw(p, mip->mi_win) |
837 | ? capflags == WF_ONECAP |
838 | : (flags & WF_ONECAP) != 0 |
839 | && capflags != WF_ONECAP) { |
840 | continue; |
841 | } |
842 | } |
843 | } |
844 | |
845 | // If the word ends the sequence of compound flags of the |
846 | // words must match with one of the COMPOUNDRULE items and |
847 | // the number of syllables must not be too large. |
848 | mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24); |
849 | mip->mi_compflags[mip->mi_complen + 1] = NUL; |
850 | if (word_ends) { |
851 | char_u fword[MAXWLEN]; |
852 | |
853 | if (slang->sl_compsylmax < MAXWLEN) { |
854 | // "fword" is only needed for checking syllables. |
855 | if (ptr == mip->mi_word) |
856 | (void)spell_casefold(ptr, wlen, fword, MAXWLEN); |
857 | else |
858 | STRLCPY(fword, ptr, endlen[endidxcnt] + 1); |
859 | } |
860 | if (!can_compound(slang, fword, mip->mi_compflags)) |
861 | continue; |
862 | } else if (slang->sl_comprules != NULL |
863 | && !match_compoundrule(slang, mip->mi_compflags)) |
864 | // The compound flags collected so far do not match any |
865 | // COMPOUNDRULE, discard the compounded word. |
866 | continue; |
867 | } |
868 | // Check NEEDCOMPOUND: can't use word without compounding. |
869 | else if (flags & WF_NEEDCOMP) |
870 | continue; |
871 | |
872 | int nobreak_result = SP_OK; |
873 | |
874 | if (!word_ends) { |
875 | int save_result = mip->mi_result; |
876 | char_u *save_end = mip->mi_end; |
877 | langp_T *save_lp = mip->mi_lp; |
878 | |
879 | // Check that a valid word follows. If there is one and we |
880 | // are compounding, it will set "mi_result", thus we are |
881 | // always finished here. For NOBREAK we only check that a |
882 | // valid word follows. |
883 | // Recursive! |
884 | if (slang->sl_nobreak) |
885 | mip->mi_result = SP_BAD; |
886 | |
887 | // Find following word in case-folded tree. |
888 | mip->mi_compoff = endlen[endidxcnt]; |
889 | if (has_mbyte && mode == FIND_KEEPWORD) { |
890 | // Compute byte length in case-folded word from "wlen": |
891 | // byte length in keep-case word. Length may change when |
892 | // folding case. This can be slow, take a shortcut when |
893 | // the case-folded word is equal to the keep-case word. |
894 | p = mip->mi_fword; |
895 | if (STRNCMP(ptr, p, wlen) != 0) { |
896 | for (char_u *s = ptr; s < ptr + wlen; MB_PTR_ADV(s)) { |
897 | MB_PTR_ADV(p); |
898 | } |
899 | mip->mi_compoff = (int)(p - mip->mi_fword); |
900 | } |
901 | } |
902 | #if 0 |
903 | c = mip->mi_compoff; |
904 | #endif |
905 | ++mip->mi_complen; |
906 | if (flags & WF_COMPROOT) |
907 | ++mip->mi_compextra; |
908 | |
909 | // For NOBREAK we need to try all NOBREAK languages, at least |
910 | // to find the ".add" file(s). |
911 | for (int lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi) { |
912 | if (slang->sl_nobreak) { |
913 | mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi); |
914 | if (mip->mi_lp->lp_slang->sl_fidxs == NULL |
915 | || !mip->mi_lp->lp_slang->sl_nobreak) |
916 | continue; |
917 | } |
918 | |
919 | find_word(mip, FIND_COMPOUND); |
920 | |
921 | // When NOBREAK any word that matches is OK. Otherwise we |
922 | // need to find the longest match, thus try with keep-case |
923 | // and prefix too. |
924 | if (!slang->sl_nobreak || mip->mi_result == SP_BAD) { |
925 | // Find following word in keep-case tree. |
926 | mip->mi_compoff = wlen; |
927 | find_word(mip, FIND_KEEPCOMPOUND); |
928 | |
929 | #if 0 // Disabled, a prefix must not appear halfway through a compound |
930 | // word, unless the COMPOUNDPERMITFLAG is used, in which case it |
931 | // can't be a postponed prefix. |
932 | if (!slang->sl_nobreak || mip->mi_result == SP_BAD) { |
933 | // Check for following word with prefix. |
934 | mip->mi_compoff = c; |
935 | find_prefix(mip, FIND_COMPOUND); |
936 | } |
937 | #endif |
938 | } |
939 | |
940 | if (!slang->sl_nobreak) |
941 | break; |
942 | } |
943 | --mip->mi_complen; |
944 | if (flags & WF_COMPROOT) |
945 | --mip->mi_compextra; |
946 | mip->mi_lp = save_lp; |
947 | |
948 | if (slang->sl_nobreak) { |
949 | nobreak_result = mip->mi_result; |
950 | mip->mi_result = save_result; |
951 | mip->mi_end = save_end; |
952 | } else { |
953 | if (mip->mi_result == SP_OK) |
954 | break; |
955 | continue; |
956 | } |
957 | } |
958 | |
959 | int res = SP_BAD; |
960 | if (flags & WF_BANNED) |
961 | res = SP_BANNED; |
962 | else if (flags & WF_REGION) { |
963 | // Check region. |
964 | if ((mip->mi_lp->lp_region & (flags >> 16)) != 0) |
965 | res = SP_OK; |
966 | else |
967 | res = SP_LOCAL; |
968 | } else if (flags & WF_RARE) |
969 | res = SP_RARE; |
970 | else |
971 | res = SP_OK; |
972 | |
973 | // Always use the longest match and the best result. For NOBREAK |
974 | // we separately keep the longest match without a following good |
975 | // word as a fall-back. |
976 | if (nobreak_result == SP_BAD) { |
977 | if (mip->mi_result2 > res) { |
978 | mip->mi_result2 = res; |
979 | mip->mi_end2 = mip->mi_word + wlen; |
980 | } else if (mip->mi_result2 == res |
981 | && mip->mi_end2 < mip->mi_word + wlen) |
982 | mip->mi_end2 = mip->mi_word + wlen; |
983 | } else if (mip->mi_result > res) { |
984 | mip->mi_result = res; |
985 | mip->mi_end = mip->mi_word + wlen; |
986 | } else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen) |
987 | mip->mi_end = mip->mi_word + wlen; |
988 | |
989 | if (mip->mi_result == SP_OK) |
990 | break; |
991 | } |
992 | |
993 | if (mip->mi_result == SP_OK) |
994 | break; |
995 | } |
996 | } |
997 | |
998 | // Returns true if there is a match between the word ptr[wlen] and |
999 | // CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another |
1000 | // word. |
1001 | // A match means that the first part of CHECKCOMPOUNDPATTERN matches at the |
1002 | // end of ptr[wlen] and the second part matches after it. |
1003 | static bool |
1004 | match_checkcompoundpattern ( |
1005 | char_u *ptr, |
1006 | int wlen, |
1007 | garray_T *gap // &sl_comppat |
1008 | ) |
1009 | { |
1010 | char_u *p; |
1011 | int len; |
1012 | |
1013 | for (int i = 0; i + 1 < gap->ga_len; i += 2) { |
1014 | p = ((char_u **)gap->ga_data)[i + 1]; |
1015 | if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0) { |
1016 | // Second part matches at start of following compound word, now |
1017 | // check if first part matches at end of previous word. |
1018 | p = ((char_u **)gap->ga_data)[i]; |
1019 | len = (int)STRLEN(p); |
1020 | if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0) |
1021 | return true; |
1022 | } |
1023 | } |
1024 | return false; |
1025 | } |
1026 | |
1027 | // Returns true if "flags" is a valid sequence of compound flags and "word" |
1028 | // does not have too many syllables. |
1029 | static bool can_compound(slang_T *slang, char_u *word, char_u *flags) |
1030 | { |
1031 | char_u uflags[MAXWLEN * 2]; |
1032 | int i; |
1033 | char_u *p; |
1034 | |
1035 | if (slang->sl_compprog == NULL) |
1036 | return false; |
1037 | if (enc_utf8) { |
1038 | // Need to convert the single byte flags to utf8 characters. |
1039 | p = uflags; |
1040 | for (i = 0; flags[i] != NUL; i++) { |
1041 | p += utf_char2bytes(flags[i], p); |
1042 | } |
1043 | *p = NUL; |
1044 | p = uflags; |
1045 | } else |
1046 | p = flags; |
1047 | if (!vim_regexec_prog(&slang->sl_compprog, false, p, 0)) |
1048 | return false; |
1049 | |
1050 | // Count the number of syllables. This may be slow, do it last. If there |
1051 | // are too many syllables AND the number of compound words is above |
1052 | // COMPOUNDWORDMAX then compounding is not allowed. |
1053 | if (slang->sl_compsylmax < MAXWLEN |
1054 | && count_syllables(slang, word) > slang->sl_compsylmax) |
1055 | return (int)STRLEN(flags) < slang->sl_compmax; |
1056 | return true; |
1057 | } |
1058 | |
1059 | // Returns true when the sequence of flags in "compflags" plus "flag" can |
1060 | // possibly form a valid compounded word. This also checks the COMPOUNDRULE |
1061 | // lines if they don't contain wildcards. |
1062 | static bool can_be_compound(trystate_T *sp, slang_T *slang, char_u *compflags, int flag) |
1063 | { |
1064 | // If the flag doesn't appear in sl_compstartflags or sl_compallflags |
1065 | // then it can't possibly compound. |
1066 | if (!byte_in_str(sp->ts_complen == sp->ts_compsplit |
1067 | ? slang->sl_compstartflags : slang->sl_compallflags, flag)) |
1068 | return false; |
1069 | |
1070 | // If there are no wildcards, we can check if the flags collected so far |
1071 | // possibly can form a match with COMPOUNDRULE patterns. This only |
1072 | // makes sense when we have two or more words. |
1073 | if (slang->sl_comprules != NULL && sp->ts_complen > sp->ts_compsplit) { |
1074 | compflags[sp->ts_complen] = flag; |
1075 | compflags[sp->ts_complen + 1] = NUL; |
1076 | bool v = match_compoundrule(slang, compflags + sp->ts_compsplit); |
1077 | compflags[sp->ts_complen] = NUL; |
1078 | return v; |
1079 | } |
1080 | |
1081 | return true; |
1082 | } |
1083 | |
1084 | // Returns true if the compound flags in compflags[] match the start of any |
1085 | // compound rule. This is used to stop trying a compound if the flags |
1086 | // collected so far can't possibly match any compound rule. |
1087 | // Caller must check that slang->sl_comprules is not NULL. |
1088 | static bool match_compoundrule(slang_T *slang, char_u *compflags) |
1089 | { |
1090 | char_u *p; |
1091 | int i; |
1092 | int c; |
1093 | |
1094 | // loop over all the COMPOUNDRULE entries |
1095 | for (p = slang->sl_comprules; *p != NUL; ++p) { |
1096 | // loop over the flags in the compound word we have made, match |
1097 | // them against the current rule entry |
1098 | for (i = 0;; ++i) { |
1099 | c = compflags[i]; |
1100 | if (c == NUL) |
1101 | // found a rule that matches for the flags we have so far |
1102 | return true; |
1103 | if (*p == '/' || *p == NUL) |
1104 | break; // end of rule, it's too short |
1105 | if (*p == '[') { |
1106 | bool match = false; |
1107 | |
1108 | // compare against all the flags in [] |
1109 | ++p; |
1110 | while (*p != ']' && *p != NUL) |
1111 | if (*p++ == c) |
1112 | match = true; |
1113 | if (!match) |
1114 | break; // none matches |
1115 | } else if (*p != c) |
1116 | break; // flag of word doesn't match flag in pattern |
1117 | ++p; |
1118 | } |
1119 | |
1120 | // Skip to the next "/", where the next pattern starts. |
1121 | p = vim_strchr(p, '/'); |
1122 | if (p == NULL) |
1123 | break; |
1124 | } |
1125 | |
1126 | // Checked all the rules and none of them match the flags, so there |
1127 | // can't possibly be a compound starting with these flags. |
1128 | return false; |
1129 | } |
1130 | |
1131 | // Return non-zero if the prefix indicated by "arridx" matches with the prefix |
1132 | // ID in "flags" for the word "word". |
1133 | // The WF_RAREPFX flag is included in the return value for a rare prefix. |
1134 | static int |
1135 | valid_word_prefix ( |
1136 | int totprefcnt, // nr of prefix IDs |
1137 | int arridx, // idx in sl_pidxs[] |
1138 | int flags, |
1139 | char_u *word, |
1140 | slang_T *slang, |
1141 | bool cond_req // only use prefixes with a condition |
1142 | ) |
1143 | { |
1144 | int prefcnt; |
1145 | int pidx; |
1146 | int prefid; |
1147 | |
1148 | prefid = (unsigned)flags >> 24; |
1149 | for (prefcnt = totprefcnt - 1; prefcnt >= 0; --prefcnt) { |
1150 | pidx = slang->sl_pidxs[arridx + prefcnt]; |
1151 | |
1152 | // Check the prefix ID. |
1153 | if (prefid != (pidx & 0xff)) |
1154 | continue; |
1155 | |
1156 | // Check if the prefix doesn't combine and the word already has a |
1157 | // suffix. |
1158 | if ((flags & WF_HAS_AFF) && (pidx & WF_PFX_NC)) |
1159 | continue; |
1160 | |
1161 | // Check the condition, if there is one. The condition index is |
1162 | // stored in the two bytes above the prefix ID byte. |
1163 | regprog_T **rp = &slang->sl_prefprog[((unsigned)pidx >> 8) & 0xffff]; |
1164 | if (*rp != NULL) { |
1165 | if (!vim_regexec_prog(rp, false, word, 0)) { |
1166 | continue; |
1167 | } |
1168 | } else if (cond_req) |
1169 | continue; |
1170 | |
1171 | // It's a match! Return the WF_ flags. |
1172 | return pidx; |
1173 | } |
1174 | return 0; |
1175 | } |
1176 | |
1177 | // Check if the word at "mip->mi_word" has a matching prefix. |
1178 | // If it does, then check the following word. |
1179 | // |
1180 | // If "mode" is "FIND_COMPOUND" then do the same after another word, find a |
1181 | // prefix in a compound word. |
1182 | // |
1183 | // For a match mip->mi_result is updated. |
1184 | static void find_prefix(matchinf_T *mip, int mode) |
1185 | { |
1186 | idx_T arridx = 0; |
1187 | int len; |
1188 | int wlen = 0; |
1189 | int flen; |
1190 | int c; |
1191 | char_u *ptr; |
1192 | idx_T lo, hi, m; |
1193 | slang_T *slang = mip->mi_lp->lp_slang; |
1194 | char_u *byts; |
1195 | idx_T *idxs; |
1196 | |
1197 | byts = slang->sl_pbyts; |
1198 | if (byts == NULL) |
1199 | return; // array is empty |
1200 | |
1201 | // We use the case-folded word here, since prefixes are always |
1202 | // case-folded. |
1203 | ptr = mip->mi_fword; |
1204 | flen = mip->mi_fwordlen; // available case-folded bytes |
1205 | if (mode == FIND_COMPOUND) { |
1206 | // Skip over the previously found word(s). |
1207 | ptr += mip->mi_compoff; |
1208 | flen -= mip->mi_compoff; |
1209 | } |
1210 | idxs = slang->sl_pidxs; |
1211 | |
1212 | // Repeat advancing in the tree until: |
1213 | // - there is a byte that doesn't match, |
1214 | // - we reach the end of the tree, |
1215 | // - or we reach the end of the line. |
1216 | for (;; ) { |
1217 | if (flen == 0 && *mip->mi_fend != NUL) |
1218 | flen = fold_more(mip); |
1219 | |
1220 | len = byts[arridx++]; |
1221 | |
1222 | // If the first possible byte is a zero the prefix could end here. |
1223 | // Check if the following word matches and supports the prefix. |
1224 | if (byts[arridx] == 0) { |
1225 | // There can be several prefixes with different conditions. We |
1226 | // try them all, since we don't know which one will give the |
1227 | // longest match. The word is the same each time, pass the list |
1228 | // of possible prefixes to find_word(). |
1229 | mip->mi_prefarridx = arridx; |
1230 | mip->mi_prefcnt = len; |
1231 | while (len > 0 && byts[arridx] == 0) { |
1232 | ++arridx; |
1233 | --len; |
1234 | } |
1235 | mip->mi_prefcnt -= len; |
1236 | |
1237 | // Find the word that comes after the prefix. |
1238 | mip->mi_prefixlen = wlen; |
1239 | if (mode == FIND_COMPOUND) |
1240 | // Skip over the previously found word(s). |
1241 | mip->mi_prefixlen += mip->mi_compoff; |
1242 | |
1243 | if (has_mbyte) { |
1244 | // Case-folded length may differ from original length. |
1245 | mip->mi_cprefixlen = nofold_len(mip->mi_fword, |
1246 | mip->mi_prefixlen, mip->mi_word); |
1247 | } else |
1248 | mip->mi_cprefixlen = mip->mi_prefixlen; |
1249 | find_word(mip, FIND_PREFIX); |
1250 | |
1251 | |
1252 | if (len == 0) |
1253 | break; // no children, word must end here |
1254 | } |
1255 | |
1256 | // Stop looking at end of the line. |
1257 | if (ptr[wlen] == NUL) |
1258 | break; |
1259 | |
1260 | // Perform a binary search in the list of accepted bytes. |
1261 | c = ptr[wlen]; |
1262 | lo = arridx; |
1263 | hi = arridx + len - 1; |
1264 | while (lo < hi) { |
1265 | m = (lo + hi) / 2; |
1266 | if (byts[m] > c) |
1267 | hi = m - 1; |
1268 | else if (byts[m] < c) |
1269 | lo = m + 1; |
1270 | else { |
1271 | lo = hi = m; |
1272 | break; |
1273 | } |
1274 | } |
1275 | |
1276 | // Stop if there is no matching byte. |
1277 | if (hi < lo || byts[lo] != c) |
1278 | break; |
1279 | |
1280 | // Continue at the child (if there is one). |
1281 | arridx = idxs[lo]; |
1282 | ++wlen; |
1283 | --flen; |
1284 | } |
1285 | } |
1286 | |
1287 | // Need to fold at least one more character. Do until next non-word character |
1288 | // for efficiency. Include the non-word character too. |
1289 | // Return the length of the folded chars in bytes. |
1290 | static int fold_more(matchinf_T *mip) |
1291 | { |
1292 | int flen; |
1293 | char_u *p; |
1294 | |
1295 | p = mip->mi_fend; |
1296 | do { |
1297 | MB_PTR_ADV(mip->mi_fend); |
1298 | } while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win)); |
1299 | |
1300 | // Include the non-word character so that we can check for the word end. |
1301 | if (*mip->mi_fend != NUL) { |
1302 | MB_PTR_ADV(mip->mi_fend); |
1303 | } |
1304 | |
1305 | (void)spell_casefold(p, (int)(mip->mi_fend - p), |
1306 | mip->mi_fword + mip->mi_fwordlen, |
1307 | MAXWLEN - mip->mi_fwordlen); |
1308 | flen = (int)STRLEN(mip->mi_fword + mip->mi_fwordlen); |
1309 | mip->mi_fwordlen += flen; |
1310 | return flen; |
1311 | } |
1312 | |
1313 | /// Checks case flags for a word. Returns true, if the word has the requested |
1314 | /// case. |
1315 | /// |
1316 | /// @param wordflags Flags for the checked word. |
1317 | /// @param treeflags Flags for the word in the spell tree. |
1318 | static bool spell_valid_case(int wordflags, int treeflags) |
1319 | { |
1320 | return (wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0) |
1321 | || ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0 |
1322 | && ((treeflags & WF_ONECAP) == 0 |
1323 | || (wordflags & WF_ONECAP) != 0)); |
1324 | } |
1325 | |
1326 | // Returns true if spell checking is not enabled. |
1327 | static bool no_spell_checking(win_T *wp) |
1328 | { |
1329 | if (!wp->w_p_spell || *wp->w_s->b_p_spl == NUL |
1330 | || GA_EMPTY(&wp->w_s->b_langp)) { |
1331 | EMSG(_("E756: Spell checking is not enabled" )); |
1332 | return true; |
1333 | } |
1334 | return false; |
1335 | } |
1336 | |
1337 | // Moves to the next spell error. |
1338 | // "curline" is false for "[s", "]s", "[S" and "]S". |
1339 | // "curline" is true to find word under/after cursor in the same line. |
1340 | // For Insert mode completion "dir" is BACKWARD and "curline" is true: move |
1341 | // to after badly spelled word before the cursor. |
1342 | // Return 0 if not found, length of the badly spelled word otherwise. |
1343 | size_t |
1344 | spell_move_to ( |
1345 | win_T *wp, |
1346 | int dir, // FORWARD or BACKWARD |
1347 | bool allwords, // true for "[s"/"]s", false for "[S"/"]S" |
1348 | bool curline, |
1349 | hlf_T *attrp // return: attributes of bad word or NULL |
1350 | // (only when "dir" is FORWARD) |
1351 | ) |
1352 | { |
1353 | linenr_T lnum; |
1354 | pos_T found_pos; |
1355 | size_t found_len = 0; |
1356 | char_u *line; |
1357 | char_u *p; |
1358 | char_u *endp; |
1359 | hlf_T attr = HLF_COUNT; |
1360 | size_t len; |
1361 | int has_syntax = syntax_present(wp); |
1362 | int col; |
1363 | bool can_spell; |
1364 | char_u *buf = NULL; |
1365 | size_t buflen = 0; |
1366 | int skip = 0; |
1367 | int capcol = -1; |
1368 | bool found_one = false; |
1369 | bool wrapped = false; |
1370 | |
1371 | if (no_spell_checking(wp)) |
1372 | return 0; |
1373 | |
1374 | // Start looking for bad word at the start of the line, because we can't |
1375 | // start halfway through a word, we don't know where it starts or ends. |
1376 | // |
1377 | // When searching backwards, we continue in the line to find the last |
1378 | // bad word (in the cursor line: before the cursor). |
1379 | // |
1380 | // We concatenate the start of the next line, so that wrapped words work |
1381 | // (e.g. "et<line-break>cetera"). Doesn't work when searching backwards |
1382 | // though... |
1383 | lnum = wp->w_cursor.lnum; |
1384 | clearpos(&found_pos); |
1385 | |
1386 | while (!got_int) { |
1387 | line = ml_get_buf(wp->w_buffer, lnum, FALSE); |
1388 | |
1389 | len = STRLEN(line); |
1390 | if (buflen < len + MAXWLEN + 2) { |
1391 | xfree(buf); |
1392 | buflen = len + MAXWLEN + 2; |
1393 | buf = xmalloc(buflen); |
1394 | } |
1395 | assert(buf && buflen >= len + MAXWLEN + 2); |
1396 | |
1397 | // In first line check first word for Capital. |
1398 | if (lnum == 1) |
1399 | capcol = 0; |
1400 | |
1401 | // For checking first word with a capital skip white space. |
1402 | if (capcol == 0) { |
1403 | capcol = (int)getwhitecols(line); |
1404 | } else if (curline && wp == curwin) { |
1405 | // For spellbadword(): check if first word needs a capital. |
1406 | col = (int)getwhitecols(line); |
1407 | if (check_need_cap(lnum, col)) { |
1408 | capcol = col; |
1409 | } |
1410 | |
1411 | // Need to get the line again, may have looked at the previous |
1412 | // one. |
1413 | line = ml_get_buf(wp->w_buffer, lnum, FALSE); |
1414 | } |
1415 | |
1416 | // Copy the line into "buf" and append the start of the next line if |
1417 | // possible. |
1418 | STRCPY(buf, line); |
1419 | if (lnum < wp->w_buffer->b_ml.ml_line_count) |
1420 | spell_cat_line(buf + STRLEN(buf), |
1421 | ml_get_buf(wp->w_buffer, lnum + 1, FALSE), |
1422 | MAXWLEN); |
1423 | p = buf + skip; |
1424 | endp = buf + len; |
1425 | while (p < endp) { |
1426 | // When searching backward don't search after the cursor. Unless |
1427 | // we wrapped around the end of the buffer. |
1428 | if (dir == BACKWARD |
1429 | && lnum == wp->w_cursor.lnum |
1430 | && !wrapped |
1431 | && (colnr_T)(p - buf) >= wp->w_cursor.col) |
1432 | break; |
1433 | |
1434 | // start of word |
1435 | attr = HLF_COUNT; |
1436 | len = spell_check(wp, p, &attr, &capcol, false); |
1437 | |
1438 | if (attr != HLF_COUNT) { |
1439 | // We found a bad word. Check the attribute. |
1440 | if (allwords || attr == HLF_SPB) { |
1441 | // When searching forward only accept a bad word after |
1442 | // the cursor. |
1443 | if (dir == BACKWARD |
1444 | || lnum != wp->w_cursor.lnum |
1445 | || wrapped |
1446 | || ((colnr_T)(curline |
1447 | ? p - buf + (ptrdiff_t)len |
1448 | : p - buf) > wp->w_cursor.col)) { |
1449 | if (has_syntax) { |
1450 | col = (int)(p - buf); |
1451 | (void)syn_get_id(wp, lnum, (colnr_T)col, |
1452 | FALSE, &can_spell, FALSE); |
1453 | if (!can_spell) |
1454 | attr = HLF_COUNT; |
1455 | } else |
1456 | can_spell = true; |
1457 | |
1458 | if (can_spell) { |
1459 | found_one = true; |
1460 | found_pos.lnum = lnum; |
1461 | found_pos.col = (int)(p - buf); |
1462 | found_pos.coladd = 0; |
1463 | if (dir == FORWARD) { |
1464 | // No need to search further. |
1465 | wp->w_cursor = found_pos; |
1466 | xfree(buf); |
1467 | if (attrp != NULL) |
1468 | *attrp = attr; |
1469 | return len; |
1470 | } else if (curline) { |
1471 | // Insert mode completion: put cursor after |
1472 | // the bad word. |
1473 | assert(len <= INT_MAX); |
1474 | found_pos.col += (int)len; |
1475 | } |
1476 | found_len = len; |
1477 | } |
1478 | } else |
1479 | found_one = true; |
1480 | } |
1481 | } |
1482 | |
1483 | // advance to character after the word |
1484 | p += len; |
1485 | assert(len <= INT_MAX); |
1486 | capcol -= (int)len; |
1487 | } |
1488 | |
1489 | if (dir == BACKWARD && found_pos.lnum != 0) { |
1490 | // Use the last match in the line (before the cursor). |
1491 | wp->w_cursor = found_pos; |
1492 | xfree(buf); |
1493 | return found_len; |
1494 | } |
1495 | |
1496 | if (curline) { |
1497 | break; // only check cursor line |
1498 | } |
1499 | |
1500 | // If we are back at the starting line and searched it again there |
1501 | // is no match, give up. |
1502 | if (lnum == wp->w_cursor.lnum && wrapped) { |
1503 | break; |
1504 | } |
1505 | |
1506 | // Advance to next line. |
1507 | if (dir == BACKWARD) { |
1508 | if (lnum > 1) { |
1509 | lnum--; |
1510 | } else if (!p_ws) { |
1511 | break; // at first line and 'nowrapscan' |
1512 | } else { |
1513 | // Wrap around to the end of the buffer. May search the |
1514 | // starting line again and accept the last match. |
1515 | lnum = wp->w_buffer->b_ml.ml_line_count; |
1516 | wrapped = true; |
1517 | if (!shortmess(SHM_SEARCH)) |
1518 | give_warning((char_u *)_(top_bot_msg), true); |
1519 | } |
1520 | capcol = -1; |
1521 | } else { |
1522 | if (lnum < wp->w_buffer->b_ml.ml_line_count) |
1523 | ++lnum; |
1524 | else if (!p_ws) |
1525 | break; // at first line and 'nowrapscan' |
1526 | else { |
1527 | // Wrap around to the start of the buffer. May search the |
1528 | // starting line again and accept the first match. |
1529 | lnum = 1; |
1530 | wrapped = true; |
1531 | if (!shortmess(SHM_SEARCH)) |
1532 | give_warning((char_u *)_(bot_top_msg), true); |
1533 | } |
1534 | |
1535 | // If we are back at the starting line and there is no match then |
1536 | // give up. |
1537 | if (lnum == wp->w_cursor.lnum && !found_one) { |
1538 | break; |
1539 | } |
1540 | |
1541 | // Skip the characters at the start of the next line that were |
1542 | // included in a match crossing line boundaries. |
1543 | if (attr == HLF_COUNT) |
1544 | skip = (int)(p - endp); |
1545 | else |
1546 | skip = 0; |
1547 | |
1548 | // Capcol skips over the inserted space. |
1549 | --capcol; |
1550 | |
1551 | // But after empty line check first word in next line |
1552 | if (*skipwhite(line) == NUL) |
1553 | capcol = 0; |
1554 | } |
1555 | |
1556 | line_breakcheck(); |
1557 | } |
1558 | |
1559 | xfree(buf); |
1560 | return 0; |
1561 | } |
1562 | |
1563 | // For spell checking: concatenate the start of the following line "line" into |
1564 | // "buf", blanking-out special characters. Copy less then "maxlen" bytes. |
1565 | // Keep the blanks at the start of the next line, this is used in win_line() |
1566 | // to skip those bytes if the word was OK. |
1567 | void spell_cat_line(char_u *buf, char_u *line, int maxlen) |
1568 | { |
1569 | char_u *p; |
1570 | int n; |
1571 | |
1572 | p = skipwhite(line); |
1573 | while (vim_strchr((char_u *)"*#/\"\t" , *p) != NULL) |
1574 | p = skipwhite(p + 1); |
1575 | |
1576 | if (*p != NUL) { |
1577 | // Only worth concatenating if there is something else than spaces to |
1578 | // concatenate. |
1579 | n = (int)(p - line) + 1; |
1580 | if (n < maxlen - 1) { |
1581 | memset(buf, ' ', n); |
1582 | STRLCPY(buf + n, p, maxlen - n); |
1583 | } |
1584 | } |
1585 | } |
1586 | |
1587 | // Load word list(s) for "lang" from Vim spell file(s). |
1588 | // "lang" must be the language without the region: e.g., "en". |
1589 | static void spell_load_lang(char_u *lang) |
1590 | { |
1591 | char_u fname_enc[85]; |
1592 | int r; |
1593 | spelload_T sl; |
1594 | int round; |
1595 | |
1596 | // Copy the language name to pass it to spell_load_cb() as a cookie. |
1597 | // It's truncated when an error is detected. |
1598 | STRCPY(sl.sl_lang, lang); |
1599 | sl.sl_slang = NULL; |
1600 | sl.sl_nobreak = false; |
1601 | |
1602 | // We may retry when no spell file is found for the language, an |
1603 | // autocommand may load it then. |
1604 | for (round = 1; round <= 2; ++round) { |
1605 | // Find the first spell file for "lang" in 'runtimepath' and load it. |
1606 | vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, |
1607 | "spell/%s.%s.spl" , lang, spell_enc()); |
1608 | r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); |
1609 | |
1610 | if (r == FAIL && *sl.sl_lang != NUL) { |
1611 | // Try loading the ASCII version. |
1612 | vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5, |
1613 | "spell/%s.ascii.spl" , lang); |
1614 | r = do_in_runtimepath(fname_enc, 0, spell_load_cb, &sl); |
1615 | |
1616 | if (r == FAIL && *sl.sl_lang != NUL && round == 1 |
1617 | && apply_autocmds(EVENT_SPELLFILEMISSING, lang, |
1618 | curbuf->b_fname, FALSE, curbuf)) |
1619 | continue; |
1620 | break; |
1621 | } |
1622 | break; |
1623 | } |
1624 | |
1625 | if (r == FAIL) { |
1626 | if (starting) { |
1627 | // Prompt the user at VimEnter if spell files are missing. #3027 |
1628 | // Plugins aren't loaded yet, so spellfile.vim cannot handle this case. |
1629 | char autocmd_buf[512] = { 0 }; |
1630 | snprintf(autocmd_buf, sizeof(autocmd_buf), |
1631 | "autocmd VimEnter * call spellfile#LoadFile('%s')|set spell" , |
1632 | lang); |
1633 | do_cmdline_cmd(autocmd_buf); |
1634 | } else { |
1635 | smsg( |
1636 | _("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\"" ), |
1637 | lang, spell_enc(), lang); |
1638 | } |
1639 | } else if (sl.sl_slang != NULL) { |
1640 | // At least one file was loaded, now load ALL the additions. |
1641 | STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl" ); |
1642 | do_in_runtimepath(fname_enc, DIP_ALL, spell_load_cb, &sl); |
1643 | } |
1644 | } |
1645 | |
1646 | // Return the encoding used for spell checking: Use 'encoding', except that we |
1647 | // use "latin1" for "latin9". And limit to 60 characters (just in case). |
1648 | char_u *spell_enc(void) |
1649 | { |
1650 | |
1651 | if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15" ) != 0) |
1652 | return p_enc; |
1653 | return (char_u *)"latin1" ; |
1654 | } |
1655 | |
1656 | // Get the name of the .spl file for the internal wordlist into |
1657 | // "fname[MAXPATHL]". |
1658 | static void int_wordlist_spl(char_u *fname) |
1659 | { |
1660 | vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL, |
1661 | int_wordlist, spell_enc()); |
1662 | } |
1663 | |
1664 | // Allocate a new slang_T for language "lang". "lang" can be NULL. |
1665 | // Caller must fill "sl_next". |
1666 | slang_T *slang_alloc(char_u *lang) |
1667 | { |
1668 | slang_T *lp = xcalloc(1, sizeof(slang_T)); |
1669 | |
1670 | if (lang != NULL) |
1671 | lp->sl_name = vim_strsave(lang); |
1672 | ga_init(&lp->sl_rep, sizeof(fromto_T), 10); |
1673 | ga_init(&lp->sl_repsal, sizeof(fromto_T), 10); |
1674 | lp->sl_compmax = MAXWLEN; |
1675 | lp->sl_compsylmax = MAXWLEN; |
1676 | hash_init(&lp->sl_wordcount); |
1677 | |
1678 | return lp; |
1679 | } |
1680 | |
1681 | // Free the contents of an slang_T and the structure itself. |
1682 | void slang_free(slang_T *lp) |
1683 | { |
1684 | xfree(lp->sl_name); |
1685 | xfree(lp->sl_fname); |
1686 | slang_clear(lp); |
1687 | xfree(lp); |
1688 | } |
1689 | |
1690 | /// Frees a salitem_T |
1691 | static void free_salitem(salitem_T *smp) { |
1692 | xfree(smp->sm_lead); |
1693 | // Don't free sm_oneof and sm_rules, they point into sm_lead. |
1694 | xfree(smp->sm_to); |
1695 | xfree(smp->sm_lead_w); |
1696 | xfree(smp->sm_oneof_w); |
1697 | xfree(smp->sm_to_w); |
1698 | } |
1699 | |
1700 | /// Frees a fromto_T |
1701 | static void free_fromto(fromto_T *ftp) { |
1702 | xfree(ftp->ft_from); |
1703 | xfree(ftp->ft_to); |
1704 | } |
1705 | |
1706 | // Clear an slang_T so that the file can be reloaded. |
1707 | void slang_clear(slang_T *lp) |
1708 | { |
1709 | garray_T *gap; |
1710 | |
1711 | XFREE_CLEAR(lp->sl_fbyts); |
1712 | XFREE_CLEAR(lp->sl_kbyts); |
1713 | XFREE_CLEAR(lp->sl_pbyts); |
1714 | |
1715 | XFREE_CLEAR(lp->sl_fidxs); |
1716 | XFREE_CLEAR(lp->sl_kidxs); |
1717 | XFREE_CLEAR(lp->sl_pidxs); |
1718 | |
1719 | GA_DEEP_CLEAR(&lp->sl_rep, fromto_T, free_fromto); |
1720 | GA_DEEP_CLEAR(&lp->sl_repsal, fromto_T, free_fromto); |
1721 | |
1722 | gap = &lp->sl_sal; |
1723 | if (lp->sl_sofo) { |
1724 | // "ga_len" is set to 1 without adding an item for latin1 |
1725 | GA_DEEP_CLEAR_PTR(gap); |
1726 | } else { |
1727 | // SAL items: free salitem_T items |
1728 | GA_DEEP_CLEAR(gap, salitem_T, free_salitem); |
1729 | } |
1730 | |
1731 | for (int i = 0; i < lp->sl_prefixcnt; ++i) { |
1732 | vim_regfree(lp->sl_prefprog[i]); |
1733 | } |
1734 | lp->sl_prefixcnt = 0; |
1735 | XFREE_CLEAR(lp->sl_prefprog); |
1736 | XFREE_CLEAR(lp->sl_info); |
1737 | XFREE_CLEAR(lp->sl_midword); |
1738 | |
1739 | vim_regfree(lp->sl_compprog); |
1740 | lp->sl_compprog = NULL; |
1741 | XFREE_CLEAR(lp->sl_comprules); |
1742 | XFREE_CLEAR(lp->sl_compstartflags); |
1743 | XFREE_CLEAR(lp->sl_compallflags); |
1744 | |
1745 | XFREE_CLEAR(lp->sl_syllable); |
1746 | ga_clear(&lp->sl_syl_items); |
1747 | |
1748 | ga_clear_strings(&lp->sl_comppat); |
1749 | |
1750 | hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF); |
1751 | hash_init(&lp->sl_wordcount); |
1752 | |
1753 | hash_clear_all(&lp->sl_map_hash, 0); |
1754 | |
1755 | // Clear info from .sug file. |
1756 | slang_clear_sug(lp); |
1757 | |
1758 | lp->sl_compmax = MAXWLEN; |
1759 | lp->sl_compminlen = 0; |
1760 | lp->sl_compsylmax = MAXWLEN; |
1761 | lp->sl_regions[0] = NUL; |
1762 | } |
1763 | |
1764 | // Clear the info from the .sug file in "lp". |
1765 | void slang_clear_sug(slang_T *lp) |
1766 | { |
1767 | XFREE_CLEAR(lp->sl_sbyts); |
1768 | XFREE_CLEAR(lp->sl_sidxs); |
1769 | close_spellbuf(lp->sl_sugbuf); |
1770 | lp->sl_sugbuf = NULL; |
1771 | lp->sl_sugloaded = false; |
1772 | lp->sl_sugtime = 0; |
1773 | } |
1774 | |
1775 | // Load one spell file and store the info into a slang_T. |
1776 | // Invoked through do_in_runtimepath(). |
1777 | static void spell_load_cb(char_u *fname, void *cookie) |
1778 | { |
1779 | spelload_T *slp = (spelload_T *)cookie; |
1780 | slang_T *slang; |
1781 | |
1782 | slang = spell_load_file(fname, slp->sl_lang, NULL, false); |
1783 | if (slang != NULL) { |
1784 | // When a previously loaded file has NOBREAK also use it for the |
1785 | // ".add" files. |
1786 | if (slp->sl_nobreak && slang->sl_add) |
1787 | slang->sl_nobreak = true; |
1788 | else if (slang->sl_nobreak) |
1789 | slp->sl_nobreak = true; |
1790 | |
1791 | slp->sl_slang = slang; |
1792 | } |
1793 | } |
1794 | |
1795 | /// Add a word to the hashtable of common words. |
1796 | /// If it's already there then the counter is increased. |
1797 | /// |
1798 | /// @param[in] lp |
1799 | /// @param[in] word added to common words hashtable |
1800 | /// @param[in] len length of word or -1 for NUL terminated |
1801 | /// @param[in] count 1 to count once, 10 to init |
1802 | void count_common_word(slang_T *lp, char_u *word, int len, int count) |
1803 | { |
1804 | hash_T hash; |
1805 | hashitem_T *hi; |
1806 | wordcount_T *wc; |
1807 | char_u buf[MAXWLEN]; |
1808 | char_u *p; |
1809 | |
1810 | if (len == -1) { |
1811 | p = word; |
1812 | } else if (len >= MAXWLEN) { |
1813 | return; |
1814 | } else { |
1815 | STRLCPY(buf, word, len + 1); |
1816 | p = buf; |
1817 | } |
1818 | |
1819 | hash = hash_hash(p); |
1820 | const size_t p_len = STRLEN(p); |
1821 | hi = hash_lookup(&lp->sl_wordcount, (const char *)p, p_len, hash); |
1822 | if (HASHITEM_EMPTY(hi)) { |
1823 | wc = xmalloc(sizeof(wordcount_T) + p_len); |
1824 | memcpy(wc->wc_word, p, p_len + 1); |
1825 | wc->wc_count = count; |
1826 | hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash); |
1827 | } else { |
1828 | wc = HI2WC(hi); |
1829 | if ((wc->wc_count += count) < (unsigned)count) // check for overflow |
1830 | wc->wc_count = MAXWORDCOUNT; |
1831 | } |
1832 | } |
1833 | |
1834 | // Adjust the score of common words. |
1835 | static int |
1836 | score_wordcount_adj ( |
1837 | slang_T *slang, |
1838 | int score, |
1839 | char_u *word, |
1840 | bool split // word was split, less bonus |
1841 | ) |
1842 | { |
1843 | hashitem_T *hi; |
1844 | wordcount_T *wc; |
1845 | int bonus; |
1846 | int newscore; |
1847 | |
1848 | hi = hash_find(&slang->sl_wordcount, word); |
1849 | if (!HASHITEM_EMPTY(hi)) { |
1850 | wc = HI2WC(hi); |
1851 | if (wc->wc_count < SCORE_THRES2) |
1852 | bonus = SCORE_COMMON1; |
1853 | else if (wc->wc_count < SCORE_THRES3) |
1854 | bonus = SCORE_COMMON2; |
1855 | else |
1856 | bonus = SCORE_COMMON3; |
1857 | if (split) |
1858 | newscore = score - bonus / 2; |
1859 | else |
1860 | newscore = score - bonus; |
1861 | if (newscore < 0) |
1862 | return 0; |
1863 | return newscore; |
1864 | } |
1865 | return score; |
1866 | } |
1867 | |
1868 | // Returns true if byte "n" appears in "str". |
1869 | // Like strchr() but independent of locale. |
1870 | bool byte_in_str(char_u *str, int n) |
1871 | { |
1872 | char_u *p; |
1873 | |
1874 | for (p = str; *p != NUL; ++p) |
1875 | if (*p == n) |
1876 | return true; |
1877 | return false; |
1878 | } |
1879 | |
1880 | // Truncate "slang->sl_syllable" at the first slash and put the following items |
1881 | // in "slang->sl_syl_items". |
1882 | int init_syl_tab(slang_T *slang) |
1883 | { |
1884 | char_u *p; |
1885 | char_u *s; |
1886 | int l; |
1887 | |
1888 | ga_init(&slang->sl_syl_items, sizeof(syl_item_T), 4); |
1889 | p = vim_strchr(slang->sl_syllable, '/'); |
1890 | while (p != NULL) { |
1891 | *p++ = NUL; |
1892 | if (*p == NUL) // trailing slash |
1893 | break; |
1894 | s = p; |
1895 | p = vim_strchr(p, '/'); |
1896 | if (p == NULL) |
1897 | l = (int)STRLEN(s); |
1898 | else |
1899 | l = (int)(p - s); |
1900 | if (l >= SY_MAXLEN) |
1901 | return SP_FORMERROR; |
1902 | |
1903 | syl_item_T *syl = GA_APPEND_VIA_PTR(syl_item_T, &slang->sl_syl_items); |
1904 | STRLCPY(syl->sy_chars, s, l + 1); |
1905 | syl->sy_len = l; |
1906 | } |
1907 | return OK; |
1908 | } |
1909 | |
1910 | // Count the number of syllables in "word". |
1911 | // When "word" contains spaces the syllables after the last space are counted. |
1912 | // Returns zero if syllables are not defines. |
1913 | static int count_syllables(slang_T *slang, char_u *word) |
1914 | { |
1915 | int cnt = 0; |
1916 | bool skip = false; |
1917 | char_u *p; |
1918 | int len; |
1919 | syl_item_T *syl; |
1920 | int c; |
1921 | |
1922 | if (slang->sl_syllable == NULL) |
1923 | return 0; |
1924 | |
1925 | for (p = word; *p != NUL; p += len) { |
1926 | // When running into a space reset counter. |
1927 | if (*p == ' ') { |
1928 | len = 1; |
1929 | cnt = 0; |
1930 | continue; |
1931 | } |
1932 | |
1933 | // Find longest match of syllable items. |
1934 | len = 0; |
1935 | for (int i = 0; i < slang->sl_syl_items.ga_len; ++i) { |
1936 | syl = ((syl_item_T *)slang->sl_syl_items.ga_data) + i; |
1937 | if (syl->sy_len > len |
1938 | && STRNCMP(p, syl->sy_chars, syl->sy_len) == 0) |
1939 | len = syl->sy_len; |
1940 | } |
1941 | if (len != 0) { // found a match, count syllable |
1942 | ++cnt; |
1943 | skip = false; |
1944 | } else { |
1945 | // No recognized syllable item, at least a syllable char then? |
1946 | c = utf_ptr2char(p); |
1947 | len = (*mb_ptr2len)(p); |
1948 | if (vim_strchr(slang->sl_syllable, c) == NULL) |
1949 | skip = false; // No, search for next syllable |
1950 | else if (!skip) { |
1951 | ++cnt; // Yes, count it |
1952 | skip = true; // don't count following syllable chars |
1953 | } |
1954 | } |
1955 | } |
1956 | return cnt; |
1957 | } |
1958 | |
1959 | // Parse 'spelllang' and set w_s->b_langp accordingly. |
1960 | // Returns NULL if it's OK, an error message otherwise. |
1961 | char_u *did_set_spelllang(win_T *wp) |
1962 | { |
1963 | garray_T ga; |
1964 | char_u *splp; |
1965 | char_u *region; |
1966 | char_u region_cp[3]; |
1967 | bool filename; |
1968 | int region_mask; |
1969 | slang_T *slang; |
1970 | int c; |
1971 | char_u lang[MAXWLEN + 1]; |
1972 | char_u spf_name[MAXPATHL]; |
1973 | int len; |
1974 | char_u *p; |
1975 | int round; |
1976 | char_u *spf; |
1977 | char_u *use_region = NULL; |
1978 | bool dont_use_region = false; |
1979 | bool nobreak = false; |
1980 | langp_T *lp, *lp2; |
1981 | static bool recursive = false; |
1982 | char_u *ret_msg = NULL; |
1983 | char_u *spl_copy; |
1984 | |
1985 | bufref_T bufref; |
1986 | set_bufref(&bufref, wp->w_buffer); |
1987 | |
1988 | // We don't want to do this recursively. May happen when a language is |
1989 | // not available and the SpellFileMissing autocommand opens a new buffer |
1990 | // in which 'spell' is set. |
1991 | if (recursive) |
1992 | return NULL; |
1993 | recursive = true; |
1994 | |
1995 | ga_init(&ga, sizeof(langp_T), 2); |
1996 | clear_midword(wp); |
1997 | |
1998 | // Make a copy of 'spelllang', the SpellFileMissing autocommands may change |
1999 | // it under our fingers. |
2000 | spl_copy = vim_strsave(wp->w_s->b_p_spl); |
2001 | |
2002 | wp->w_s->b_cjk = 0; |
2003 | |
2004 | // Loop over comma separated language names. |
2005 | for (splp = spl_copy; *splp != NUL; ) { |
2006 | // Get one language name. |
2007 | copy_option_part(&splp, lang, MAXWLEN, "," ); |
2008 | region = NULL; |
2009 | len = (int)STRLEN(lang); |
2010 | |
2011 | if (STRCMP(lang, "cjk" ) == 0) { |
2012 | wp->w_s->b_cjk = 1; |
2013 | continue; |
2014 | } |
2015 | |
2016 | // If the name ends in ".spl" use it as the name of the spell file. |
2017 | // If there is a region name let "region" point to it and remove it |
2018 | // from the name. |
2019 | if (len > 4 && fnamecmp(lang + len - 4, ".spl" ) == 0) { |
2020 | filename = true; |
2021 | |
2022 | // Locate a region and remove it from the file name. |
2023 | p = vim_strchr(path_tail(lang), '_'); |
2024 | if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2]) |
2025 | && !ASCII_ISALPHA(p[3])) { |
2026 | STRLCPY(region_cp, p + 1, 3); |
2027 | memmove(p, p + 3, len - (p - lang) - 2); |
2028 | region = region_cp; |
2029 | } else |
2030 | dont_use_region = true; |
2031 | |
2032 | // Check if we loaded this language before. |
2033 | for (slang = first_lang; slang != NULL; slang = slang->sl_next) { |
2034 | if (path_full_compare(lang, slang->sl_fname, false) == kEqualFiles) { |
2035 | break; |
2036 | } |
2037 | } |
2038 | } else { |
2039 | filename = false; |
2040 | if (len > 3 && lang[len - 3] == '_') { |
2041 | region = lang + len - 2; |
2042 | lang[len - 3] = NUL; |
2043 | } else |
2044 | dont_use_region = true; |
2045 | |
2046 | // Check if we loaded this language before. |
2047 | for (slang = first_lang; slang != NULL; slang = slang->sl_next) |
2048 | if (STRICMP(lang, slang->sl_name) == 0) |
2049 | break; |
2050 | } |
2051 | |
2052 | if (region != NULL) { |
2053 | // If the region differs from what was used before then don't |
2054 | // use it for 'spellfile'. |
2055 | if (use_region != NULL && STRCMP(region, use_region) != 0) |
2056 | dont_use_region = true; |
2057 | use_region = region; |
2058 | } |
2059 | |
2060 | // If not found try loading the language now. |
2061 | if (slang == NULL) { |
2062 | if (filename) |
2063 | (void)spell_load_file(lang, lang, NULL, false); |
2064 | else { |
2065 | spell_load_lang(lang); |
2066 | // SpellFileMissing autocommands may do anything, including |
2067 | // destroying the buffer we are using... |
2068 | if (!bufref_valid(&bufref)) { |
2069 | ret_msg = |
2070 | (char_u *)N_("E797: SpellFileMissing autocommand deleted buffer" ); |
2071 | goto theend; |
2072 | } |
2073 | } |
2074 | } |
2075 | |
2076 | // Loop over the languages, there can be several files for "lang". |
2077 | for (slang = first_lang; slang != NULL; slang = slang->sl_next) { |
2078 | if (filename |
2079 | ? path_full_compare(lang, slang->sl_fname, false) == kEqualFiles |
2080 | : STRICMP(lang, slang->sl_name) == 0) { |
2081 | region_mask = REGION_ALL; |
2082 | if (!filename && region != NULL) { |
2083 | // find region in sl_regions |
2084 | c = find_region(slang->sl_regions, region); |
2085 | if (c == REGION_ALL) { |
2086 | if (slang->sl_add) { |
2087 | if (*slang->sl_regions != NUL) |
2088 | // This addition file is for other regions. |
2089 | region_mask = 0; |
2090 | } else |
2091 | // This is probably an error. Give a warning and |
2092 | // accept the words anyway. |
2093 | smsg(_("Warning: region %s not supported" ), |
2094 | region); |
2095 | } else |
2096 | region_mask = 1 << c; |
2097 | } |
2098 | |
2099 | if (region_mask != 0) { |
2100 | langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga); |
2101 | p_->lp_slang = slang; |
2102 | p_->lp_region = region_mask; |
2103 | |
2104 | use_midword(slang, wp); |
2105 | if (slang->sl_nobreak) |
2106 | nobreak = true; |
2107 | } |
2108 | } |
2109 | } |
2110 | } |
2111 | |
2112 | // round 0: load int_wordlist, if possible. |
2113 | // round 1: load first name in 'spellfile'. |
2114 | // round 2: load second name in 'spellfile. |
2115 | // etc. |
2116 | spf = curwin->w_s->b_p_spf; |
2117 | for (round = 0; round == 0 || *spf != NUL; ++round) { |
2118 | if (round == 0) { |
2119 | // Internal wordlist, if there is one. |
2120 | if (int_wordlist == NULL) |
2121 | continue; |
2122 | int_wordlist_spl(spf_name); |
2123 | } else { |
2124 | // One entry in 'spellfile'. |
2125 | copy_option_part(&spf, spf_name, MAXPATHL - 5, "," ); |
2126 | STRCAT(spf_name, ".spl" ); |
2127 | |
2128 | // If it was already found above then skip it. |
2129 | for (c = 0; c < ga.ga_len; ++c) { |
2130 | p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname; |
2131 | if (p != NULL |
2132 | && path_full_compare(spf_name, p, false) == kEqualFiles) { |
2133 | break; |
2134 | } |
2135 | } |
2136 | if (c < ga.ga_len) |
2137 | continue; |
2138 | } |
2139 | |
2140 | // Check if it was loaded already. |
2141 | for (slang = first_lang; slang != NULL; slang = slang->sl_next) { |
2142 | if (path_full_compare(spf_name, slang->sl_fname, false) == kEqualFiles) { |
2143 | break; |
2144 | } |
2145 | } |
2146 | if (slang == NULL) { |
2147 | // Not loaded, try loading it now. The language name includes the |
2148 | // region name, the region is ignored otherwise. for int_wordlist |
2149 | // use an arbitrary name. |
2150 | if (round == 0) |
2151 | STRCPY(lang, "internal wordlist" ); |
2152 | else { |
2153 | STRLCPY(lang, path_tail(spf_name), MAXWLEN + 1); |
2154 | p = vim_strchr(lang, '.'); |
2155 | if (p != NULL) |
2156 | *p = NUL; // truncate at ".encoding.add" |
2157 | } |
2158 | slang = spell_load_file(spf_name, lang, NULL, true); |
2159 | |
2160 | // If one of the languages has NOBREAK we assume the addition |
2161 | // files also have this. |
2162 | if (slang != NULL && nobreak) |
2163 | slang->sl_nobreak = true; |
2164 | } |
2165 | if (slang != NULL) { |
2166 | region_mask = REGION_ALL; |
2167 | if (use_region != NULL && !dont_use_region) { |
2168 | // find region in sl_regions |
2169 | c = find_region(slang->sl_regions, use_region); |
2170 | if (c != REGION_ALL) |
2171 | region_mask = 1 << c; |
2172 | else if (*slang->sl_regions != NUL) |
2173 | // This spell file is for other regions. |
2174 | region_mask = 0; |
2175 | } |
2176 | |
2177 | if (region_mask != 0) { |
2178 | langp_T *p_ = GA_APPEND_VIA_PTR(langp_T, &ga); |
2179 | p_->lp_slang = slang; |
2180 | p_->lp_sallang = NULL; |
2181 | p_->lp_replang = NULL; |
2182 | p_->lp_region = region_mask; |
2183 | |
2184 | use_midword(slang, wp); |
2185 | } |
2186 | } |
2187 | } |
2188 | |
2189 | // Everything is fine, store the new b_langp value. |
2190 | ga_clear(&wp->w_s->b_langp); |
2191 | wp->w_s->b_langp = ga; |
2192 | |
2193 | // For each language figure out what language to use for sound folding and |
2194 | // REP items. If the language doesn't support it itself use another one |
2195 | // with the same name. E.g. for "en-math" use "en". |
2196 | for (int i = 0; i < ga.ga_len; ++i) { |
2197 | lp = LANGP_ENTRY(ga, i); |
2198 | |
2199 | // sound folding |
2200 | if (!GA_EMPTY(&lp->lp_slang->sl_sal)) |
2201 | // language does sound folding itself |
2202 | lp->lp_sallang = lp->lp_slang; |
2203 | else |
2204 | // find first similar language that does sound folding |
2205 | for (int j = 0; j < ga.ga_len; ++j) { |
2206 | lp2 = LANGP_ENTRY(ga, j); |
2207 | if (!GA_EMPTY(&lp2->lp_slang->sl_sal) |
2208 | && STRNCMP(lp->lp_slang->sl_name, |
2209 | lp2->lp_slang->sl_name, 2) == 0) { |
2210 | lp->lp_sallang = lp2->lp_slang; |
2211 | break; |
2212 | } |
2213 | } |
2214 | |
2215 | // REP items |
2216 | if (!GA_EMPTY(&lp->lp_slang->sl_rep)) |
2217 | // language has REP items itself |
2218 | lp->lp_replang = lp->lp_slang; |
2219 | else |
2220 | // find first similar language that has REP items |
2221 | for (int j = 0; j < ga.ga_len; ++j) { |
2222 | lp2 = LANGP_ENTRY(ga, j); |
2223 | if (!GA_EMPTY(&lp2->lp_slang->sl_rep) |
2224 | && STRNCMP(lp->lp_slang->sl_name, |
2225 | lp2->lp_slang->sl_name, 2) == 0) { |
2226 | lp->lp_replang = lp2->lp_slang; |
2227 | break; |
2228 | } |
2229 | } |
2230 | } |
2231 | |
2232 | theend: |
2233 | xfree(spl_copy); |
2234 | recursive = false; |
2235 | redraw_win_later(wp, NOT_VALID); |
2236 | return ret_msg; |
2237 | } |
2238 | |
2239 | // Clear the midword characters for buffer "buf". |
2240 | static void clear_midword(win_T *wp) |
2241 | { |
2242 | memset(wp->w_s->b_spell_ismw, 0, 256); |
2243 | XFREE_CLEAR(wp->w_s->b_spell_ismw_mb); |
2244 | } |
2245 | |
2246 | // Use the "sl_midword" field of language "lp" for buffer "buf". |
2247 | // They add up to any currently used midword characters. |
2248 | static void use_midword(slang_T *lp, win_T *wp) |
2249 | { |
2250 | char_u *p; |
2251 | |
2252 | if (lp->sl_midword == NULL) // there aren't any |
2253 | return; |
2254 | |
2255 | for (p = lp->sl_midword; *p != NUL; ) |
2256 | if (has_mbyte) { |
2257 | int c, l, n; |
2258 | char_u *bp; |
2259 | |
2260 | c = utf_ptr2char(p); |
2261 | l = (*mb_ptr2len)(p); |
2262 | if (c < 256 && l <= 2) |
2263 | wp->w_s->b_spell_ismw[c] = true; |
2264 | else if (wp->w_s->b_spell_ismw_mb == NULL) |
2265 | // First multi-byte char in "b_spell_ismw_mb". |
2266 | wp->w_s->b_spell_ismw_mb = vim_strnsave(p, l); |
2267 | else { |
2268 | // Append multi-byte chars to "b_spell_ismw_mb". |
2269 | n = (int)STRLEN(wp->w_s->b_spell_ismw_mb); |
2270 | bp = vim_strnsave(wp->w_s->b_spell_ismw_mb, n + l); |
2271 | xfree(wp->w_s->b_spell_ismw_mb); |
2272 | wp->w_s->b_spell_ismw_mb = bp; |
2273 | STRLCPY(bp + n, p, l + 1); |
2274 | } |
2275 | p += l; |
2276 | } else |
2277 | wp->w_s->b_spell_ismw[*p++] = true; |
2278 | } |
2279 | |
2280 | // Find the region "region[2]" in "rp" (points to "sl_regions"). |
2281 | // Each region is simply stored as the two characters of its name. |
2282 | // Returns the index if found (first is 0), REGION_ALL if not found. |
2283 | static int find_region(char_u *rp, char_u *region) |
2284 | { |
2285 | int i; |
2286 | |
2287 | for (i = 0;; i += 2) { |
2288 | if (rp[i] == NUL) |
2289 | return REGION_ALL; |
2290 | if (rp[i] == region[0] && rp[i + 1] == region[1]) |
2291 | break; |
2292 | } |
2293 | return i / 2; |
2294 | } |
2295 | |
2296 | /// Return case type of word: |
2297 | /// w word 0 |
2298 | /// Word WF_ONECAP |
2299 | /// W WORD WF_ALLCAP |
2300 | /// WoRd wOrd WF_KEEPCAP |
2301 | /// |
2302 | /// @param[in] word |
2303 | /// @param[in] end End of word or NULL for NUL delimited string |
2304 | /// |
2305 | /// @returns Case type of word |
2306 | int captype(char_u *word, char_u *end) |
2307 | FUNC_ATTR_NONNULL_ARG(1) |
2308 | { |
2309 | char_u *p; |
2310 | int c; |
2311 | int firstcap; |
2312 | bool allcap; |
2313 | bool past_second = false; // past second word char |
2314 | |
2315 | // find first letter |
2316 | for (p = word; !spell_iswordp_nmw(p, curwin); MB_PTR_ADV(p)) { |
2317 | if (end == NULL ? *p == NUL : p >= end) { |
2318 | return 0; // only non-word characters, illegal word |
2319 | } |
2320 | } |
2321 | if (has_mbyte) { |
2322 | c = mb_ptr2char_adv((const char_u **)&p); |
2323 | } else { |
2324 | c = *p++; |
2325 | } |
2326 | firstcap = allcap = SPELL_ISUPPER(c); |
2327 | |
2328 | // Need to check all letters to find a word with mixed upper/lower. |
2329 | // But a word with an upper char only at start is a ONECAP. |
2330 | for (; end == NULL ? *p != NUL : p < end; MB_PTR_ADV(p)) { |
2331 | if (spell_iswordp_nmw(p, curwin)) { |
2332 | c = PTR2CHAR(p); |
2333 | if (!SPELL_ISUPPER(c)) { |
2334 | // UUl -> KEEPCAP |
2335 | if (past_second && allcap) { |
2336 | return WF_KEEPCAP; |
2337 | } |
2338 | allcap = false; |
2339 | } else if (!allcap) { |
2340 | // UlU -> KEEPCAP |
2341 | return WF_KEEPCAP; |
2342 | } |
2343 | past_second = true; |
2344 | } |
2345 | } |
2346 | |
2347 | if (allcap) |
2348 | return WF_ALLCAP; |
2349 | if (firstcap) |
2350 | return WF_ONECAP; |
2351 | return 0; |
2352 | } |
2353 | |
2354 | // Like captype() but for a KEEPCAP word add ONECAP if the word starts with a |
2355 | // capital. So that make_case_word() can turn WOrd into Word. |
2356 | // Add ALLCAP for "WOrD". |
2357 | static int badword_captype(char_u *word, char_u *end) |
2358 | FUNC_ATTR_NONNULL_ALL |
2359 | { |
2360 | int flags = captype(word, end); |
2361 | int c; |
2362 | int l, u; |
2363 | bool first; |
2364 | char_u *p; |
2365 | |
2366 | if (flags & WF_KEEPCAP) { |
2367 | // Count the number of UPPER and lower case letters. |
2368 | l = u = 0; |
2369 | first = false; |
2370 | for (p = word; p < end; MB_PTR_ADV(p)) { |
2371 | c = PTR2CHAR(p); |
2372 | if (SPELL_ISUPPER(c)) { |
2373 | ++u; |
2374 | if (p == word) |
2375 | first = true; |
2376 | } else |
2377 | ++l; |
2378 | } |
2379 | |
2380 | // If there are more UPPER than lower case letters suggest an |
2381 | // ALLCAP word. Otherwise, if the first letter is UPPER then |
2382 | // suggest ONECAP. Exception: "ALl" most likely should be "All", |
2383 | // require three upper case letters. |
2384 | if (u > l && u > 2) |
2385 | flags |= WF_ALLCAP; |
2386 | else if (first) |
2387 | flags |= WF_ONECAP; |
2388 | |
2389 | if (u >= 2 && l >= 2) // maCARONI maCAroni |
2390 | flags |= WF_MIXCAP; |
2391 | } |
2392 | return flags; |
2393 | } |
2394 | |
2395 | // Delete the internal wordlist and its .spl file. |
2396 | void spell_delete_wordlist(void) |
2397 | { |
2398 | char_u fname[MAXPATHL] = {0}; |
2399 | |
2400 | if (int_wordlist != NULL) { |
2401 | os_remove((char *)int_wordlist); |
2402 | int_wordlist_spl(fname); |
2403 | os_remove((char *)fname); |
2404 | XFREE_CLEAR(int_wordlist); |
2405 | } |
2406 | } |
2407 | |
2408 | // Free all languages. |
2409 | void spell_free_all(void) |
2410 | { |
2411 | slang_T *slang; |
2412 | |
2413 | // Go through all buffers and handle 'spelllang'. <VN> |
2414 | FOR_ALL_BUFFERS(buf) { |
2415 | ga_clear(&buf->b_s.b_langp); |
2416 | } |
2417 | |
2418 | while (first_lang != NULL) { |
2419 | slang = first_lang; |
2420 | first_lang = slang->sl_next; |
2421 | slang_free(slang); |
2422 | } |
2423 | |
2424 | spell_delete_wordlist(); |
2425 | |
2426 | XFREE_CLEAR(repl_to); |
2427 | XFREE_CLEAR(repl_from); |
2428 | } |
2429 | |
2430 | // Clear all spelling tables and reload them. |
2431 | // Used after 'encoding' is set and when ":mkspell" was used. |
2432 | void spell_reload(void) |
2433 | { |
2434 | // Initialize the table for spell_iswordp(). |
2435 | init_spell_chartab(); |
2436 | |
2437 | // Unload all allocated memory. |
2438 | spell_free_all(); |
2439 | |
2440 | // Go through all buffers and handle 'spelllang'. |
2441 | FOR_ALL_WINDOWS_IN_TAB(wp, curtab) { |
2442 | // Only load the wordlists when 'spelllang' is set and there is a |
2443 | // window for this buffer in which 'spell' is set. |
2444 | if (*wp->w_s->b_p_spl != NUL) { |
2445 | if (wp->w_p_spell) { |
2446 | (void)did_set_spelllang(wp); |
2447 | break; |
2448 | } |
2449 | } |
2450 | } |
2451 | } |
2452 | |
2453 | |
2454 | // Opposite of offset2bytes(). |
2455 | // "pp" points to the bytes and is advanced over it. |
2456 | // Returns the offset. |
2457 | static int bytes2offset(char_u **pp) |
2458 | { |
2459 | char_u *p = *pp; |
2460 | int nr; |
2461 | int c; |
2462 | |
2463 | c = *p++; |
2464 | if ((c & 0x80) == 0x00) { // 1 byte |
2465 | nr = c - 1; |
2466 | } else if ((c & 0xc0) == 0x80) { // 2 bytes |
2467 | nr = (c & 0x3f) - 1; |
2468 | nr = nr * 255 + (*p++ - 1); |
2469 | } else if ((c & 0xe0) == 0xc0) { // 3 bytes |
2470 | nr = (c & 0x1f) - 1; |
2471 | nr = nr * 255 + (*p++ - 1); |
2472 | nr = nr * 255 + (*p++ - 1); |
2473 | } else { // 4 bytes |
2474 | nr = (c & 0x0f) - 1; |
2475 | nr = nr * 255 + (*p++ - 1); |
2476 | nr = nr * 255 + (*p++ - 1); |
2477 | nr = nr * 255 + (*p++ - 1); |
2478 | } |
2479 | |
2480 | *pp = p; |
2481 | return nr; |
2482 | } |
2483 | |
2484 | // Open a spell buffer. This is a nameless buffer that is not in the buffer |
2485 | // list and only contains text lines. Can use a swapfile to reduce memory |
2486 | // use. |
2487 | // Most other fields are invalid! Esp. watch out for string options being |
2488 | // NULL and there is no undo info. |
2489 | buf_T *open_spellbuf(void) |
2490 | { |
2491 | buf_T *buf = xcalloc(1, sizeof(buf_T)); |
2492 | |
2493 | buf->b_spell = true; |
2494 | buf->b_p_swf = true; // may create a swap file |
2495 | if (ml_open(buf) == FAIL) { |
2496 | ELOG("Error opening a new memline" ); |
2497 | } |
2498 | ml_open_file(buf); // create swap file now |
2499 | |
2500 | return buf; |
2501 | } |
2502 | |
2503 | // Close the buffer used for spell info. |
2504 | void close_spellbuf(buf_T *buf) |
2505 | { |
2506 | if (buf != NULL) { |
2507 | ml_close(buf, TRUE); |
2508 | xfree(buf); |
2509 | } |
2510 | } |
2511 | |
2512 | // Init the chartab used for spelling for ASCII. |
2513 | void clear_spell_chartab(spelltab_T *sp) |
2514 | { |
2515 | int i; |
2516 | |
2517 | // Init everything to false. |
2518 | memset(sp->st_isw, false, sizeof(sp->st_isw)); |
2519 | memset(sp->st_isu, false, sizeof(sp->st_isu)); |
2520 | |
2521 | for (i = 0; i < 256; ++i) { |
2522 | sp->st_fold[i] = i; |
2523 | sp->st_upper[i] = i; |
2524 | } |
2525 | |
2526 | // We include digits. A word shouldn't start with a digit, but handling |
2527 | // that is done separately. |
2528 | for (i = '0'; i <= '9'; ++i) |
2529 | sp->st_isw[i] = true; |
2530 | for (i = 'A'; i <= 'Z'; ++i) { |
2531 | sp->st_isw[i] = true; |
2532 | sp->st_isu[i] = true; |
2533 | sp->st_fold[i] = i + 0x20; |
2534 | } |
2535 | for (i = 'a'; i <= 'z'; ++i) { |
2536 | sp->st_isw[i] = true; |
2537 | sp->st_upper[i] = i - 0x20; |
2538 | } |
2539 | } |
2540 | |
2541 | // Init the chartab used for spelling. Called once while starting up. |
2542 | // The default is to use isalpha(), but the spell file should define the word |
2543 | // characters to make it possible that 'encoding' differs from the current |
2544 | // locale. For utf-8 we don't use isalpha() but our own functions. |
2545 | void init_spell_chartab(void) |
2546 | { |
2547 | int i; |
2548 | |
2549 | did_set_spelltab = false; |
2550 | clear_spell_chartab(&spelltab); |
2551 | for (i = 128; i < 256; i++) { |
2552 | int f = utf_fold(i); |
2553 | int u = mb_toupper(i); |
2554 | |
2555 | spelltab.st_isu[i] = mb_isupper(i); |
2556 | spelltab.st_isw[i] = spelltab.st_isu[i] || mb_islower(i); |
2557 | // The folded/upper-cased value is different between latin1 and |
2558 | // utf8 for 0xb5, causing E763 for no good reason. Use the latin1 |
2559 | // value for utf-8 to avoid this. |
2560 | spelltab.st_fold[i] = (f < 256) ? f : i; |
2561 | spelltab.st_upper[i] = (u < 256) ? u : i; |
2562 | } |
2563 | } |
2564 | |
2565 | /// Returns true if "p" points to a word character. |
2566 | /// As a special case we see "midword" characters as word character when it is |
2567 | /// followed by a word character. This finds they'there but not 'they there'. |
2568 | /// Thus this only works properly when past the first character of the word. |
2569 | /// |
2570 | /// @param wp Buffer used. |
2571 | static bool spell_iswordp(char_u *p, win_T *wp) |
2572 | { |
2573 | char_u *s; |
2574 | int l; |
2575 | int c; |
2576 | |
2577 | if (has_mbyte) { |
2578 | l = MB_PTR2LEN(p); |
2579 | s = p; |
2580 | if (l == 1) { |
2581 | // be quick for ASCII |
2582 | if (wp->w_s->b_spell_ismw[*p]) |
2583 | s = p + 1; // skip a mid-word character |
2584 | } else { |
2585 | c = utf_ptr2char(p); |
2586 | if (c < 256 ? wp->w_s->b_spell_ismw[c] |
2587 | : (wp->w_s->b_spell_ismw_mb != NULL |
2588 | && vim_strchr(wp->w_s->b_spell_ismw_mb, c) != NULL)) { |
2589 | s = p + l; |
2590 | } |
2591 | } |
2592 | |
2593 | c = utf_ptr2char(s); |
2594 | if (c > 255) { |
2595 | return spell_mb_isword_class(mb_get_class(s), wp); |
2596 | } |
2597 | return spelltab.st_isw[c]; |
2598 | } |
2599 | |
2600 | return spelltab.st_isw[wp->w_s->b_spell_ismw[*p] ? p[1] : p[0]]; |
2601 | } |
2602 | |
2603 | // Returns true if "p" points to a word character. |
2604 | // Unlike spell_iswordp() this doesn't check for "midword" characters. |
2605 | bool spell_iswordp_nmw(const char_u *p, win_T *wp) |
2606 | { |
2607 | int c = utf_ptr2char(p); |
2608 | if (c > 255) { |
2609 | return spell_mb_isword_class(mb_get_class(p), wp); |
2610 | } |
2611 | return spelltab.st_isw[c]; |
2612 | } |
2613 | |
2614 | // Returns true if word class indicates a word character. |
2615 | // Only for characters above 255. |
2616 | // Unicode subscript and superscript are not considered word characters. |
2617 | // See also utf_class() in mbyte.c. |
2618 | static bool spell_mb_isword_class(int cl, win_T *wp) |
2619 | { |
2620 | if (wp->w_s->b_cjk) |
2621 | // East Asian characters are not considered word characters. |
2622 | return cl == 2 || cl == 0x2800; |
2623 | return cl >= 2 && cl != 0x2070 && cl != 0x2080 && cl != 3; |
2624 | } |
2625 | |
2626 | // Returns true if "p" points to a word character. |
2627 | // Wide version of spell_iswordp(). |
2628 | static bool spell_iswordp_w(int *p, win_T *wp) |
2629 | { |
2630 | int *s; |
2631 | |
2632 | if (*p < 256 ? wp->w_s->b_spell_ismw[*p] |
2633 | : (wp->w_s->b_spell_ismw_mb != NULL |
2634 | && vim_strchr(wp->w_s->b_spell_ismw_mb, *p) != NULL)) |
2635 | s = p + 1; |
2636 | else |
2637 | s = p; |
2638 | |
2639 | if (*s > 255) { |
2640 | return spell_mb_isword_class(utf_class(*s), wp); |
2641 | } |
2642 | return spelltab.st_isw[*s]; |
2643 | } |
2644 | |
2645 | // Case-fold "str[len]" into "buf[buflen]". The result is NUL terminated. |
2646 | // Uses the character definitions from the .spl file. |
2647 | // When using a multi-byte 'encoding' the length may change! |
2648 | // Returns FAIL when something wrong. |
2649 | int spell_casefold(char_u *str, int len, char_u *buf, int buflen) |
2650 | { |
2651 | int i; |
2652 | |
2653 | if (len >= buflen) { |
2654 | buf[0] = NUL; |
2655 | return FAIL; // result will not fit |
2656 | } |
2657 | |
2658 | if (has_mbyte) { |
2659 | int outi = 0; |
2660 | char_u *p; |
2661 | int c; |
2662 | |
2663 | // Fold one character at a time. |
2664 | for (p = str; p < str + len; ) { |
2665 | if (outi + MB_MAXBYTES > buflen) { |
2666 | buf[outi] = NUL; |
2667 | return FAIL; |
2668 | } |
2669 | c = mb_cptr2char_adv((const char_u **)&p); |
2670 | outi += utf_char2bytes(SPELL_TOFOLD(c), buf + outi); |
2671 | } |
2672 | buf[outi] = NUL; |
2673 | } else { |
2674 | // Be quick for non-multibyte encodings. |
2675 | for (i = 0; i < len; ++i) |
2676 | buf[i] = spelltab.st_fold[str[i]]; |
2677 | buf[i] = NUL; |
2678 | } |
2679 | |
2680 | return OK; |
2681 | } |
2682 | |
// Values for "sps_flags".  spell_check_sps() stores the one that belongs to
// the "best", "fast" or "double" item of 'spellsuggest'.
#define SPS_BEST 1
#define SPS_FAST 2
#define SPS_DOUBLE 4

// Both are set by spell_check_sps(), which falls back to the values used
// here when 'spellsuggest' is invalid.
static int sps_flags = SPS_BEST; // flags from 'spellsuggest'
static int sps_limit = 9999; // max nr of suggestions given
2690 | |
2691 | // Check the 'spellsuggest' option. Return FAIL if it's wrong. |
2692 | // Sets "sps_flags" and "sps_limit". |
2693 | int spell_check_sps(void) |
2694 | { |
2695 | char_u *p; |
2696 | char_u *s; |
2697 | char_u buf[MAXPATHL]; |
2698 | int f; |
2699 | |
2700 | sps_flags = 0; |
2701 | sps_limit = 9999; |
2702 | |
2703 | for (p = p_sps; *p != NUL; ) { |
2704 | copy_option_part(&p, buf, MAXPATHL, "," ); |
2705 | |
2706 | f = 0; |
2707 | if (ascii_isdigit(*buf)) { |
2708 | s = buf; |
2709 | sps_limit = getdigits_int(&s, true, 0); |
2710 | if (*s != NUL && !ascii_isdigit(*s)) { |
2711 | f = -1; |
2712 | } |
2713 | } else if (STRCMP(buf, "best" ) == 0) { |
2714 | f = SPS_BEST; |
2715 | } else if (STRCMP(buf, "fast" ) == 0) { |
2716 | f = SPS_FAST; |
2717 | } else if (STRCMP(buf, "double" ) == 0) { |
2718 | f = SPS_DOUBLE; |
2719 | } else if (STRNCMP(buf, "expr:" , 5) != 0 |
2720 | && STRNCMP(buf, "file:" , 5) != 0) { |
2721 | f = -1; |
2722 | } |
2723 | |
2724 | if (f == -1 || (sps_flags != 0 && f != 0)) { |
2725 | sps_flags = SPS_BEST; |
2726 | sps_limit = 9999; |
2727 | return FAIL; |
2728 | } |
2729 | if (f != 0) |
2730 | sps_flags = f; |
2731 | } |
2732 | |
2733 | if (sps_flags == 0) |
2734 | sps_flags = SPS_BEST; |
2735 | |
2736 | return OK; |
2737 | } |
2738 | |
2739 | // "z=": Find badly spelled word under or after the cursor. |
2740 | // Give suggestions for the properly spelled word. |
2741 | // In Visual mode use the highlighted word as the bad word. |
2742 | // When "count" is non-zero use that suggestion. |
2743 | void spell_suggest(int count) |
2744 | { |
2745 | char_u *line; |
2746 | pos_T prev_cursor = curwin->w_cursor; |
2747 | char_u wcopy[MAXWLEN + 2]; |
2748 | char_u *p; |
2749 | int c; |
2750 | suginfo_T sug; |
2751 | suggest_T *stp; |
2752 | int mouse_used; |
2753 | int need_cap; |
2754 | int limit; |
2755 | int selected = count; |
2756 | int badlen = 0; |
2757 | int msg_scroll_save = msg_scroll; |
2758 | |
2759 | if (no_spell_checking(curwin)) |
2760 | return; |
2761 | |
2762 | if (VIsual_active) { |
2763 | // Use the Visually selected text as the bad word. But reject |
2764 | // a multi-line selection. |
2765 | if (curwin->w_cursor.lnum != VIsual.lnum) { |
2766 | vim_beep(BO_SPELL); |
2767 | return; |
2768 | } |
2769 | badlen = (int)curwin->w_cursor.col - (int)VIsual.col; |
2770 | if (badlen < 0) { |
2771 | badlen = -badlen; |
2772 | } else { |
2773 | curwin->w_cursor.col = VIsual.col; |
2774 | } |
2775 | badlen++; |
2776 | end_visual_mode(); |
2777 | } else |
2778 | // Find the start of the badly spelled word. |
2779 | if (spell_move_to(curwin, FORWARD, true, true, NULL) == 0 |
2780 | || curwin->w_cursor.col > prev_cursor.col) { |
2781 | // No bad word or it starts after the cursor: use the word under the |
2782 | // cursor. |
2783 | curwin->w_cursor = prev_cursor; |
2784 | line = get_cursor_line_ptr(); |
2785 | p = line + curwin->w_cursor.col; |
2786 | // Backup to before start of word. |
2787 | while (p > line && spell_iswordp_nmw(p, curwin)) { |
2788 | MB_PTR_BACK(line, p); |
2789 | } |
2790 | // Forward to start of word. |
2791 | while (*p != NUL && !spell_iswordp_nmw(p, curwin)) { |
2792 | MB_PTR_ADV(p); |
2793 | } |
2794 | |
2795 | if (!spell_iswordp_nmw(p, curwin)) { // No word found. |
2796 | beep_flush(); |
2797 | return; |
2798 | } |
2799 | curwin->w_cursor.col = (colnr_T)(p - line); |
2800 | } |
2801 | |
2802 | // Get the word and its length. |
2803 | |
2804 | // Figure out if the word should be capitalised. |
2805 | need_cap = check_need_cap(curwin->w_cursor.lnum, curwin->w_cursor.col); |
2806 | |
2807 | // Make a copy of current line since autocommands may free the line. |
2808 | line = vim_strsave(get_cursor_line_ptr()); |
2809 | |
2810 | // Get the list of suggestions. Limit to 'lines' - 2 or the number in |
2811 | // 'spellsuggest', whatever is smaller. |
2812 | if (sps_limit > (int)Rows - 2) |
2813 | limit = (int)Rows - 2; |
2814 | else |
2815 | limit = sps_limit; |
2816 | spell_find_suggest(line + curwin->w_cursor.col, badlen, &sug, limit, |
2817 | true, need_cap, true); |
2818 | |
2819 | if (GA_EMPTY(&sug.su_ga)) |
2820 | MSG(_("Sorry, no suggestions" )); |
2821 | else if (count > 0) { |
2822 | if (count > sug.su_ga.ga_len) |
2823 | smsg(_("Sorry, only %" PRId64 " suggestions" ), |
2824 | (int64_t)sug.su_ga.ga_len); |
2825 | } else { |
2826 | XFREE_CLEAR(repl_from); |
2827 | XFREE_CLEAR(repl_to); |
2828 | |
2829 | // When 'rightleft' is set the list is drawn right-left. |
2830 | cmdmsg_rl = curwin->w_p_rl; |
2831 | if (cmdmsg_rl) |
2832 | msg_col = Columns - 1; |
2833 | |
2834 | // List the suggestions. |
2835 | msg_start(); |
2836 | msg_row = Rows - 1; // for when 'cmdheight' > 1 |
2837 | lines_left = Rows; // avoid more prompt |
2838 | vim_snprintf((char *)IObuff, IOSIZE, _("Change \"%.*s\" to:" ), |
2839 | sug.su_badlen, sug.su_badptr); |
2840 | if (cmdmsg_rl && STRNCMP(IObuff, "Change" , 6) == 0) { |
2841 | // And now the rabbit from the high hat: Avoid showing the |
2842 | // untranslated message rightleft. |
2843 | vim_snprintf((char *)IObuff, IOSIZE, ":ot \"%.*s\" egnahC" , |
2844 | sug.su_badlen, sug.su_badptr); |
2845 | } |
2846 | msg_puts((const char *)IObuff); |
2847 | msg_clr_eos(); |
2848 | msg_putchar('\n'); |
2849 | |
2850 | msg_scroll = TRUE; |
2851 | for (int i = 0; i < sug.su_ga.ga_len; ++i) { |
2852 | stp = &SUG(sug.su_ga, i); |
2853 | |
2854 | // The suggested word may replace only part of the bad word, add |
2855 | // the not replaced part. |
2856 | STRLCPY(wcopy, stp->st_word, MAXWLEN + 1); |
2857 | if (sug.su_badlen > stp->st_orglen) |
2858 | STRLCPY(wcopy + stp->st_wordlen, |
2859 | sug.su_badptr + stp->st_orglen, |
2860 | sug.su_badlen - stp->st_orglen + 1); |
2861 | vim_snprintf((char *)IObuff, IOSIZE, "%2d" , i + 1); |
2862 | if (cmdmsg_rl) { |
2863 | rl_mirror(IObuff); |
2864 | } |
2865 | msg_puts((const char *)IObuff); |
2866 | |
2867 | vim_snprintf((char *)IObuff, IOSIZE, " \"%s\"" , wcopy); |
2868 | msg_puts((const char *)IObuff); |
2869 | |
2870 | // The word may replace more than "su_badlen". |
2871 | if (sug.su_badlen < stp->st_orglen) { |
2872 | vim_snprintf((char *)IObuff, IOSIZE, _(" < \"%.*s\"" ), |
2873 | stp->st_orglen, sug.su_badptr); |
2874 | msg_puts((const char *)IObuff); |
2875 | } |
2876 | |
2877 | if (p_verbose > 0) { |
2878 | // Add the score. |
2879 | if (sps_flags & (SPS_DOUBLE | SPS_BEST)) |
2880 | vim_snprintf((char *)IObuff, IOSIZE, " (%s%d - %d)" , |
2881 | stp->st_salscore ? "s " : "" , |
2882 | stp->st_score, stp->st_altscore); |
2883 | else |
2884 | vim_snprintf((char *)IObuff, IOSIZE, " (%d)" , |
2885 | stp->st_score); |
2886 | if (cmdmsg_rl) |
2887 | // Mirror the numbers, but keep the leading space. |
2888 | rl_mirror(IObuff + 1); |
2889 | msg_advance(30); |
2890 | msg_puts((const char *)IObuff); |
2891 | } |
2892 | msg_putchar('\n'); |
2893 | } |
2894 | |
2895 | cmdmsg_rl = FALSE; |
2896 | msg_col = 0; |
2897 | // Ask for choice. |
2898 | selected = prompt_for_number(&mouse_used); |
2899 | if (mouse_used) |
2900 | selected -= lines_left; |
2901 | lines_left = Rows; // avoid more prompt |
2902 | // don't delay for 'smd' in normal_cmd() |
2903 | msg_scroll = msg_scroll_save; |
2904 | } |
2905 | |
2906 | if (selected > 0 && selected <= sug.su_ga.ga_len && u_save_cursor() == OK) { |
2907 | // Save the from and to text for :spellrepall. |
2908 | stp = &SUG(sug.su_ga, selected - 1); |
2909 | if (sug.su_badlen > stp->st_orglen) { |
2910 | // Replacing less than "su_badlen", append the remainder to |
2911 | // repl_to. |
2912 | repl_from = vim_strnsave(sug.su_badptr, sug.su_badlen); |
2913 | vim_snprintf((char *)IObuff, IOSIZE, "%s%.*s" , stp->st_word, |
2914 | sug.su_badlen - stp->st_orglen, |
2915 | sug.su_badptr + stp->st_orglen); |
2916 | repl_to = vim_strsave(IObuff); |
2917 | } else { |
2918 | // Replacing su_badlen or more, use the whole word. |
2919 | repl_from = vim_strnsave(sug.su_badptr, stp->st_orglen); |
2920 | repl_to = vim_strsave(stp->st_word); |
2921 | } |
2922 | |
2923 | // Replace the word. |
2924 | p = xmalloc(STRLEN(line) - stp->st_orglen + stp->st_wordlen + 1); |
2925 | c = (int)(sug.su_badptr - line); |
2926 | memmove(p, line, c); |
2927 | STRCPY(p + c, stp->st_word); |
2928 | STRCAT(p, sug.su_badptr + stp->st_orglen); |
2929 | ml_replace(curwin->w_cursor.lnum, p, false); |
2930 | curwin->w_cursor.col = c; |
2931 | |
2932 | // For redo we use a change-word command. |
2933 | ResetRedobuff(); |
2934 | AppendToRedobuff("ciw" ); |
2935 | AppendToRedobuffLit(p + c, |
2936 | stp->st_wordlen + sug.su_badlen - stp->st_orglen); |
2937 | AppendCharToRedobuff(ESC); |
2938 | |
2939 | // After this "p" may be invalid. |
2940 | changed_bytes(curwin->w_cursor.lnum, c); |
2941 | } else |
2942 | curwin->w_cursor = prev_cursor; |
2943 | |
2944 | spell_find_cleanup(&sug); |
2945 | xfree(line); |
2946 | } |
2947 | |
2948 | // Check if the word at line "lnum" column "col" is required to start with a |
2949 | // capital. This uses 'spellcapcheck' of the current buffer. |
2950 | static bool check_need_cap(linenr_T lnum, colnr_T col) |
2951 | { |
2952 | bool need_cap = false; |
2953 | char_u *line; |
2954 | char_u *line_copy = NULL; |
2955 | char_u *p; |
2956 | colnr_T endcol; |
2957 | regmatch_T regmatch; |
2958 | |
2959 | if (curwin->w_s->b_cap_prog == NULL) |
2960 | return false; |
2961 | |
2962 | line = get_cursor_line_ptr(); |
2963 | endcol = 0; |
2964 | if (getwhitecols(line) >= (int)col) { |
2965 | // At start of line, check if previous line is empty or sentence |
2966 | // ends there. |
2967 | if (lnum == 1) |
2968 | need_cap = true; |
2969 | else { |
2970 | line = ml_get(lnum - 1); |
2971 | if (*skipwhite(line) == NUL) |
2972 | need_cap = true; |
2973 | else { |
2974 | // Append a space in place of the line break. |
2975 | line_copy = concat_str(line, (char_u *)" " ); |
2976 | line = line_copy; |
2977 | endcol = (colnr_T)STRLEN(line); |
2978 | } |
2979 | } |
2980 | } else { |
2981 | endcol = col; |
2982 | } |
2983 | |
2984 | if (endcol > 0) { |
2985 | // Check if sentence ends before the bad word. |
2986 | regmatch.regprog = curwin->w_s->b_cap_prog; |
2987 | regmatch.rm_ic = FALSE; |
2988 | p = line + endcol; |
2989 | for (;; ) { |
2990 | MB_PTR_BACK(line, p); |
2991 | if (p == line || spell_iswordp_nmw(p, curwin)) { |
2992 | break; |
2993 | } |
2994 | if (vim_regexec(®match, p, 0) |
2995 | && regmatch.endp[0] == line + endcol) { |
2996 | need_cap = true; |
2997 | break; |
2998 | } |
2999 | } |
3000 | curwin->w_s->b_cap_prog = regmatch.regprog; |
3001 | } |
3002 | |
3003 | xfree(line_copy); |
3004 | |
3005 | return need_cap; |
3006 | } |
3007 | |
3008 | |
3009 | // ":spellrepall" |
3010 | void ex_spellrepall(exarg_T *eap) |
3011 | { |
3012 | pos_T pos = curwin->w_cursor; |
3013 | char_u *frompat; |
3014 | int addlen; |
3015 | char_u *line; |
3016 | char_u *p; |
3017 | bool save_ws = p_ws; |
3018 | linenr_T prev_lnum = 0; |
3019 | |
3020 | if (repl_from == NULL || repl_to == NULL) { |
3021 | EMSG(_("E752: No previous spell replacement" )); |
3022 | return; |
3023 | } |
3024 | addlen = (int)(STRLEN(repl_to) - STRLEN(repl_from)); |
3025 | |
3026 | frompat = xmalloc(STRLEN(repl_from) + 7); |
3027 | sprintf((char *)frompat, "\\V\\<%s\\>" , repl_from); |
3028 | p_ws = false; |
3029 | |
3030 | sub_nsubs = 0; |
3031 | sub_nlines = 0; |
3032 | curwin->w_cursor.lnum = 0; |
3033 | while (!got_int) { |
3034 | if (do_search(NULL, '/', frompat, 1L, SEARCH_KEEP, NULL, NULL) == 0 |
3035 | || u_save_cursor() == FAIL) { |
3036 | break; |
3037 | } |
3038 | |
3039 | // Only replace when the right word isn't there yet. This happens |
3040 | // when changing "etc" to "etc.". |
3041 | line = get_cursor_line_ptr(); |
3042 | if (addlen <= 0 || STRNCMP(line + curwin->w_cursor.col, |
3043 | repl_to, STRLEN(repl_to)) != 0) { |
3044 | p = xmalloc(STRLEN(line) + addlen + 1); |
3045 | memmove(p, line, curwin->w_cursor.col); |
3046 | STRCPY(p + curwin->w_cursor.col, repl_to); |
3047 | STRCAT(p, line + curwin->w_cursor.col + STRLEN(repl_from)); |
3048 | ml_replace(curwin->w_cursor.lnum, p, false); |
3049 | changed_bytes(curwin->w_cursor.lnum, curwin->w_cursor.col); |
3050 | |
3051 | if (curwin->w_cursor.lnum != prev_lnum) { |
3052 | ++sub_nlines; |
3053 | prev_lnum = curwin->w_cursor.lnum; |
3054 | } |
3055 | ++sub_nsubs; |
3056 | } |
3057 | curwin->w_cursor.col += (colnr_T)STRLEN(repl_to); |
3058 | } |
3059 | |
3060 | p_ws = save_ws; |
3061 | curwin->w_cursor = pos; |
3062 | xfree(frompat); |
3063 | |
3064 | if (sub_nsubs == 0) |
3065 | EMSG2(_("E753: Not found: %s" ), repl_from); |
3066 | else |
3067 | do_sub_msg(false); |
3068 | } |
3069 | |
3070 | // Find spell suggestions for "word". Return them in the growarray "*gap" as |
3071 | // a list of allocated strings. |
3072 | void |
3073 | spell_suggest_list ( |
3074 | garray_T *gap, |
3075 | char_u *word, |
3076 | int maxcount, // maximum nr of suggestions |
3077 | bool need_cap, // 'spellcapcheck' matched |
3078 | bool interactive |
3079 | ) |
3080 | { |
3081 | suginfo_T sug; |
3082 | suggest_T *stp; |
3083 | char_u *wcopy; |
3084 | |
3085 | spell_find_suggest(word, 0, &sug, maxcount, false, need_cap, interactive); |
3086 | |
3087 | // Make room in "gap". |
3088 | ga_init(gap, sizeof(char_u *), sug.su_ga.ga_len + 1); |
3089 | ga_grow(gap, sug.su_ga.ga_len); |
3090 | for (int i = 0; i < sug.su_ga.ga_len; ++i) { |
3091 | stp = &SUG(sug.su_ga, i); |
3092 | |
3093 | // The suggested word may replace only part of "word", add the not |
3094 | // replaced part. |
3095 | wcopy = xmalloc(stp->st_wordlen |
3096 | + STRLEN(sug.su_badptr + stp->st_orglen) + 1); |
3097 | STRCPY(wcopy, stp->st_word); |
3098 | STRCPY(wcopy + stp->st_wordlen, sug.su_badptr + stp->st_orglen); |
3099 | ((char_u **)gap->ga_data)[gap->ga_len++] = wcopy; |
3100 | } |
3101 | |
3102 | spell_find_cleanup(&sug); |
3103 | } |
3104 | |
3105 | // Find spell suggestions for the word at the start of "badptr". |
3106 | // Return the suggestions in "su->su_ga". |
3107 | // The maximum number of suggestions is "maxcount". |
3108 | // Note: does use info for the current window. |
3109 | // This is based on the mechanisms of Aspell, but completely reimplemented. |
3110 | static void |
3111 | spell_find_suggest ( |
3112 | char_u *badptr, |
3113 | int badlen, // length of bad word or 0 if unknown |
3114 | suginfo_T *su, |
3115 | int maxcount, |
3116 | bool banbadword, // don't include badword in suggestions |
3117 | bool need_cap, // word should start with capital |
3118 | bool interactive |
3119 | ) |
3120 | { |
3121 | hlf_T attr = HLF_COUNT; |
3122 | char_u buf[MAXPATHL]; |
3123 | char_u *p; |
3124 | bool do_combine = false; |
3125 | char_u *sps_copy; |
3126 | static bool expr_busy = false; |
3127 | int c; |
3128 | langp_T *lp; |
3129 | |
3130 | // Set the info in "*su". |
3131 | memset(su, 0, sizeof(suginfo_T)); |
3132 | ga_init(&su->su_ga, (int)sizeof(suggest_T), 10); |
3133 | ga_init(&su->su_sga, (int)sizeof(suggest_T), 10); |
3134 | if (*badptr == NUL) |
3135 | return; |
3136 | hash_init(&su->su_banned); |
3137 | |
3138 | su->su_badptr = badptr; |
3139 | if (badlen != 0) |
3140 | su->su_badlen = badlen; |
3141 | else { |
3142 | size_t tmplen = spell_check(curwin, su->su_badptr, &attr, NULL, false); |
3143 | assert(tmplen <= INT_MAX); |
3144 | su->su_badlen = (int)tmplen; |
3145 | } |
3146 | su->su_maxcount = maxcount; |
3147 | su->su_maxscore = SCORE_MAXINIT; |
3148 | |
3149 | if (su->su_badlen >= MAXWLEN) |
3150 | su->su_badlen = MAXWLEN - 1; // just in case |
3151 | STRLCPY(su->su_badword, su->su_badptr, su->su_badlen + 1); |
3152 | (void)spell_casefold(su->su_badptr, su->su_badlen, su->su_fbadword, MAXWLEN); |
3153 | |
3154 | // TODO(vim): make this work if the case-folded text is longer than the |
3155 | // original text. Currently an illegal byte causes wrong pointer |
3156 | // computations. |
3157 | su->su_fbadword[su->su_badlen] = NUL; |
3158 | |
3159 | // get caps flags for bad word |
3160 | su->su_badflags = badword_captype(su->su_badptr, |
3161 | su->su_badptr + su->su_badlen); |
3162 | if (need_cap) |
3163 | su->su_badflags |= WF_ONECAP; |
3164 | |
3165 | // Find the default language for sound folding. We simply use the first |
3166 | // one in 'spelllang' that supports sound folding. That's good for when |
3167 | // using multiple files for one language, it's not that bad when mixing |
3168 | // languages (e.g., "pl,en"). |
3169 | for (int i = 0; i < curbuf->b_s.b_langp.ga_len; ++i) { |
3170 | lp = LANGP_ENTRY(curbuf->b_s.b_langp, i); |
3171 | if (lp->lp_sallang != NULL) { |
3172 | su->su_sallang = lp->lp_sallang; |
3173 | break; |
3174 | } |
3175 | } |
3176 | |
3177 | // Soundfold the bad word with the default sound folding, so that we don't |
3178 | // have to do this many times. |
3179 | if (su->su_sallang != NULL) |
3180 | spell_soundfold(su->su_sallang, su->su_fbadword, true, |
3181 | su->su_sal_badword); |
3182 | |
3183 | // If the word is not capitalised and spell_check() doesn't consider the |
3184 | // word to be bad then it might need to be capitalised. Add a suggestion |
3185 | // for that. |
3186 | c = PTR2CHAR(su->su_badptr); |
3187 | if (!SPELL_ISUPPER(c) && attr == HLF_COUNT) { |
3188 | make_case_word(su->su_badword, buf, WF_ONECAP); |
3189 | add_suggestion(su, &su->su_ga, buf, su->su_badlen, SCORE_ICASE, |
3190 | 0, true, su->su_sallang, false); |
3191 | } |
3192 | |
3193 | // Ban the bad word itself. It may appear in another region. |
3194 | if (banbadword) |
3195 | add_banned(su, su->su_badword); |
3196 | |
3197 | // Make a copy of 'spellsuggest', because the expression may change it. |
3198 | sps_copy = vim_strsave(p_sps); |
3199 | |
3200 | // Loop over the items in 'spellsuggest'. |
3201 | for (p = sps_copy; *p != NUL; ) { |
3202 | copy_option_part(&p, buf, MAXPATHL, "," ); |
3203 | |
3204 | if (STRNCMP(buf, "expr:" , 5) == 0) { |
3205 | // Evaluate an expression. Skip this when called recursively, |
3206 | // when using spellsuggest() in the expression. |
3207 | if (!expr_busy) { |
3208 | expr_busy = true; |
3209 | spell_suggest_expr(su, buf + 5); |
3210 | expr_busy = false; |
3211 | } |
3212 | } else if (STRNCMP(buf, "file:" , 5) == 0) |
3213 | // Use list of suggestions in a file. |
3214 | spell_suggest_file(su, buf + 5); |
3215 | else { |
3216 | // Use internal method. |
3217 | spell_suggest_intern(su, interactive); |
3218 | if (sps_flags & SPS_DOUBLE) |
3219 | do_combine = true; |
3220 | } |
3221 | } |
3222 | |
3223 | xfree(sps_copy); |
3224 | |
3225 | if (do_combine) |
3226 | // Combine the two list of suggestions. This must be done last, |
3227 | // because sorting changes the order again. |
3228 | score_combine(su); |
3229 | } |
3230 | |
3231 | // Find suggestions by evaluating expression "expr". |
3232 | static void spell_suggest_expr(suginfo_T *su, char_u *expr) |
3233 | { |
3234 | int score; |
3235 | const char *p; |
3236 | |
3237 | // The work is split up in a few parts to avoid having to export |
3238 | // suginfo_T. |
3239 | // First evaluate the expression and get the resulting list. |
3240 | list_T *const list = eval_spell_expr(su->su_badword, expr); |
3241 | if (list != NULL) { |
3242 | // Loop over the items in the list. |
3243 | TV_LIST_ITER(list, li, { |
3244 | if (TV_LIST_ITEM_TV(li)->v_type == VAR_LIST) { |
3245 | // Get the word and the score from the items. |
3246 | score = get_spellword(TV_LIST_ITEM_TV(li)->vval.v_list, &p); |
3247 | if (score >= 0 && score <= su->su_maxscore) { |
3248 | add_suggestion(su, &su->su_ga, (const char_u *)p, su->su_badlen, |
3249 | score, 0, true, su->su_sallang, false); |
3250 | } |
3251 | } |
3252 | }); |
3253 | tv_list_unref(list); |
3254 | } |
3255 | |
3256 | // Remove bogus suggestions, sort and truncate at "maxcount". |
3257 | check_suggestions(su, &su->su_ga); |
3258 | (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
3259 | } |
3260 | |
3261 | // Find suggestions in file "fname". Used for "file:" in 'spellsuggest'. |
3262 | static void spell_suggest_file(suginfo_T *su, char_u *fname) |
3263 | { |
3264 | FILE *fd; |
3265 | char_u line[MAXWLEN * 2]; |
3266 | char_u *p; |
3267 | int len; |
3268 | char_u cword[MAXWLEN]; |
3269 | |
3270 | // Open the file. |
3271 | fd = os_fopen((char *)fname, "r" ); |
3272 | if (fd == NULL) { |
3273 | EMSG2(_(e_notopen), fname); |
3274 | return; |
3275 | } |
3276 | |
3277 | // Read it line by line. |
3278 | while (!vim_fgets(line, MAXWLEN * 2, fd) && !got_int) { |
3279 | line_breakcheck(); |
3280 | |
3281 | p = vim_strchr(line, '/'); |
3282 | if (p == NULL) |
3283 | continue; // No Tab found, just skip the line. |
3284 | *p++ = NUL; |
3285 | if (STRICMP(su->su_badword, line) == 0) { |
3286 | // Match! Isolate the good word, until CR or NL. |
3287 | for (len = 0; p[len] >= ' '; ++len) |
3288 | ; |
3289 | p[len] = NUL; |
3290 | |
3291 | // If the suggestion doesn't have specific case duplicate the case |
3292 | // of the bad word. |
3293 | if (captype(p, NULL) == 0) { |
3294 | make_case_word(p, cword, su->su_badflags); |
3295 | p = cword; |
3296 | } |
3297 | |
3298 | add_suggestion(su, &su->su_ga, p, su->su_badlen, |
3299 | SCORE_FILE, 0, true, su->su_sallang, false); |
3300 | } |
3301 | } |
3302 | |
3303 | fclose(fd); |
3304 | |
3305 | // Remove bogus suggestions, sort and truncate at "maxcount". |
3306 | check_suggestions(su, &su->su_ga); |
3307 | (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
3308 | } |
3309 | |
3310 | // Find suggestions for the internal method indicated by "sps_flags". |
3311 | static void spell_suggest_intern(suginfo_T *su, bool interactive) |
3312 | { |
3313 | // Load the .sug file(s) that are available and not done yet. |
3314 | suggest_load_files(); |
3315 | |
3316 | // 1. Try special cases, such as repeating a word: "the the" -> "the". |
3317 | // |
3318 | // Set a maximum score to limit the combination of operations that is |
3319 | // tried. |
3320 | suggest_try_special(su); |
3321 | |
3322 | // 2. Try inserting/deleting/swapping/changing a letter, use REP entries |
3323 | // from the .aff file and inserting a space (split the word). |
3324 | suggest_try_change(su); |
3325 | |
3326 | // For the resulting top-scorers compute the sound-a-like score. |
3327 | if (sps_flags & SPS_DOUBLE) |
3328 | score_comp_sal(su); |
3329 | |
3330 | // 3. Try finding sound-a-like words. |
3331 | if ((sps_flags & SPS_FAST) == 0) { |
3332 | if (sps_flags & SPS_BEST) |
3333 | // Adjust the word score for the suggestions found so far for how |
3334 | // they sounds like. |
3335 | rescore_suggestions(su); |
3336 | |
3337 | // While going through the soundfold tree "su_maxscore" is the score |
3338 | // for the soundfold word, limits the changes that are being tried, |
3339 | // and "su_sfmaxscore" the rescored score, which is set by |
3340 | // cleanup_suggestions(). |
3341 | // First find words with a small edit distance, because this is much |
3342 | // faster and often already finds the top-N suggestions. If we didn't |
3343 | // find many suggestions try again with a higher edit distance. |
3344 | // "sl_sounddone" is used to avoid doing the same word twice. |
3345 | suggest_try_soundalike_prep(); |
3346 | su->su_maxscore = SCORE_SFMAX1; |
3347 | su->su_sfmaxscore = SCORE_MAXINIT * 3; |
3348 | suggest_try_soundalike(su); |
3349 | if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) { |
3350 | // We didn't find enough matches, try again, allowing more |
3351 | // changes to the soundfold word. |
3352 | su->su_maxscore = SCORE_SFMAX2; |
3353 | suggest_try_soundalike(su); |
3354 | if (su->su_ga.ga_len < SUG_CLEAN_COUNT(su)) { |
3355 | // Still didn't find enough matches, try again, allowing even |
3356 | // more changes to the soundfold word. |
3357 | su->su_maxscore = SCORE_SFMAX3; |
3358 | suggest_try_soundalike(su); |
3359 | } |
3360 | } |
3361 | su->su_maxscore = su->su_sfmaxscore; |
3362 | suggest_try_soundalike_finish(); |
3363 | } |
3364 | |
3365 | // When CTRL-C was hit while searching do show the results. Only clear |
3366 | // got_int when using a command, not for spellsuggest(). |
3367 | os_breakcheck(); |
3368 | if (interactive && got_int) { |
3369 | (void)vgetc(); |
3370 | got_int = FALSE; |
3371 | } |
3372 | |
3373 | if ((sps_flags & SPS_DOUBLE) == 0 && su->su_ga.ga_len != 0) { |
3374 | if (sps_flags & SPS_BEST) |
3375 | // Adjust the word score for how it sounds like. |
3376 | rescore_suggestions(su); |
3377 | |
3378 | // Remove bogus suggestions, sort and truncate at "maxcount". |
3379 | check_suggestions(su, &su->su_ga); |
3380 | (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
3381 | } |
3382 | } |
3383 | |
3384 | // Free the info put in "*su" by spell_find_suggest(). |
3385 | static void spell_find_cleanup(suginfo_T *su) |
3386 | { |
3387 | # define FREE_SUG_WORD(sug) xfree(sug->st_word) |
3388 | // Free the suggestions. |
3389 | GA_DEEP_CLEAR(&su->su_ga, suggest_T, FREE_SUG_WORD); |
3390 | GA_DEEP_CLEAR(&su->su_sga, suggest_T, FREE_SUG_WORD); |
3391 | |
3392 | // Free the banned words. |
3393 | hash_clear_all(&su->su_banned, 0); |
3394 | } |
3395 | |
3396 | /// Make a copy of "word", with the first letter upper or lower cased, to |
3397 | /// "wcopy[MAXWLEN]". "word" must not be empty. |
3398 | /// The result is NUL terminated. |
3399 | /// |
3400 | /// @param[in] word source string to copy |
3401 | /// @param[in,out] wcopy copied string, with case of first letter changed |
3402 | /// @param[in] upper True to upper case, otherwise lower case |
3403 | void onecap_copy(char_u *word, char_u *wcopy, bool upper) |
3404 | { |
3405 | char_u *p; |
3406 | int c; |
3407 | int l; |
3408 | |
3409 | p = word; |
3410 | if (has_mbyte) { |
3411 | c = mb_cptr2char_adv((const char_u **)&p); |
3412 | } else { |
3413 | c = *p++; |
3414 | } |
3415 | if (upper) { |
3416 | c = SPELL_TOUPPER(c); |
3417 | } else { |
3418 | c = SPELL_TOFOLD(c); |
3419 | } |
3420 | l = utf_char2bytes(c, wcopy); |
3421 | STRLCPY(wcopy + l, p, MAXWLEN - l); |
3422 | } |
3423 | |
3424 | // Make a copy of "word" with all the letters upper cased into |
3425 | // "wcopy[MAXWLEN]". The result is NUL terminated. |
3426 | static void allcap_copy(char_u *word, char_u *wcopy) |
3427 | { |
3428 | char_u *s; |
3429 | char_u *d; |
3430 | int c; |
3431 | |
3432 | d = wcopy; |
3433 | for (s = word; *s != NUL; ) { |
3434 | if (has_mbyte) { |
3435 | c = mb_cptr2char_adv((const char_u **)&s); |
3436 | } else { |
3437 | c = *s++; |
3438 | } |
3439 | |
3440 | if (c == 0xdf) { |
3441 | c = 'S'; |
3442 | if (d - wcopy >= MAXWLEN - 1) |
3443 | break; |
3444 | *d++ = c; |
3445 | } else |
3446 | c = SPELL_TOUPPER(c); |
3447 | |
3448 | if (d - wcopy >= MAXWLEN - MB_MAXBYTES) { |
3449 | break; |
3450 | } |
3451 | d += utf_char2bytes(c, d); |
3452 | } |
3453 | *d = NUL; |
3454 | } |
3455 | |
3456 | // Try finding suggestions by recognizing specific situations. |
3457 | static void suggest_try_special(suginfo_T *su) |
3458 | { |
3459 | char_u *p; |
3460 | size_t len; |
3461 | int c; |
3462 | char_u word[MAXWLEN]; |
3463 | |
3464 | // Recognize a word that is repeated: "the the". |
3465 | p = skiptowhite(su->su_fbadword); |
3466 | len = p - su->su_fbadword; |
3467 | p = skipwhite(p); |
3468 | if (STRLEN(p) == len && STRNCMP(su->su_fbadword, p, len) == 0) { |
3469 | // Include badflags: if the badword is onecap or allcap |
3470 | // use that for the goodword too: "The the" -> "The". |
3471 | c = su->su_fbadword[len]; |
3472 | su->su_fbadword[len] = NUL; |
3473 | make_case_word(su->su_fbadword, word, su->su_badflags); |
3474 | su->su_fbadword[len] = c; |
3475 | |
3476 | // Give a soundalike score of 0, compute the score as if deleting one |
3477 | // character. |
3478 | add_suggestion(su, &su->su_ga, word, su->su_badlen, |
3479 | RESCORE(SCORE_REP, 0), 0, true, su->su_sallang, false); |
3480 | } |
3481 | } |
3482 | |
// Measure how much time is spent in each state.
// Output is dumped in "suggestprof".
// Only compiled when SUGGEST_PROFILE is defined, otherwise PROF_STORE()
// expands to nothing.

#ifdef SUGGEST_PROFILE
proftime_T current;
proftime_T total;
proftime_T times[STATE_FINAL + 1];
long counts[STATE_FINAL + 1];

// Reset the per-state times and counts and start both timers.
static void prof_init(void)
{
  for (int i = 0; i <= STATE_FINAL; i++) {
    profile_zero(&times[i]);
    counts[i] = 0;
  }
  profile_start(&current);
  profile_start(&total);
}

// Account the time since the previous call to "state".
// Call before changing state.
static void prof_store(state_T state)
{
  profile_end(&current);
  profile_add(&times[state], &current);
  counts[state]++;
  profile_start(&current);
}
# define PROF_STORE(state) prof_store(state);

// Append the total time, labelled "name", and the time and count of every
// state to the file "suggestprof".
static void prof_report(char *name)
{
  FILE *fd = fopen("suggestprof", "a");

  profile_end(&total);
  if (fd == NULL) {
    return;  // cannot write the report, nothing else to do
  }
  fprintf(fd, "-----------------------\n");
  fprintf(fd, "%s: %s\n", name, profile_msg(&total));
  for (int i = 0; i <= STATE_FINAL; i++) {
    fprintf(fd, "%d: %s (%" PRId64 ")\n", i, profile_msg(&times[i]),
            (int64_t)counts[i]);
  }
  fclose(fd);
}
#else
# define PROF_STORE(state)
#endif
3530 | |
3531 | // Try finding suggestions by adding/removing/swapping letters. |
3532 | |
3533 | static void suggest_try_change(suginfo_T *su) |
3534 | { |
3535 | char_u fword[MAXWLEN]; // copy of the bad word, case-folded |
3536 | int n; |
3537 | char_u *p; |
3538 | langp_T *lp; |
3539 | |
3540 | // We make a copy of the case-folded bad word, so that we can modify it |
3541 | // to find matches (esp. REP items). Append some more text, changing |
3542 | // chars after the bad word may help. |
3543 | STRCPY(fword, su->su_fbadword); |
3544 | n = (int)STRLEN(fword); |
3545 | p = su->su_badptr + su->su_badlen; |
3546 | (void)spell_casefold(p, (int)STRLEN(p), fword + n, MAXWLEN - n); |
3547 | |
3548 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
3549 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
3550 | |
3551 | // If reloading a spell file fails it's still in the list but |
3552 | // everything has been cleared. |
3553 | if (lp->lp_slang->sl_fbyts == NULL) |
3554 | continue; |
3555 | |
3556 | // Try it for this language. Will add possible suggestions. |
3557 | // |
3558 | #ifdef SUGGEST_PROFILE |
3559 | prof_init(); |
3560 | #endif |
3561 | suggest_trie_walk(su, lp, fword, false); |
3562 | #ifdef SUGGEST_PROFILE |
3563 | prof_report("try_change" ); |
3564 | #endif |
3565 | } |
3566 | } |
3567 | |
// Check the maximum score, if we go over it we won't try this change.
// Also refuse to go deeper when "depth" is at the last entry of a stack
// with MAXWLEN entries, so that "depth + 1" stays a valid index.
#define TRY_DEEPER(su, stack, depth, add) \
  ((depth) < MAXWLEN - 1 \
   && (stack)[depth].ts_score + (add) < (su)->su_maxscore)
3571 | |
3572 | // Try finding suggestions by adding/removing/swapping letters. |
3573 | // |
3574 | // This uses a state machine. At each node in the tree we try various |
3575 | // operations. When trying if an operation works "depth" is increased and the |
3576 | // stack[] is used to store info. This allows combinations, thus insert one |
3577 | // character, replace one and delete another. The number of changes is |
3578 | // limited by su->su_maxscore. |
3579 | // |
3580 | // After implementing this I noticed an article by Kemal Oflazer that |
3581 | // describes something similar: "Error-tolerant Finite State Recognition with |
3582 | // Applications to Morphological Analysis and Spelling Correction" (1996). |
3583 | // The implementation in the article is simplified and requires a stack of |
3584 | // unknown depth. The implementation here only needs a stack depth equal to |
3585 | // the length of the word. |
3586 | // |
3587 | // This is also used for the sound-folded word, "soundfold" is true then. |
3588 | // The mechanism is the same, but we find a match with a sound-folded word |
3589 | // that comes from one or more original words. Each of these words may be |
3590 | // added, this is done by add_sound_suggest(). |
3591 | // Don't use: |
3592 | // the prefix tree or the keep-case tree |
3593 | // "su->su_badlen" |
3594 | // anything to do with upper and lower case |
3595 | // anything to do with word or non-word characters ("spell_iswordp()") |
3596 | // banned words |
3597 | // word flags (rare, region, compounding) |
3598 | // word splitting for now |
3599 | // "similar_chars()" |
3600 | // use "slang->sl_repsal" instead of "lp->lp_replang->sl_rep" |
3601 | static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char_u *fword, bool soundfold) |
3602 | { |
3603 | char_u tword[MAXWLEN]; // good word collected so far |
3604 | trystate_T stack[MAXWLEN]; |
3605 | char_u preword[MAXWLEN * 3]; // word found with proper case; |
3606 | // concatenation of prefix compound |
3607 | // words and split word. NUL terminated |
3608 | // when going deeper but not when coming |
3609 | // back. |
3610 | char_u compflags[MAXWLEN]; // compound flags, one for each word |
3611 | trystate_T *sp; |
3612 | int newscore; |
3613 | int score; |
3614 | char_u *byts, *fbyts, *pbyts; |
3615 | idx_T *idxs, *fidxs, *pidxs; |
3616 | int depth; |
3617 | int c, c2, c3; |
3618 | int n = 0; |
3619 | int flags; |
3620 | garray_T *gap; |
3621 | idx_T arridx; |
3622 | int len; |
3623 | char_u *p; |
3624 | fromto_T *ftp; |
3625 | int fl = 0, tl; |
3626 | int = 0; // extra bytes in fword[] from REP item |
3627 | slang_T *slang = lp->lp_slang; |
3628 | int fword_ends; |
3629 | bool goodword_ends; |
3630 | #ifdef DEBUG_TRIEWALK |
3631 | // Stores the name of the change made at each level. |
3632 | char_u changename[MAXWLEN][80]; |
3633 | #endif |
3634 | int breakcheckcount = 1000; |
3635 | bool compound_ok; |
3636 | |
3637 | // Go through the whole case-fold tree, try changes at each node. |
3638 | // "tword[]" contains the word collected from nodes in the tree. |
3639 | // "fword[]" the word we are trying to match with (initially the bad |
3640 | // word). |
3641 | depth = 0; |
3642 | sp = &stack[0]; |
3643 | memset(sp, 0, sizeof(trystate_T)); // -V512 |
3644 | sp->ts_curi = 1; |
3645 | |
3646 | if (soundfold) { |
3647 | // Going through the soundfold tree. |
3648 | byts = fbyts = slang->sl_sbyts; |
3649 | idxs = fidxs = slang->sl_sidxs; |
3650 | pbyts = NULL; |
3651 | pidxs = NULL; |
3652 | sp->ts_prefixdepth = PFD_NOPREFIX; |
3653 | sp->ts_state = STATE_START; |
3654 | } else { |
3655 | // When there are postponed prefixes we need to use these first. At |
3656 | // the end of the prefix we continue in the case-fold tree. |
3657 | fbyts = slang->sl_fbyts; |
3658 | fidxs = slang->sl_fidxs; |
3659 | pbyts = slang->sl_pbyts; |
3660 | pidxs = slang->sl_pidxs; |
3661 | if (pbyts != NULL) { |
3662 | byts = pbyts; |
3663 | idxs = pidxs; |
3664 | sp->ts_prefixdepth = PFD_PREFIXTREE; |
3665 | sp->ts_state = STATE_NOPREFIX; // try without prefix first |
3666 | } else { |
3667 | byts = fbyts; |
3668 | idxs = fidxs; |
3669 | sp->ts_prefixdepth = PFD_NOPREFIX; |
3670 | sp->ts_state = STATE_START; |
3671 | } |
3672 | } |
3673 | |
3674 | // Loop to find all suggestions. At each round we either: |
3675 | // - For the current state try one operation, advance "ts_curi", |
3676 | // increase "depth". |
3677 | // - When a state is done go to the next, set "ts_state". |
3678 | // - When all states are tried decrease "depth". |
3679 | while (depth >= 0 && !got_int) { |
3680 | sp = &stack[depth]; |
3681 | switch (sp->ts_state) { |
3682 | case STATE_START: |
3683 | case STATE_NOPREFIX: |
3684 | // Start of node: Deal with NUL bytes, which means |
3685 | // tword[] may end here. |
3686 | arridx = sp->ts_arridx; // current node in the tree |
3687 | len = byts[arridx]; // bytes in this node |
3688 | arridx += sp->ts_curi; // index of current byte |
3689 | |
3690 | if (sp->ts_prefixdepth == PFD_PREFIXTREE) { |
3691 | // Skip over the NUL bytes, we use them later. |
3692 | for (n = 0; n < len && byts[arridx + n] == 0; ++n) |
3693 | ; |
3694 | sp->ts_curi += n; |
3695 | |
3696 | // Always past NUL bytes now. |
3697 | n = (int)sp->ts_state; |
3698 | PROF_STORE(sp->ts_state) |
3699 | sp->ts_state = STATE_ENDNUL; |
3700 | sp->ts_save_badflags = su->su_badflags; |
3701 | |
3702 | // At end of a prefix or at start of prefixtree: check for |
3703 | // following word. |
3704 | if (byts[arridx] == 0 || n == (int)STATE_NOPREFIX) { |
3705 | // Set su->su_badflags to the caps type at this position. |
3706 | // Use the caps type until here for the prefix itself. |
3707 | if (has_mbyte) |
3708 | n = nofold_len(fword, sp->ts_fidx, su->su_badptr); |
3709 | else |
3710 | n = sp->ts_fidx; |
3711 | flags = badword_captype(su->su_badptr, su->su_badptr + n); |
3712 | su->su_badflags = badword_captype(su->su_badptr + n, |
3713 | su->su_badptr + su->su_badlen); |
3714 | #ifdef DEBUG_TRIEWALK |
3715 | sprintf(changename[depth], "prefix" ); |
3716 | #endif |
3717 | go_deeper(stack, depth, 0); |
3718 | ++depth; |
3719 | sp = &stack[depth]; |
3720 | sp->ts_prefixdepth = depth - 1; |
3721 | byts = fbyts; |
3722 | idxs = fidxs; |
3723 | sp->ts_arridx = 0; |
3724 | |
3725 | // Move the prefix to preword[] with the right case |
3726 | // and make find_keepcap_word() work. |
3727 | tword[sp->ts_twordlen] = NUL; |
3728 | make_case_word(tword + sp->ts_splitoff, |
3729 | preword + sp->ts_prewordlen, flags); |
3730 | sp->ts_prewordlen = (char_u)STRLEN(preword); |
3731 | sp->ts_splitoff = sp->ts_twordlen; |
3732 | } |
3733 | break; |
3734 | } |
3735 | |
3736 | if (sp->ts_curi > len || byts[arridx] != 0) { |
3737 | // Past bytes in node and/or past NUL bytes. |
3738 | PROF_STORE(sp->ts_state) |
3739 | sp->ts_state = STATE_ENDNUL; |
3740 | sp->ts_save_badflags = su->su_badflags; |
3741 | break; |
3742 | } |
3743 | |
3744 | // End of word in tree. |
3745 | ++sp->ts_curi; // eat one NUL byte |
3746 | |
3747 | flags = (int)idxs[arridx]; |
3748 | |
3749 | // Skip words with the NOSUGGEST flag. |
3750 | if (flags & WF_NOSUGGEST) |
3751 | break; |
3752 | |
3753 | fword_ends = (fword[sp->ts_fidx] == NUL |
3754 | || (soundfold |
3755 | ? ascii_iswhite(fword[sp->ts_fidx]) |
3756 | : !spell_iswordp(fword + sp->ts_fidx, curwin))); |
3757 | tword[sp->ts_twordlen] = NUL; |
3758 | |
3759 | if (sp->ts_prefixdepth <= PFD_NOTSPECIAL |
3760 | && (sp->ts_flags & TSF_PREFIXOK) == 0) { |
3761 | // There was a prefix before the word. Check that the prefix |
3762 | // can be used with this word. |
3763 | // Count the length of the NULs in the prefix. If there are |
3764 | // none this must be the first try without a prefix. |
3765 | n = stack[sp->ts_prefixdepth].ts_arridx; |
3766 | len = pbyts[n++]; |
3767 | for (c = 0; c < len && pbyts[n + c] == 0; ++c) |
3768 | ; |
3769 | if (c > 0) { |
3770 | c = valid_word_prefix(c, n, flags, |
3771 | tword + sp->ts_splitoff, slang, false); |
3772 | if (c == 0) |
3773 | break; |
3774 | |
3775 | // Use the WF_RARE flag for a rare prefix. |
3776 | if (c & WF_RAREPFX) |
3777 | flags |= WF_RARE; |
3778 | |
3779 | // Tricky: when checking for both prefix and compounding |
3780 | // we run into the prefix flag first. |
3781 | // Remember that it's OK, so that we accept the prefix |
3782 | // when arriving at a compound flag. |
3783 | sp->ts_flags |= TSF_PREFIXOK; |
3784 | } |
3785 | } |
3786 | |
3787 | // Check NEEDCOMPOUND: can't use word without compounding. Do try |
3788 | // appending another compound word below. |
3789 | if (sp->ts_complen == sp->ts_compsplit && fword_ends |
3790 | && (flags & WF_NEEDCOMP)) |
3791 | goodword_ends = false; |
3792 | else |
3793 | goodword_ends = true; |
3794 | |
3795 | p = NULL; |
3796 | compound_ok = true; |
3797 | if (sp->ts_complen > sp->ts_compsplit) { |
3798 | if (slang->sl_nobreak) { |
3799 | // There was a word before this word. When there was no |
3800 | // change in this word (it was correct) add the first word |
3801 | // as a suggestion. If this word was corrected too, we |
3802 | // need to check if a correct word follows. |
3803 | if (sp->ts_fidx - sp->ts_splitfidx |
3804 | == sp->ts_twordlen - sp->ts_splitoff |
3805 | && STRNCMP(fword + sp->ts_splitfidx, |
3806 | tword + sp->ts_splitoff, |
3807 | sp->ts_fidx - sp->ts_splitfidx) == 0) { |
3808 | preword[sp->ts_prewordlen] = NUL; |
3809 | newscore = score_wordcount_adj(slang, sp->ts_score, |
3810 | preword + sp->ts_prewordlen, |
3811 | sp->ts_prewordlen > 0); |
3812 | // Add the suggestion if the score isn't too bad. |
3813 | if (newscore <= su->su_maxscore) |
3814 | add_suggestion(su, &su->su_ga, preword, |
3815 | sp->ts_splitfidx - repextra, |
3816 | newscore, 0, false, |
3817 | lp->lp_sallang, false); |
3818 | break; |
3819 | } |
3820 | } else { |
3821 | // There was a compound word before this word. If this |
3822 | // word does not support compounding then give up |
3823 | // (splitting is tried for the word without compound |
3824 | // flag). |
3825 | if (((unsigned)flags >> 24) == 0 |
3826 | || sp->ts_twordlen - sp->ts_splitoff |
3827 | < slang->sl_compminlen) |
3828 | break; |
3829 | // For multi-byte chars check character length against |
3830 | // COMPOUNDMIN. |
3831 | if (has_mbyte |
3832 | && slang->sl_compminlen > 0 |
3833 | && mb_charlen(tword + sp->ts_splitoff) |
3834 | < slang->sl_compminlen) |
3835 | break; |
3836 | |
3837 | compflags[sp->ts_complen] = ((unsigned)flags >> 24); |
3838 | compflags[sp->ts_complen + 1] = NUL; |
3839 | STRLCPY(preword + sp->ts_prewordlen, |
3840 | tword + sp->ts_splitoff, |
3841 | sp->ts_twordlen - sp->ts_splitoff + 1); |
3842 | |
3843 | // Verify CHECKCOMPOUNDPATTERN rules. |
3844 | if (match_checkcompoundpattern(preword, sp->ts_prewordlen, |
3845 | &slang->sl_comppat)) |
3846 | compound_ok = false; |
3847 | |
3848 | if (compound_ok) { |
3849 | p = preword; |
3850 | while (*skiptowhite(p) != NUL) |
3851 | p = skipwhite(skiptowhite(p)); |
3852 | if (fword_ends && !can_compound(slang, p, |
3853 | compflags + sp->ts_compsplit)) |
3854 | // Compound is not allowed. But it may still be |
3855 | // possible if we add another (short) word. |
3856 | compound_ok = false; |
3857 | } |
3858 | |
3859 | // Get pointer to last char of previous word. |
3860 | p = preword + sp->ts_prewordlen; |
3861 | MB_PTR_BACK(preword, p); |
3862 | } |
3863 | } |
3864 | |
3865 | // Form the word with proper case in preword. |
3866 | // If there is a word from a previous split, append. |
3867 | // For the soundfold tree don't change the case, simply append. |
3868 | if (soundfold) |
3869 | STRCPY(preword + sp->ts_prewordlen, tword + sp->ts_splitoff); |
3870 | else if (flags & WF_KEEPCAP) |
3871 | // Must find the word in the keep-case tree. |
3872 | find_keepcap_word(slang, tword + sp->ts_splitoff, |
3873 | preword + sp->ts_prewordlen); |
3874 | else { |
3875 | // Include badflags: If the badword is onecap or allcap |
3876 | // use that for the goodword too. But if the badword is |
3877 | // allcap and it's only one char long use onecap. |
3878 | c = su->su_badflags; |
3879 | if ((c & WF_ALLCAP) |
3880 | && su->su_badlen == (*mb_ptr2len)(su->su_badptr) |
3881 | ) |
3882 | c = WF_ONECAP; |
3883 | c |= flags; |
3884 | |
3885 | // When appending a compound word after a word character don't |
3886 | // use Onecap. |
3887 | if (p != NULL && spell_iswordp_nmw(p, curwin)) |
3888 | c &= ~WF_ONECAP; |
3889 | make_case_word(tword + sp->ts_splitoff, |
3890 | preword + sp->ts_prewordlen, c); |
3891 | } |
3892 | |
3893 | if (!soundfold) { |
3894 | // Don't use a banned word. It may appear again as a good |
3895 | // word, thus remember it. |
3896 | if (flags & WF_BANNED) { |
3897 | add_banned(su, preword + sp->ts_prewordlen); |
3898 | break; |
3899 | } |
3900 | if ((sp->ts_complen == sp->ts_compsplit |
3901 | && WAS_BANNED(su, preword + sp->ts_prewordlen)) |
3902 | || WAS_BANNED(su, preword)) { |
3903 | if (slang->sl_compprog == NULL) |
3904 | break; |
3905 | // the word so far was banned but we may try compounding |
3906 | goodword_ends = false; |
3907 | } |
3908 | } |
3909 | |
3910 | newscore = 0; |
3911 | if (!soundfold) { // soundfold words don't have flags |
3912 | if ((flags & WF_REGION) |
3913 | && (((unsigned)flags >> 16) & lp->lp_region) == 0) |
3914 | newscore += SCORE_REGION; |
3915 | if (flags & WF_RARE) |
3916 | newscore += SCORE_RARE; |
3917 | |
3918 | if (!spell_valid_case(su->su_badflags, |
3919 | captype(preword + sp->ts_prewordlen, NULL))) |
3920 | newscore += SCORE_ICASE; |
3921 | } |
3922 | |
3923 | // TODO: how about splitting in the soundfold tree? |
3924 | if (fword_ends |
3925 | && goodword_ends |
3926 | && sp->ts_fidx >= sp->ts_fidxtry |
3927 | && compound_ok) { |
3928 | // The badword also ends: add suggestions. |
3929 | #ifdef DEBUG_TRIEWALK |
3930 | if (soundfold && STRCMP(preword, "smwrd" ) == 0) { |
3931 | int j; |
3932 | |
3933 | // print the stack of changes that brought us here |
3934 | smsg("------ %s -------" , fword); |
3935 | for (j = 0; j < depth; ++j) |
3936 | smsg("%s" , changename[j]); |
3937 | } |
3938 | #endif |
3939 | if (soundfold) { |
3940 | // For soundfolded words we need to find the original |
3941 | // words, the edit distance and then add them. |
3942 | add_sound_suggest(su, preword, sp->ts_score, lp); |
3943 | } else if (sp->ts_fidx > 0) { |
3944 | // Give a penalty when changing non-word char to word |
3945 | // char, e.g., "thes," -> "these". |
3946 | p = fword + sp->ts_fidx; |
3947 | MB_PTR_BACK(fword, p); |
3948 | if (!spell_iswordp(p, curwin)) { |
3949 | p = preword + STRLEN(preword); |
3950 | MB_PTR_BACK(preword, p); |
3951 | if (spell_iswordp(p, curwin)) { |
3952 | newscore += SCORE_NONWORD; |
3953 | } |
3954 | } |
3955 | |
3956 | // Give a bonus to words seen before. |
3957 | score = score_wordcount_adj(slang, |
3958 | sp->ts_score + newscore, |
3959 | preword + sp->ts_prewordlen, |
3960 | sp->ts_prewordlen > 0); |
3961 | |
3962 | // Add the suggestion if the score isn't too bad. |
3963 | if (score <= su->su_maxscore) { |
3964 | add_suggestion(su, &su->su_ga, preword, |
3965 | sp->ts_fidx - repextra, |
3966 | score, 0, false, lp->lp_sallang, false); |
3967 | |
3968 | if (su->su_badflags & WF_MIXCAP) { |
3969 | // We really don't know if the word should be |
3970 | // upper or lower case, add both. |
3971 | c = captype(preword, NULL); |
3972 | if (c == 0 || c == WF_ALLCAP) { |
3973 | make_case_word(tword + sp->ts_splitoff, |
3974 | preword + sp->ts_prewordlen, |
3975 | c == 0 ? WF_ALLCAP : 0); |
3976 | |
3977 | add_suggestion(su, &su->su_ga, preword, |
3978 | sp->ts_fidx - repextra, |
3979 | score + SCORE_ICASE, 0, false, |
3980 | lp->lp_sallang, false); |
3981 | } |
3982 | } |
3983 | } |
3984 | } |
3985 | } |
3986 | |
3987 | // Try word split and/or compounding. |
3988 | if ((sp->ts_fidx >= sp->ts_fidxtry || fword_ends) |
3989 | // Don't split in the middle of a character |
3990 | && (!has_mbyte || sp->ts_tcharlen == 0) |
3991 | ) { |
3992 | bool try_compound; |
3993 | int try_split; |
3994 | |
3995 | // If past the end of the bad word don't try a split. |
3996 | // Otherwise try changing the next word. E.g., find |
3997 | // suggestions for "the the" where the second "the" is |
3998 | // different. It's done like a split. |
3999 | // TODO: word split for soundfold words |
4000 | try_split = (sp->ts_fidx - repextra < su->su_badlen) |
4001 | && !soundfold; |
4002 | |
4003 | // Get here in several situations: |
4004 | // 1. The word in the tree ends: |
4005 | // If the word allows compounding try that. Otherwise try |
4006 | // a split by inserting a space. For both check that a |
4007 | // valid word starts at fword[sp->ts_fidx]. |
4008 | // For NOBREAK do like compounding to be able to check if |
4009 | // the next word is valid. |
4010 | // 2. The badword does end, but it was due to a change (e.g., |
4011 | // a swap). No need to split, but do check that the |
4012 | // following word is valid. |
4013 | // 3. The badword and the word in the tree end. It may still |
4014 | // be possible to compound another (short) word. |
4015 | try_compound = false; |
4016 | if (!soundfold |
4017 | && !slang->sl_nocompoundsugs |
4018 | && slang->sl_compprog != NULL |
4019 | && ((unsigned)flags >> 24) != 0 |
4020 | && sp->ts_twordlen - sp->ts_splitoff |
4021 | >= slang->sl_compminlen |
4022 | && (!has_mbyte |
4023 | || slang->sl_compminlen == 0 |
4024 | || mb_charlen(tword + sp->ts_splitoff) |
4025 | >= slang->sl_compminlen) |
4026 | && (slang->sl_compsylmax < MAXWLEN |
4027 | || sp->ts_complen + 1 - sp->ts_compsplit |
4028 | < slang->sl_compmax) |
4029 | && (can_be_compound(sp, slang, |
4030 | compflags, ((unsigned)flags >> 24)))) { |
4031 | try_compound = true; |
4032 | compflags[sp->ts_complen] = ((unsigned)flags >> 24); |
4033 | compflags[sp->ts_complen + 1] = NUL; |
4034 | } |
4035 | |
4036 | // For NOBREAK we never try splitting, it won't make any word |
4037 | // valid. |
4038 | if (slang->sl_nobreak && !slang->sl_nocompoundsugs) { |
4039 | try_compound = true; |
4040 | } else if (!fword_ends |
4041 | && try_compound |
4042 | && (sp->ts_flags & TSF_DIDSPLIT) == 0) { |
4043 | // If we could add a compound word, and it's also possible to |
4044 | // split at this point, do the split first and set |
4045 | // TSF_DIDSPLIT to avoid doing it again. |
4046 | try_compound = false; |
4047 | sp->ts_flags |= TSF_DIDSPLIT; |
4048 | --sp->ts_curi; // do the same NUL again |
4049 | compflags[sp->ts_complen] = NUL; |
4050 | } else { |
4051 | sp->ts_flags &= ~TSF_DIDSPLIT; |
4052 | } |
4053 | |
4054 | if (try_split || try_compound) { |
4055 | if (!try_compound && (!fword_ends || !goodword_ends)) { |
4056 | // If we're going to split need to check that the |
4057 | // words so far are valid for compounding. If there |
4058 | // is only one word it must not have the NEEDCOMPOUND |
4059 | // flag. |
4060 | if (sp->ts_complen == sp->ts_compsplit |
4061 | && (flags & WF_NEEDCOMP)) |
4062 | break; |
4063 | p = preword; |
4064 | while (*skiptowhite(p) != NUL) |
4065 | p = skipwhite(skiptowhite(p)); |
4066 | if (sp->ts_complen > sp->ts_compsplit |
4067 | && !can_compound(slang, p, |
4068 | compflags + sp->ts_compsplit)) |
4069 | break; |
4070 | |
4071 | if (slang->sl_nosplitsugs) |
4072 | newscore += SCORE_SPLIT_NO; |
4073 | else |
4074 | newscore += SCORE_SPLIT; |
4075 | |
4076 | // Give a bonus to words seen before. |
4077 | newscore = score_wordcount_adj(slang, newscore, |
4078 | preword + sp->ts_prewordlen, true); |
4079 | } |
4080 | |
4081 | if (TRY_DEEPER(su, stack, depth, newscore)) { |
4082 | go_deeper(stack, depth, newscore); |
4083 | #ifdef DEBUG_TRIEWALK |
4084 | if (!try_compound && !fword_ends) |
4085 | sprintf(changename[depth], "%.*s-%s: split" , |
4086 | sp->ts_twordlen, tword, fword + sp->ts_fidx); |
4087 | else |
4088 | sprintf(changename[depth], "%.*s-%s: compound" , |
4089 | sp->ts_twordlen, tword, fword + sp->ts_fidx); |
4090 | #endif |
4091 | // Save things to be restored at STATE_SPLITUNDO. |
4092 | sp->ts_save_badflags = su->su_badflags; |
4093 | PROF_STORE(sp->ts_state) |
4094 | sp->ts_state = STATE_SPLITUNDO; |
4095 | |
4096 | ++depth; |
4097 | sp = &stack[depth]; |
4098 | |
4099 | // Append a space to preword when splitting. |
4100 | if (!try_compound && !fword_ends) |
4101 | STRCAT(preword, " " ); |
4102 | sp->ts_prewordlen = (char_u)STRLEN(preword); |
4103 | sp->ts_splitoff = sp->ts_twordlen; |
4104 | sp->ts_splitfidx = sp->ts_fidx; |
4105 | |
4106 | // If the badword has a non-word character at this |
4107 | // position skip it. That means replacing the |
4108 | // non-word character with a space. Always skip a |
4109 | // character when the word ends. But only when the |
4110 | // good word can end. |
4111 | if (((!try_compound && !spell_iswordp_nmw(fword |
4112 | + sp->ts_fidx, |
4113 | curwin)) |
4114 | || fword_ends) |
4115 | && fword[sp->ts_fidx] != NUL |
4116 | && goodword_ends) { |
4117 | int l; |
4118 | |
4119 | l = MB_PTR2LEN(fword + sp->ts_fidx); |
4120 | if (fword_ends) { |
4121 | // Copy the skipped character to preword. |
4122 | memmove(preword + sp->ts_prewordlen, |
4123 | fword + sp->ts_fidx, l); |
4124 | sp->ts_prewordlen += l; |
4125 | preword[sp->ts_prewordlen] = NUL; |
4126 | } else |
4127 | sp->ts_score -= SCORE_SPLIT - SCORE_SUBST; |
4128 | sp->ts_fidx += l; |
4129 | } |
4130 | |
4131 | // When compounding include compound flag in |
4132 | // compflags[] (already set above). When splitting we |
4133 | // may start compounding over again. |
4134 | if (try_compound) |
4135 | ++sp->ts_complen; |
4136 | else |
4137 | sp->ts_compsplit = sp->ts_complen; |
4138 | sp->ts_prefixdepth = PFD_NOPREFIX; |
4139 | |
4140 | // set su->su_badflags to the caps type at this |
4141 | // position |
4142 | if (has_mbyte) |
4143 | n = nofold_len(fword, sp->ts_fidx, su->su_badptr); |
4144 | else |
4145 | n = sp->ts_fidx; |
4146 | su->su_badflags = badword_captype(su->su_badptr + n, |
4147 | su->su_badptr + su->su_badlen); |
4148 | |
4149 | // Restart at top of the tree. |
4150 | sp->ts_arridx = 0; |
4151 | |
4152 | // If there are postponed prefixes, try these too. |
4153 | if (pbyts != NULL) { |
4154 | byts = pbyts; |
4155 | idxs = pidxs; |
4156 | sp->ts_prefixdepth = PFD_PREFIXTREE; |
4157 | PROF_STORE(sp->ts_state) |
4158 | sp->ts_state = STATE_NOPREFIX; |
4159 | } |
4160 | } |
4161 | } |
4162 | } |
4163 | break; |
4164 | |
4165 | case STATE_SPLITUNDO: |
4166 | // Undo the changes done for word split or compound word. |
4167 | su->su_badflags = sp->ts_save_badflags; |
4168 | |
4169 | // Continue looking for NUL bytes. |
4170 | PROF_STORE(sp->ts_state) |
4171 | sp->ts_state = STATE_START; |
4172 | |
4173 | // In case we went into the prefix tree. |
4174 | byts = fbyts; |
4175 | idxs = fidxs; |
4176 | break; |
4177 | |
4178 | case STATE_ENDNUL: |
4179 | // Past the NUL bytes in the node. |
4180 | su->su_badflags = sp->ts_save_badflags; |
4181 | if (fword[sp->ts_fidx] == NUL |
4182 | && sp->ts_tcharlen == 0 |
4183 | ) { |
4184 | // The badword ends, can't use STATE_PLAIN. |
4185 | PROF_STORE(sp->ts_state) |
4186 | sp->ts_state = STATE_DEL; |
4187 | break; |
4188 | } |
4189 | PROF_STORE(sp->ts_state) |
4190 | sp->ts_state = STATE_PLAIN; |
4191 | FALLTHROUGH; |
4192 | |
4193 | case STATE_PLAIN: |
4194 | // Go over all possible bytes at this node, add each to tword[] |
4195 | // and use child node. "ts_curi" is the index. |
4196 | arridx = sp->ts_arridx; |
4197 | if (sp->ts_curi > byts[arridx]) { |
4198 | // Done all bytes at this node, do next state. When still at |
4199 | // already changed bytes skip the other tricks. |
4200 | PROF_STORE(sp->ts_state) |
4201 | if (sp->ts_fidx >= sp->ts_fidxtry) { |
4202 | sp->ts_state = STATE_DEL; |
4203 | } else { |
4204 | sp->ts_state = STATE_FINAL; |
4205 | } |
4206 | } else { |
4207 | arridx += sp->ts_curi++; |
4208 | c = byts[arridx]; |
4209 | |
4210 | // Normal byte, go one level deeper. If it's not equal to the |
4211 | // byte in the bad word adjust the score. But don't even try |
4212 | // when the byte was already changed. And don't try when we |
4213 | // just deleted this byte, accepting it is always cheaper than |
4214 | // delete + substitute. |
4215 | if (c == fword[sp->ts_fidx] |
4216 | || (sp->ts_tcharlen > 0 && sp->ts_isdiff != DIFF_NONE) |
4217 | ) |
4218 | newscore = 0; |
4219 | else |
4220 | newscore = SCORE_SUBST; |
4221 | if ((newscore == 0 |
4222 | || (sp->ts_fidx >= sp->ts_fidxtry |
4223 | && ((sp->ts_flags & TSF_DIDDEL) == 0 |
4224 | || c != fword[sp->ts_delidx]))) |
4225 | && TRY_DEEPER(su, stack, depth, newscore)) { |
4226 | go_deeper(stack, depth, newscore); |
4227 | #ifdef DEBUG_TRIEWALK |
4228 | if (newscore > 0) |
4229 | sprintf(changename[depth], "%.*s-%s: subst %c to %c" , |
4230 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4231 | fword[sp->ts_fidx], c); |
4232 | else |
4233 | sprintf(changename[depth], "%.*s-%s: accept %c" , |
4234 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4235 | fword[sp->ts_fidx]); |
4236 | #endif |
4237 | ++depth; |
4238 | sp = &stack[depth]; |
4239 | ++sp->ts_fidx; |
4240 | tword[sp->ts_twordlen++] = c; |
4241 | sp->ts_arridx = idxs[arridx]; |
4242 | if (newscore == SCORE_SUBST) |
4243 | sp->ts_isdiff = DIFF_YES; |
4244 | if (has_mbyte) { |
4245 | // Multi-byte characters are a bit complicated to |
4246 | // handle: They differ when any of the bytes differ |
4247 | // and then their length may also differ. |
4248 | if (sp->ts_tcharlen == 0) { |
4249 | // First byte. |
4250 | sp->ts_tcharidx = 0; |
4251 | sp->ts_tcharlen = MB_BYTE2LEN(c); |
4252 | sp->ts_fcharstart = sp->ts_fidx - 1; |
4253 | sp->ts_isdiff = (newscore != 0) |
4254 | ? DIFF_YES : DIFF_NONE; |
4255 | } else if (sp->ts_isdiff == DIFF_INSERT) |
4256 | // When inserting trail bytes don't advance in the |
4257 | // bad word. |
4258 | --sp->ts_fidx; |
4259 | if (++sp->ts_tcharidx == sp->ts_tcharlen) { |
4260 | // Last byte of character. |
4261 | if (sp->ts_isdiff == DIFF_YES) { |
4262 | // Correct ts_fidx for the byte length of the |
4263 | // character (we didn't check that before). |
4264 | sp->ts_fidx = sp->ts_fcharstart |
4265 | + MB_PTR2LEN(fword + sp->ts_fcharstart); |
4266 | |
4267 | // For changing a composing character adjust |
4268 | // the score from SCORE_SUBST to |
4269 | // SCORE_SUBCOMP. |
4270 | if (enc_utf8 |
4271 | && utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen |
4272 | - sp->ts_tcharlen)) |
4273 | && utf_iscomposing(utf_ptr2char(fword |
4274 | + sp->ts_fcharstart))) { |
4275 | sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP; |
4276 | } else if ( |
4277 | !soundfold |
4278 | && slang->sl_has_map |
4279 | && similar_chars( |
4280 | slang, |
4281 | utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen), |
4282 | utf_ptr2char(fword + sp->ts_fcharstart))) { |
4283 | // For a similar character adjust score from |
4284 | // SCORE_SUBST to SCORE_SIMILAR. |
4285 | sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; |
4286 | } |
4287 | } else if (sp->ts_isdiff == DIFF_INSERT |
4288 | && sp->ts_twordlen > sp->ts_tcharlen) { |
4289 | p = tword + sp->ts_twordlen - sp->ts_tcharlen; |
4290 | c = utf_ptr2char(p); |
4291 | if (utf_iscomposing(c)) { |
4292 | // Inserting a composing char doesn't |
4293 | // count that much. |
4294 | sp->ts_score -= SCORE_INS - SCORE_INSCOMP; |
4295 | } else { |
4296 | // If the previous character was the same, |
4297 | // thus doubling a character, give a bonus |
4298 | // to the score. Also for the soundfold |
4299 | // tree (might seem illogical but does |
4300 | // give better scores). |
4301 | MB_PTR_BACK(tword, p); |
4302 | if (c == utf_ptr2char(p)) { |
4303 | sp->ts_score -= SCORE_INS - SCORE_INSDUP; |
4304 | } |
4305 | } |
4306 | } |
4307 | |
4308 | // Starting a new char, reset the length. |
4309 | sp->ts_tcharlen = 0; |
4310 | } |
4311 | } else { |
4312 | // If we found a similar char adjust the score. |
4313 | // We do this after calling go_deeper() because |
4314 | // it's slow. |
4315 | if (newscore != 0 |
4316 | && !soundfold |
4317 | && slang->sl_has_map |
4318 | && similar_chars(slang, |
4319 | c, fword[sp->ts_fidx - 1])) |
4320 | sp->ts_score -= SCORE_SUBST - SCORE_SIMILAR; |
4321 | } |
4322 | } |
4323 | } |
4324 | break; |
4325 | |
4326 | case STATE_DEL: |
4327 | // When past the first byte of a multi-byte char don't try |
4328 | // delete/insert/swap a character. |
4329 | if (has_mbyte && sp->ts_tcharlen > 0) { |
4330 | PROF_STORE(sp->ts_state) |
4331 | sp->ts_state = STATE_FINAL; |
4332 | break; |
4333 | } |
4334 | // Try skipping one character in the bad word (delete it). |
4335 | PROF_STORE(sp->ts_state) |
4336 | sp->ts_state = STATE_INS_PREP; |
4337 | sp->ts_curi = 1; |
4338 | if (soundfold && sp->ts_fidx == 0 && fword[sp->ts_fidx] == '*') |
4339 | // Deleting a vowel at the start of a word counts less, see |
4340 | // soundalike_score(). |
4341 | newscore = 2 * SCORE_DEL / 3; |
4342 | else |
4343 | newscore = SCORE_DEL; |
4344 | if (fword[sp->ts_fidx] != NUL |
4345 | && TRY_DEEPER(su, stack, depth, newscore)) { |
4346 | go_deeper(stack, depth, newscore); |
4347 | #ifdef DEBUG_TRIEWALK |
4348 | sprintf(changename[depth], "%.*s-%s: delete %c" , |
4349 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4350 | fword[sp->ts_fidx]); |
4351 | #endif |
4352 | ++depth; |
4353 | |
4354 | // Remember what character we deleted, so that we can avoid |
4355 | // inserting it again. |
4356 | stack[depth].ts_flags |= TSF_DIDDEL; |
4357 | stack[depth].ts_delidx = sp->ts_fidx; |
4358 | |
4359 | // Advance over the character in fword[]. Give a bonus to the |
4360 | // score if the same character is following "nn" -> "n". It's |
4361 | // a bit illogical for soundfold tree but it does give better |
4362 | // results. |
4363 | c = utf_ptr2char(fword + sp->ts_fidx); |
4364 | stack[depth].ts_fidx += MB_PTR2LEN(fword + sp->ts_fidx); |
4365 | if (utf_iscomposing(c)) { |
4366 | stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; |
4367 | } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) { |
4368 | stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; |
4369 | } |
4370 | |
4371 | break; |
4372 | } |
4373 | FALLTHROUGH; |
4374 | |
4375 | case STATE_INS_PREP: |
4376 | if (sp->ts_flags & TSF_DIDDEL) { |
4377 | // If we just deleted a byte then inserting won't make sense, |
4378 | // a substitute is always cheaper. |
4379 | PROF_STORE(sp->ts_state) |
4380 | sp->ts_state = STATE_SWAP; |
4381 | break; |
4382 | } |
4383 | |
4384 | // skip over NUL bytes |
4385 | n = sp->ts_arridx; |
4386 | for (;; ) { |
4387 | if (sp->ts_curi > byts[n]) { |
4388 | // Only NUL bytes at this node, go to next state. |
4389 | PROF_STORE(sp->ts_state) |
4390 | sp->ts_state = STATE_SWAP; |
4391 | break; |
4392 | } |
4393 | if (byts[n + sp->ts_curi] != NUL) { |
4394 | // Found a byte to insert. |
4395 | PROF_STORE(sp->ts_state) |
4396 | sp->ts_state = STATE_INS; |
4397 | break; |
4398 | } |
4399 | ++sp->ts_curi; |
4400 | } |
4401 | break; |
4402 | |
4403 | FALLTHROUGH; |
4404 | |
4405 | case STATE_INS: |
4406 | // Insert one byte. Repeat this for each possible byte at this |
4407 | // node. |
4408 | n = sp->ts_arridx; |
4409 | if (sp->ts_curi > byts[n]) { |
4410 | // Done all bytes at this node, go to next state. |
4411 | PROF_STORE(sp->ts_state) |
4412 | sp->ts_state = STATE_SWAP; |
4413 | break; |
4414 | } |
4415 | |
4416 | // Do one more byte at this node, but: |
4417 | // - Skip NUL bytes. |
4418 | // - Skip the byte if it's equal to the byte in the word, |
4419 | // accepting that byte is always better. |
4420 | n += sp->ts_curi++; |
4421 | c = byts[n]; |
4422 | if (soundfold && sp->ts_twordlen == 0 && c == '*') |
4423 | // Inserting a vowel at the start of a word counts less, |
4424 | // see soundalike_score(). |
4425 | newscore = 2 * SCORE_INS / 3; |
4426 | else |
4427 | newscore = SCORE_INS; |
4428 | if (c != fword[sp->ts_fidx] |
4429 | && TRY_DEEPER(su, stack, depth, newscore)) { |
4430 | go_deeper(stack, depth, newscore); |
4431 | #ifdef DEBUG_TRIEWALK |
4432 | sprintf(changename[depth], "%.*s-%s: insert %c" , |
4433 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4434 | c); |
4435 | #endif |
4436 | ++depth; |
4437 | sp = &stack[depth]; |
4438 | tword[sp->ts_twordlen++] = c; |
4439 | sp->ts_arridx = idxs[n]; |
4440 | if (has_mbyte) { |
4441 | fl = MB_BYTE2LEN(c); |
4442 | if (fl > 1) { |
4443 | // There are following bytes for the same character. |
4444 | // We must find all bytes before trying |
4445 | // delete/insert/swap/etc. |
4446 | sp->ts_tcharlen = fl; |
4447 | sp->ts_tcharidx = 1; |
4448 | sp->ts_isdiff = DIFF_INSERT; |
4449 | } |
4450 | } else |
4451 | fl = 1; |
4452 | if (fl == 1) { |
4453 | // If the previous character was the same, thus doubling a |
4454 | // character, give a bonus to the score. Also for |
4455 | // soundfold words (illogical but does give a better |
4456 | // score). |
4457 | if (sp->ts_twordlen >= 2 |
4458 | && tword[sp->ts_twordlen - 2] == c) |
4459 | sp->ts_score -= SCORE_INS - SCORE_INSDUP; |
4460 | } |
4461 | } |
4462 | break; |
4463 | |
4464 | case STATE_SWAP: |
4465 | // Swap two bytes in the bad word: "12" -> "21". |
4466 | // We change "fword" here, it's changed back afterwards at |
4467 | // STATE_UNSWAP. |
4468 | p = fword + sp->ts_fidx; |
4469 | c = *p; |
4470 | if (c == NUL) { |
4471 | // End of word, can't swap or replace. |
4472 | PROF_STORE(sp->ts_state) |
4473 | sp->ts_state = STATE_FINAL; |
4474 | break; |
4475 | } |
4476 | |
4477 | // Don't swap if the first character is not a word character. |
4478 | // SWAP3 etc. also don't make sense then. |
4479 | if (!soundfold && !spell_iswordp(p, curwin)) { |
4480 | PROF_STORE(sp->ts_state) |
4481 | sp->ts_state = STATE_REP_INI; |
4482 | break; |
4483 | } |
4484 | |
4485 | n = MB_CPTR2LEN(p); |
4486 | c = utf_ptr2char(p); |
4487 | if (p[n] == NUL) { |
4488 | c2 = NUL; |
4489 | } else if (!soundfold && !spell_iswordp(p + n, curwin)) { |
4490 | c2 = c; // don't swap non-word char |
4491 | } else { |
4492 | c2 = utf_ptr2char(p + n); |
4493 | } |
4494 | |
4495 | // When the second character is NUL we can't swap. |
4496 | if (c2 == NUL) { |
4497 | PROF_STORE(sp->ts_state) |
4498 | sp->ts_state = STATE_REP_INI; |
4499 | break; |
4500 | } |
4501 | |
4502 | // When characters are identical, swap won't do anything. |
4503 | // Also get here if the second char is not a word character. |
4504 | if (c == c2) { |
4505 | PROF_STORE(sp->ts_state) |
4506 | sp->ts_state = STATE_SWAP3; |
4507 | break; |
4508 | } |
4509 | if (TRY_DEEPER(su, stack, depth, SCORE_SWAP)) { |
4510 | go_deeper(stack, depth, SCORE_SWAP); |
4511 | #ifdef DEBUG_TRIEWALK |
4512 | snprintf(changename[depth], sizeof(changename[0]), |
4513 | "%.*s-%s: swap %c and %c" , |
4514 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4515 | c, c2); |
4516 | #endif |
4517 | PROF_STORE(sp->ts_state) |
4518 | sp->ts_state = STATE_UNSWAP; |
4519 | depth++; |
4520 | fl = mb_char2len(c2); |
4521 | memmove(p, p + n, fl); |
4522 | utf_char2bytes(c, p + fl); |
4523 | stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; |
4524 | } else { |
4525 | // If this swap doesn't work then SWAP3 won't either. |
4526 | PROF_STORE(sp->ts_state) |
4527 | sp->ts_state = STATE_REP_INI; |
4528 | } |
4529 | break; |
4530 | |
4531 | case STATE_UNSWAP: |
4532 | // Undo the STATE_SWAP swap: "21" -> "12". |
4533 | p = fword + sp->ts_fidx; |
4534 | n = MB_PTR2LEN(p); |
4535 | c = utf_ptr2char(p + n); |
4536 | memmove(p + MB_PTR2LEN(p + n), p, n); |
4537 | utf_char2bytes(c, p); |
4538 | |
4539 | FALLTHROUGH; |
4540 | |
4541 | case STATE_SWAP3: |
4542 | // Swap two bytes, skipping one: "123" -> "321". We change |
4543 | // "fword" here, it's changed back afterwards at STATE_UNSWAP3. |
4544 | p = fword + sp->ts_fidx; |
4545 | n = MB_CPTR2LEN(p); |
4546 | c = utf_ptr2char(p); |
4547 | fl = MB_CPTR2LEN(p + n); |
4548 | c2 = utf_ptr2char(p + n); |
4549 | if (!soundfold && !spell_iswordp(p + n + fl, curwin)) { |
4550 | c3 = c; // don't swap non-word char |
4551 | } else { |
4552 | c3 = utf_ptr2char(p + n + fl); |
4553 | } |
4554 | |
      // When characters are identical: "121" then SWAP3 result is
      // identical, ROT3L result is same as SWAP: "211", ROT3R result is
      // same as SWAP on next char: "112".  Thus skip all swapping.
      // Also skip when c3 is NUL.
      // Also get here when the third character is not a word character.
      // Second character may be any char: "a.b" -> "b.a"
4561 | if (c == c3 || c3 == NUL) { |
4562 | PROF_STORE(sp->ts_state) |
4563 | sp->ts_state = STATE_REP_INI; |
4564 | break; |
4565 | } |
4566 | if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { |
4567 | go_deeper(stack, depth, SCORE_SWAP3); |
4568 | #ifdef DEBUG_TRIEWALK |
4569 | sprintf(changename[depth], "%.*s-%s: swap3 %c and %c" , |
4570 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4571 | c, c3); |
4572 | #endif |
4573 | PROF_STORE(sp->ts_state) |
4574 | sp->ts_state = STATE_UNSWAP3; |
4575 | depth++; |
4576 | tl = mb_char2len(c3); |
4577 | memmove(p, p + n + fl, tl); |
4578 | utf_char2bytes(c2, p + tl); |
4579 | utf_char2bytes(c, p + fl + tl); |
4580 | stack[depth].ts_fidxtry = sp->ts_fidx + n + fl + tl; |
4581 | } else { |
4582 | PROF_STORE(sp->ts_state) |
4583 | sp->ts_state = STATE_REP_INI; |
4584 | } |
4585 | break; |
4586 | |
4587 | case STATE_UNSWAP3: |
4588 | // Undo STATE_SWAP3: "321" -> "123" |
4589 | p = fword + sp->ts_fidx; |
4590 | n = MB_PTR2LEN(p); |
4591 | c2 = utf_ptr2char(p + n); |
4592 | fl = MB_PTR2LEN(p + n); |
4593 | c = utf_ptr2char(p + n + fl); |
4594 | tl = MB_PTR2LEN(p + n + fl); |
4595 | memmove(p + fl + tl, p, n); |
4596 | utf_char2bytes(c, p); |
4597 | utf_char2bytes(c2, p + tl); |
4598 | p = p + tl; |
4599 | |
4600 | if (!soundfold && !spell_iswordp(p, curwin)) { |
4601 | // Middle char is not a word char, skip the rotate. First and |
4602 | // third char were already checked at swap and swap3. |
4603 | PROF_STORE(sp->ts_state) |
4604 | sp->ts_state = STATE_REP_INI; |
4605 | break; |
4606 | } |
4607 | |
4608 | // Rotate three characters left: "123" -> "231". We change |
4609 | // "fword" here, it's changed back afterwards at STATE_UNROT3L. |
4610 | if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { |
4611 | go_deeper(stack, depth, SCORE_SWAP3); |
4612 | #ifdef DEBUG_TRIEWALK |
4613 | p = fword + sp->ts_fidx; |
4614 | sprintf(changename[depth], "%.*s-%s: rotate left %c%c%c" , |
4615 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4616 | p[0], p[1], p[2]); |
4617 | #endif |
4618 | PROF_STORE(sp->ts_state) |
4619 | sp->ts_state = STATE_UNROT3L; |
4620 | ++depth; |
4621 | p = fword + sp->ts_fidx; |
4622 | n = MB_CPTR2LEN(p); |
4623 | c = utf_ptr2char(p); |
4624 | fl = MB_CPTR2LEN(p + n); |
4625 | fl += MB_CPTR2LEN(p + n + fl); |
4626 | memmove(p, p + n, fl); |
4627 | utf_char2bytes(c, p + fl); |
4628 | stack[depth].ts_fidxtry = sp->ts_fidx + n + fl; |
4629 | } else { |
4630 | PROF_STORE(sp->ts_state) |
4631 | sp->ts_state = STATE_REP_INI; |
4632 | } |
4633 | break; |
4634 | |
4635 | case STATE_UNROT3L: |
4636 | // Undo ROT3L: "231" -> "123" |
4637 | p = fword + sp->ts_fidx; |
4638 | n = MB_PTR2LEN(p); |
4639 | n += MB_PTR2LEN(p + n); |
4640 | c = utf_ptr2char(p + n); |
4641 | tl = MB_PTR2LEN(p + n); |
4642 | memmove(p + tl, p, n); |
4643 | utf_char2bytes(c, p); |
4644 | |
4645 | // Rotate three bytes right: "123" -> "312". We change "fword" |
4646 | // here, it's changed back afterwards at STATE_UNROT3R. |
4647 | if (TRY_DEEPER(su, stack, depth, SCORE_SWAP3)) { |
4648 | go_deeper(stack, depth, SCORE_SWAP3); |
4649 | #ifdef DEBUG_TRIEWALK |
4650 | p = fword + sp->ts_fidx; |
4651 | sprintf(changename[depth], "%.*s-%s: rotate right %c%c%c" , |
4652 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4653 | p[0], p[1], p[2]); |
4654 | #endif |
4655 | PROF_STORE(sp->ts_state) |
4656 | sp->ts_state = STATE_UNROT3R; |
4657 | ++depth; |
4658 | p = fword + sp->ts_fidx; |
4659 | n = MB_CPTR2LEN(p); |
4660 | n += MB_CPTR2LEN(p + n); |
4661 | c = utf_ptr2char(p + n); |
4662 | tl = MB_CPTR2LEN(p + n); |
4663 | memmove(p + tl, p, n); |
4664 | utf_char2bytes(c, p); |
4665 | stack[depth].ts_fidxtry = sp->ts_fidx + n + tl; |
4666 | } else { |
4667 | PROF_STORE(sp->ts_state) |
4668 | sp->ts_state = STATE_REP_INI; |
4669 | } |
4670 | break; |
4671 | |
4672 | case STATE_UNROT3R: |
4673 | // Undo ROT3R: "312" -> "123" |
4674 | p = fword + sp->ts_fidx; |
4675 | c = utf_ptr2char(p); |
4676 | tl = MB_PTR2LEN(p); |
4677 | n = MB_PTR2LEN(p + tl); |
4678 | n += MB_PTR2LEN(p + tl + n); |
4679 | memmove(p, p + tl, n); |
4680 | utf_char2bytes(c, p + n); |
4681 | |
4682 | FALLTHROUGH; |
4683 | |
4684 | case STATE_REP_INI: |
4685 | // Check if matching with REP items from the .aff file would work. |
4686 | // Quickly skip if: |
4687 | // - there are no REP items and we are not in the soundfold trie |
4688 | // - the score is going to be too high anyway |
4689 | // - already applied a REP item or swapped here |
4690 | if ((lp->lp_replang == NULL && !soundfold) |
4691 | || sp->ts_score + SCORE_REP >= su->su_maxscore |
4692 | || sp->ts_fidx < sp->ts_fidxtry) { |
4693 | PROF_STORE(sp->ts_state) |
4694 | sp->ts_state = STATE_FINAL; |
4695 | break; |
4696 | } |
4697 | |
4698 | // Use the first byte to quickly find the first entry that may |
4699 | // match. If the index is -1 there is none. |
4700 | if (soundfold) |
4701 | sp->ts_curi = slang->sl_repsal_first[fword[sp->ts_fidx]]; |
4702 | else |
4703 | sp->ts_curi = lp->lp_replang->sl_rep_first[fword[sp->ts_fidx]]; |
4704 | |
4705 | if (sp->ts_curi < 0) { |
4706 | PROF_STORE(sp->ts_state) |
4707 | sp->ts_state = STATE_FINAL; |
4708 | break; |
4709 | } |
4710 | |
4711 | PROF_STORE(sp->ts_state) |
4712 | sp->ts_state = STATE_REP; |
4713 | FALLTHROUGH; |
4714 | |
4715 | case STATE_REP: |
4716 | // Try matching with REP items from the .aff file. For each match |
4717 | // replace the characters and check if the resulting word is |
4718 | // valid. |
4719 | p = fword + sp->ts_fidx; |
4720 | |
4721 | if (soundfold) |
4722 | gap = &slang->sl_repsal; |
4723 | else |
4724 | gap = &lp->lp_replang->sl_rep; |
4725 | while (sp->ts_curi < gap->ga_len) { |
4726 | ftp = (fromto_T *)gap->ga_data + sp->ts_curi++; |
4727 | if (*ftp->ft_from != *p) { |
4728 | // past possible matching entries |
4729 | sp->ts_curi = gap->ga_len; |
4730 | break; |
4731 | } |
4732 | if (STRNCMP(ftp->ft_from, p, STRLEN(ftp->ft_from)) == 0 |
4733 | && TRY_DEEPER(su, stack, depth, SCORE_REP)) { |
4734 | go_deeper(stack, depth, SCORE_REP); |
4735 | #ifdef DEBUG_TRIEWALK |
4736 | sprintf(changename[depth], "%.*s-%s: replace %s with %s" , |
4737 | sp->ts_twordlen, tword, fword + sp->ts_fidx, |
4738 | ftp->ft_from, ftp->ft_to); |
4739 | #endif |
4740 | // Need to undo this afterwards. |
4741 | PROF_STORE(sp->ts_state) |
4742 | sp->ts_state = STATE_REP_UNDO; |
4743 | |
4744 | // Change the "from" to the "to" string. |
4745 | ++depth; |
4746 | fl = (int)STRLEN(ftp->ft_from); |
4747 | tl = (int)STRLEN(ftp->ft_to); |
4748 | if (fl != tl) { |
4749 | STRMOVE(p + tl, p + fl); |
4750 | repextra += tl - fl; |
4751 | } |
4752 | memmove(p, ftp->ft_to, tl); |
4753 | stack[depth].ts_fidxtry = sp->ts_fidx + tl; |
4754 | stack[depth].ts_tcharlen = 0; |
4755 | break; |
4756 | } |
4757 | } |
4758 | |
4759 | if (sp->ts_curi >= gap->ga_len && sp->ts_state == STATE_REP) |
4760 | // No (more) matches. |
4761 | PROF_STORE(sp->ts_state) |
4762 | sp->ts_state = STATE_FINAL; |
4763 | |
4764 | break; |
4765 | |
4766 | case STATE_REP_UNDO: |
4767 | // Undo a REP replacement and continue with the next one. |
4768 | if (soundfold) |
4769 | gap = &slang->sl_repsal; |
4770 | else |
4771 | gap = &lp->lp_replang->sl_rep; |
4772 | ftp = (fromto_T *)gap->ga_data + sp->ts_curi - 1; |
4773 | fl = (int)STRLEN(ftp->ft_from); |
4774 | tl = (int)STRLEN(ftp->ft_to); |
4775 | p = fword + sp->ts_fidx; |
4776 | if (fl != tl) { |
4777 | STRMOVE(p + fl, p + tl); |
4778 | repextra -= tl - fl; |
4779 | } |
4780 | memmove(p, ftp->ft_from, fl); |
4781 | PROF_STORE(sp->ts_state) |
4782 | sp->ts_state = STATE_REP; |
4783 | break; |
4784 | |
4785 | default: |
4786 | // Did all possible states at this level, go up one level. |
4787 | --depth; |
4788 | |
4789 | if (depth >= 0 && stack[depth].ts_prefixdepth == PFD_PREFIXTREE) { |
4790 | // Continue in or go back to the prefix tree. |
4791 | byts = pbyts; |
4792 | idxs = pidxs; |
4793 | } |
4794 | |
4795 | // Don't check for CTRL-C too often, it takes time. |
4796 | if (--breakcheckcount == 0) { |
4797 | os_breakcheck(); |
4798 | breakcheckcount = 1000; |
4799 | } |
4800 | } |
4801 | } |
4802 | } |
4803 | |
4804 | |
4805 | // Go one level deeper in the tree. |
4806 | static void go_deeper(trystate_T *stack, int depth, int score_add) |
4807 | { |
4808 | stack[depth + 1] = stack[depth]; |
4809 | stack[depth + 1].ts_state = STATE_START; |
4810 | stack[depth + 1].ts_score = stack[depth].ts_score + score_add; |
4811 | stack[depth + 1].ts_curi = 1; // start just after length byte |
4812 | stack[depth + 1].ts_flags = 0; |
4813 | } |
4814 | |
4815 | // Case-folding may change the number of bytes: Count nr of chars in |
4816 | // fword[flen] and return the byte length of that many chars in "word". |
4817 | static int nofold_len(char_u *fword, int flen, char_u *word) |
4818 | { |
4819 | char_u *p; |
4820 | int i = 0; |
4821 | |
4822 | for (p = fword; p < fword + flen; MB_PTR_ADV(p)) { |
4823 | i++; |
4824 | } |
4825 | for (p = word; i > 0; MB_PTR_ADV(p)) { |
4826 | i--; |
4827 | } |
4828 | return (int)(p - word); |
4829 | } |
4830 | |
4831 | // "fword" is a good word with case folded. Find the matching keep-case |
4832 | // words and put it in "kword". |
4833 | // Theoretically there could be several keep-case words that result in the |
4834 | // same case-folded word, but we only find one... |
4835 | static void find_keepcap_word(slang_T *slang, char_u *fword, char_u *kword) |
4836 | { |
4837 | char_u uword[MAXWLEN]; // "fword" in upper-case |
4838 | int depth; |
4839 | idx_T tryidx; |
4840 | |
4841 | // The following arrays are used at each depth in the tree. |
4842 | idx_T arridx[MAXWLEN]; |
4843 | int round[MAXWLEN]; |
4844 | int fwordidx[MAXWLEN]; |
4845 | int uwordidx[MAXWLEN]; |
4846 | int kwordlen[MAXWLEN]; |
4847 | |
4848 | int flen, ulen; |
4849 | int l; |
4850 | int len; |
4851 | int c; |
4852 | idx_T lo, hi, m; |
4853 | char_u *p; |
4854 | char_u *byts = slang->sl_kbyts; // array with bytes of the words |
4855 | idx_T *idxs = slang->sl_kidxs; // array with indexes |
4856 | |
4857 | if (byts == NULL) { |
4858 | // array is empty: "cannot happen" |
4859 | *kword = NUL; |
4860 | return; |
4861 | } |
4862 | |
4863 | // Make an all-cap version of "fword". |
4864 | allcap_copy(fword, uword); |
4865 | |
4866 | // Each character needs to be tried both case-folded and upper-case. |
4867 | // All this gets very complicated if we keep in mind that changing case |
4868 | // may change the byte length of a multi-byte character... |
4869 | depth = 0; |
4870 | arridx[0] = 0; |
4871 | round[0] = 0; |
4872 | fwordidx[0] = 0; |
4873 | uwordidx[0] = 0; |
4874 | kwordlen[0] = 0; |
4875 | while (depth >= 0) { |
4876 | if (fword[fwordidx[depth]] == NUL) { |
4877 | // We are at the end of "fword". If the tree allows a word to end |
4878 | // here we have found a match. |
4879 | if (byts[arridx[depth] + 1] == 0) { |
4880 | kword[kwordlen[depth]] = NUL; |
4881 | return; |
4882 | } |
4883 | |
4884 | // kword is getting too long, continue one level up |
4885 | --depth; |
4886 | } else if (++round[depth] > 2) { |
4887 | // tried both fold-case and upper-case character, continue one |
4888 | // level up |
4889 | --depth; |
4890 | } else { |
4891 | // round[depth] == 1: Try using the folded-case character. |
4892 | // round[depth] == 2: Try using the upper-case character. |
4893 | if (has_mbyte) { |
4894 | flen = MB_CPTR2LEN(fword + fwordidx[depth]); |
4895 | ulen = MB_CPTR2LEN(uword + uwordidx[depth]); |
4896 | } else { |
4897 | ulen = flen = 1; |
4898 | } |
4899 | if (round[depth] == 1) { |
4900 | p = fword + fwordidx[depth]; |
4901 | l = flen; |
4902 | } else { |
4903 | p = uword + uwordidx[depth]; |
4904 | l = ulen; |
4905 | } |
4906 | |
4907 | for (tryidx = arridx[depth]; l > 0; --l) { |
4908 | // Perform a binary search in the list of accepted bytes. |
4909 | len = byts[tryidx++]; |
4910 | c = *p++; |
4911 | lo = tryidx; |
4912 | hi = tryidx + len - 1; |
4913 | while (lo < hi) { |
4914 | m = (lo + hi) / 2; |
4915 | if (byts[m] > c) |
4916 | hi = m - 1; |
4917 | else if (byts[m] < c) |
4918 | lo = m + 1; |
4919 | else { |
4920 | lo = hi = m; |
4921 | break; |
4922 | } |
4923 | } |
4924 | |
4925 | // Stop if there is no matching byte. |
4926 | if (hi < lo || byts[lo] != c) |
4927 | break; |
4928 | |
4929 | // Continue at the child (if there is one). |
4930 | tryidx = idxs[lo]; |
4931 | } |
4932 | |
4933 | if (l == 0) { |
4934 | // Found the matching char. Copy it to "kword" and go a |
4935 | // level deeper. |
4936 | if (round[depth] == 1) { |
4937 | STRNCPY(kword + kwordlen[depth], fword + fwordidx[depth], |
4938 | flen); |
4939 | kwordlen[depth + 1] = kwordlen[depth] + flen; |
4940 | } else { |
4941 | STRNCPY(kword + kwordlen[depth], uword + uwordidx[depth], |
4942 | ulen); |
4943 | kwordlen[depth + 1] = kwordlen[depth] + ulen; |
4944 | } |
4945 | fwordidx[depth + 1] = fwordidx[depth] + flen; |
4946 | uwordidx[depth + 1] = uwordidx[depth] + ulen; |
4947 | |
4948 | ++depth; |
4949 | arridx[depth] = tryidx; |
4950 | round[depth] = 0; |
4951 | } |
4952 | } |
4953 | } |
4954 | |
4955 | // Didn't find it: "cannot happen". |
4956 | *kword = NUL; |
4957 | } |
4958 | |
4959 | // Compute the sound-a-like score for suggestions in su->su_ga and add them to |
4960 | // su->su_sga. |
4961 | static void score_comp_sal(suginfo_T *su) |
4962 | { |
4963 | langp_T *lp; |
4964 | char_u badsound[MAXWLEN]; |
4965 | int i; |
4966 | suggest_T *stp; |
4967 | suggest_T *sstp; |
4968 | int score; |
4969 | |
4970 | ga_grow(&su->su_sga, su->su_ga.ga_len); |
4971 | |
4972 | // Use the sound-folding of the first language that supports it. |
4973 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
4974 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
4975 | if (!GA_EMPTY(&lp->lp_slang->sl_sal)) { |
4976 | // soundfold the bad word |
4977 | spell_soundfold(lp->lp_slang, su->su_fbadword, true, badsound); |
4978 | |
4979 | for (i = 0; i < su->su_ga.ga_len; ++i) { |
4980 | stp = &SUG(su->su_ga, i); |
4981 | |
4982 | // Case-fold the suggested word, sound-fold it and compute the |
4983 | // sound-a-like score. |
4984 | score = stp_sal_score(stp, su, lp->lp_slang, badsound); |
4985 | if (score < SCORE_MAXMAX) { |
4986 | // Add the suggestion. |
4987 | sstp = &SUG(su->su_sga, su->su_sga.ga_len); |
4988 | sstp->st_word = vim_strsave(stp->st_word); |
4989 | sstp->st_wordlen = stp->st_wordlen; |
4990 | sstp->st_score = score; |
4991 | sstp->st_altscore = 0; |
4992 | sstp->st_orglen = stp->st_orglen; |
4993 | ++su->su_sga.ga_len; |
4994 | } |
4995 | } |
4996 | break; |
4997 | } |
4998 | } |
4999 | } |
5000 | |
5001 | // Combine the list of suggestions in su->su_ga and su->su_sga. |
5002 | // They are entwined. |
5003 | static void score_combine(suginfo_T *su) |
5004 | { |
5005 | garray_T ga; |
5006 | garray_T *gap; |
5007 | langp_T *lp; |
5008 | suggest_T *stp; |
5009 | char_u *p; |
5010 | char_u badsound[MAXWLEN]; |
5011 | int round; |
5012 | slang_T *slang = NULL; |
5013 | |
5014 | // Add the alternate score to su_ga. |
5015 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
5016 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
5017 | if (!GA_EMPTY(&lp->lp_slang->sl_sal)) { |
5018 | // soundfold the bad word |
5019 | slang = lp->lp_slang; |
5020 | spell_soundfold(slang, su->su_fbadword, true, badsound); |
5021 | |
5022 | for (int i = 0; i < su->su_ga.ga_len; ++i) { |
5023 | stp = &SUG(su->su_ga, i); |
5024 | stp->st_altscore = stp_sal_score(stp, su, slang, badsound); |
5025 | if (stp->st_altscore == SCORE_MAXMAX) |
5026 | stp->st_score = (stp->st_score * 3 + SCORE_BIG) / 4; |
5027 | else |
5028 | stp->st_score = (stp->st_score * 3 |
5029 | + stp->st_altscore) / 4; |
5030 | stp->st_salscore = false; |
5031 | } |
5032 | break; |
5033 | } |
5034 | } |
5035 | |
5036 | if (slang == NULL) { // Using "double" without sound folding. |
5037 | (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, |
5038 | su->su_maxcount); |
5039 | return; |
5040 | } |
5041 | |
5042 | // Add the alternate score to su_sga. |
5043 | for (int i = 0; i < su->su_sga.ga_len; ++i) { |
5044 | stp = &SUG(su->su_sga, i); |
5045 | stp->st_altscore = spell_edit_score(slang, |
5046 | su->su_badword, stp->st_word); |
5047 | if (stp->st_score == SCORE_MAXMAX) |
5048 | stp->st_score = (SCORE_BIG * 7 + stp->st_altscore) / 8; |
5049 | else |
5050 | stp->st_score = (stp->st_score * 7 + stp->st_altscore) / 8; |
5051 | stp->st_salscore = true; |
5052 | } |
5053 | |
5054 | // Remove bad suggestions, sort the suggestions and truncate at "maxcount" |
5055 | // for both lists. |
5056 | check_suggestions(su, &su->su_ga); |
5057 | (void)cleanup_suggestions(&su->su_ga, su->su_maxscore, su->su_maxcount); |
5058 | check_suggestions(su, &su->su_sga); |
5059 | (void)cleanup_suggestions(&su->su_sga, su->su_maxscore, su->su_maxcount); |
5060 | |
5061 | ga_init(&ga, (int)sizeof(suginfo_T), 1); |
5062 | ga_grow(&ga, su->su_ga.ga_len + su->su_sga.ga_len); |
5063 | |
5064 | stp = &SUG(ga, 0); |
5065 | for (int i = 0; i < su->su_ga.ga_len || i < su->su_sga.ga_len; ++i) { |
5066 | // round 1: get a suggestion from su_ga |
5067 | // round 2: get a suggestion from su_sga |
5068 | for (round = 1; round <= 2; ++round) { |
5069 | gap = round == 1 ? &su->su_ga : &su->su_sga; |
5070 | if (i < gap->ga_len) { |
5071 | // Don't add a word if it's already there. |
5072 | p = SUG(*gap, i).st_word; |
5073 | int j; |
5074 | for (j = 0; j < ga.ga_len; ++j) |
5075 | if (STRCMP(stp[j].st_word, p) == 0) |
5076 | break; |
5077 | if (j == ga.ga_len) |
5078 | stp[ga.ga_len++] = SUG(*gap, i); |
5079 | else |
5080 | xfree(p); |
5081 | } |
5082 | } |
5083 | } |
5084 | |
5085 | ga_clear(&su->su_ga); |
5086 | ga_clear(&su->su_sga); |
5087 | |
5088 | // Truncate the list to the number of suggestions that will be displayed. |
5089 | if (ga.ga_len > su->su_maxcount) { |
5090 | for (int i = su->su_maxcount; i < ga.ga_len; ++i) { |
5091 | xfree(stp[i].st_word); |
5092 | } |
5093 | ga.ga_len = su->su_maxcount; |
5094 | } |
5095 | |
5096 | su->su_ga = ga; |
5097 | } |
5098 | |
5099 | // For the goodword in "stp" compute the soundalike score compared to the |
5100 | // badword. |
5101 | static int |
5102 | stp_sal_score ( |
5103 | suggest_T *stp, |
5104 | suginfo_T *su, |
5105 | slang_T *slang, |
5106 | char_u *badsound // sound-folded badword |
5107 | ) |
5108 | { |
5109 | char_u *p; |
5110 | char_u *pbad; |
5111 | char_u *pgood; |
5112 | char_u badsound2[MAXWLEN]; |
5113 | char_u fword[MAXWLEN]; |
5114 | char_u goodsound[MAXWLEN]; |
5115 | char_u goodword[MAXWLEN]; |
5116 | int lendiff; |
5117 | |
5118 | lendiff = su->su_badlen - stp->st_orglen; |
5119 | if (lendiff >= 0) |
5120 | pbad = badsound; |
5121 | else { |
5122 | // soundfold the bad word with more characters following |
5123 | (void)spell_casefold(su->su_badptr, stp->st_orglen, fword, MAXWLEN); |
5124 | |
5125 | // When joining two words the sound often changes a lot. E.g., "t he" |
5126 | // sounds like "t h" while "the" sounds like "@". Avoid that by |
5127 | // removing the space. Don't do it when the good word also contains a |
5128 | // space. |
5129 | if (ascii_iswhite(su->su_badptr[su->su_badlen]) |
5130 | && *skiptowhite(stp->st_word) == NUL) |
5131 | for (p = fword; *(p = skiptowhite(p)) != NUL; ) |
5132 | STRMOVE(p, p + 1); |
5133 | |
5134 | spell_soundfold(slang, fword, true, badsound2); |
5135 | pbad = badsound2; |
5136 | } |
5137 | |
5138 | if (lendiff > 0 && stp->st_wordlen + lendiff < MAXWLEN) { |
5139 | // Add part of the bad word to the good word, so that we soundfold |
5140 | // what replaces the bad word. |
5141 | STRCPY(goodword, stp->st_word); |
5142 | STRLCPY(goodword + stp->st_wordlen, |
5143 | su->su_badptr + su->su_badlen - lendiff, lendiff + 1); |
5144 | pgood = goodword; |
5145 | } else |
5146 | pgood = stp->st_word; |
5147 | |
5148 | // Sound-fold the word and compute the score for the difference. |
5149 | spell_soundfold(slang, pgood, false, goodsound); |
5150 | |
5151 | return soundalike_score(goodsound, pbad); |
5152 | } |
5153 | |
5154 | static sftword_T dumsft; |
5155 | #define HIKEY2SFT(p) ((sftword_T *)(p - (dumsft.sft_word - (char_u *)&dumsft))) |
5156 | #define HI2SFT(hi) HIKEY2SFT((hi)->hi_key) |
5157 | |
5158 | // Prepare for calling suggest_try_soundalike(). |
5159 | static void suggest_try_soundalike_prep(void) |
5160 | { |
5161 | langp_T *lp; |
5162 | slang_T *slang; |
5163 | |
5164 | // Do this for all languages that support sound folding and for which a |
5165 | // .sug file has been loaded. |
5166 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
5167 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
5168 | slang = lp->lp_slang; |
5169 | if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) |
5170 | // prepare the hashtable used by add_sound_suggest() |
5171 | hash_init(&slang->sl_sounddone); |
5172 | } |
5173 | } |
5174 | |
5175 | // Find suggestions by comparing the word in a sound-a-like form. |
5176 | // Note: This doesn't support postponed prefixes. |
5177 | static void suggest_try_soundalike(suginfo_T *su) |
5178 | { |
5179 | char_u salword[MAXWLEN]; |
5180 | langp_T *lp; |
5181 | slang_T *slang; |
5182 | |
5183 | // Do this for all languages that support sound folding and for which a |
5184 | // .sug file has been loaded. |
5185 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
5186 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
5187 | slang = lp->lp_slang; |
5188 | if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) { |
5189 | // soundfold the bad word |
5190 | spell_soundfold(slang, su->su_fbadword, true, salword); |
5191 | |
5192 | // try all kinds of inserts/deletes/swaps/etc. |
5193 | // TODO: also soundfold the next words, so that we can try joining |
5194 | // and splitting |
5195 | #ifdef SUGGEST_PROFILE |
5196 | prof_init(); |
5197 | #endif |
5198 | suggest_trie_walk(su, lp, salword, true); |
5199 | #ifdef SUGGEST_PROFILE |
5200 | prof_report("soundalike" ); |
5201 | #endif |
5202 | } |
5203 | } |
5204 | } |
5205 | |
5206 | // Finish up after calling suggest_try_soundalike(). |
5207 | static void suggest_try_soundalike_finish(void) |
5208 | { |
5209 | langp_T *lp; |
5210 | slang_T *slang; |
5211 | int todo; |
5212 | hashitem_T *hi; |
5213 | |
5214 | // Do this for all languages that support sound folding and for which a |
5215 | // .sug file has been loaded. |
5216 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) { |
5217 | lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
5218 | slang = lp->lp_slang; |
5219 | if (!GA_EMPTY(&slang->sl_sal) && slang->sl_sbyts != NULL) { |
5220 | // Free the info about handled words. |
5221 | todo = (int)slang->sl_sounddone.ht_used; |
5222 | for (hi = slang->sl_sounddone.ht_array; todo > 0; ++hi) |
5223 | if (!HASHITEM_EMPTY(hi)) { |
5224 | xfree(HI2SFT(hi)); |
5225 | --todo; |
5226 | } |
5227 | |
5228 | // Clear the hashtable, it may also be used by another region. |
5229 | hash_clear(&slang->sl_sounddone); |
5230 | hash_init(&slang->sl_sounddone); |
5231 | } |
5232 | } |
5233 | } |
5234 | |
5235 | // A match with a soundfolded word is found. Add the good word(s) that |
5236 | // produce this soundfolded word. |
5237 | static void |
5238 | add_sound_suggest ( |
5239 | suginfo_T *su, |
5240 | char_u *goodword, |
5241 | int score, // soundfold score |
5242 | langp_T *lp |
5243 | ) |
5244 | { |
5245 | slang_T *slang = lp->lp_slang; // language for sound folding |
5246 | int sfwordnr; |
5247 | char_u *nrline; |
5248 | int orgnr; |
5249 | char_u theword[MAXWLEN]; |
5250 | int i; |
5251 | int wlen; |
5252 | char_u *byts; |
5253 | idx_T *idxs; |
5254 | int n; |
5255 | int wordcount; |
5256 | int wc; |
5257 | int goodscore; |
5258 | hash_T hash; |
5259 | hashitem_T *hi; |
5260 | sftword_T *sft; |
5261 | int bc, gc; |
5262 | int limit; |
5263 | |
5264 | // It's very well possible that the same soundfold word is found several |
5265 | // times with different scores. Since the following is quite slow only do |
5266 | // the words that have a better score than before. Use a hashtable to |
5267 | // remember the words that have been done. |
5268 | hash = hash_hash(goodword); |
5269 | const size_t goodword_len = STRLEN(goodword); |
5270 | hi = hash_lookup(&slang->sl_sounddone, (const char *)goodword, goodword_len, |
5271 | hash); |
5272 | if (HASHITEM_EMPTY(hi)) { |
5273 | sft = xmalloc(sizeof(sftword_T) + goodword_len); |
5274 | sft->sft_score = score; |
5275 | memcpy(sft->sft_word, goodword, goodword_len + 1); |
5276 | hash_add_item(&slang->sl_sounddone, hi, sft->sft_word, hash); |
5277 | } else { |
5278 | sft = HI2SFT(hi); |
5279 | if (score >= sft->sft_score) |
5280 | return; |
5281 | sft->sft_score = score; |
5282 | } |
5283 | |
5284 | // Find the word nr in the soundfold tree. |
5285 | sfwordnr = soundfold_find(slang, goodword); |
5286 | if (sfwordnr < 0) { |
5287 | internal_error("add_sound_suggest()" ); |
5288 | return; |
5289 | } |
5290 | |
5291 | // Go over the list of good words that produce this soundfold word |
5292 | nrline = ml_get_buf(slang->sl_sugbuf, (linenr_T)sfwordnr + 1, false); |
5293 | orgnr = 0; |
5294 | while (*nrline != NUL) { |
5295 | // The wordnr was stored in a minimal nr of bytes as an offset to the |
5296 | // previous wordnr. |
5297 | orgnr += bytes2offset(&nrline); |
5298 | |
5299 | byts = slang->sl_fbyts; |
5300 | idxs = slang->sl_fidxs; |
5301 | |
5302 | // Lookup the word "orgnr" one of the two tries. |
5303 | n = 0; |
5304 | wordcount = 0; |
5305 | for (wlen = 0; wlen < MAXWLEN - 3; ++wlen) { |
5306 | i = 1; |
5307 | if (wordcount == orgnr && byts[n + 1] == NUL) |
5308 | break; // found end of word |
5309 | |
5310 | if (byts[n + 1] == NUL) |
5311 | ++wordcount; |
5312 | |
5313 | // skip over the NUL bytes |
5314 | for (; byts[n + i] == NUL; ++i) |
5315 | if (i > byts[n]) { // safety check |
5316 | STRCPY(theword + wlen, "BAD" ); |
5317 | wlen += 3; |
5318 | goto badword; |
5319 | } |
5320 | |
5321 | // One of the siblings must have the word. |
5322 | for (; i < byts[n]; ++i) { |
5323 | wc = idxs[idxs[n + i]]; // nr of words under this byte |
5324 | if (wordcount + wc > orgnr) |
5325 | break; |
5326 | wordcount += wc; |
5327 | } |
5328 | |
5329 | theword[wlen] = byts[n + i]; |
5330 | n = idxs[n + i]; |
5331 | } |
5332 | badword: |
5333 | theword[wlen] = NUL; |
5334 | |
5335 | // Go over the possible flags and regions. |
5336 | for (; i <= byts[n] && byts[n + i] == NUL; ++i) { |
5337 | char_u cword[MAXWLEN]; |
5338 | char_u *p; |
5339 | int flags = (int)idxs[n + i]; |
5340 | |
5341 | // Skip words with the NOSUGGEST flag |
5342 | if (flags & WF_NOSUGGEST) |
5343 | continue; |
5344 | |
5345 | if (flags & WF_KEEPCAP) { |
5346 | // Must find the word in the keep-case tree. |
5347 | find_keepcap_word(slang, theword, cword); |
5348 | p = cword; |
5349 | } else { |
5350 | flags |= su->su_badflags; |
5351 | if ((flags & WF_CAPMASK) != 0) { |
5352 | // Need to fix case according to "flags". |
5353 | make_case_word(theword, cword, flags); |
5354 | p = cword; |
5355 | } else |
5356 | p = theword; |
5357 | } |
5358 | |
5359 | // Add the suggestion. |
5360 | if (sps_flags & SPS_DOUBLE) { |
5361 | // Add the suggestion if the score isn't too bad. |
5362 | if (score <= su->su_maxscore) |
5363 | add_suggestion(su, &su->su_sga, p, su->su_badlen, |
5364 | score, 0, false, slang, false); |
5365 | } else { |
5366 | // Add a penalty for words in another region. |
5367 | if ((flags & WF_REGION) |
5368 | && (((unsigned)flags >> 16) & lp->lp_region) == 0) |
5369 | goodscore = SCORE_REGION; |
5370 | else |
5371 | goodscore = 0; |
5372 | |
5373 | // Add a small penalty for changing the first letter from |
5374 | // lower to upper case. Helps for "tath" -> "Kath", which is |
5375 | // less common than "tath" -> "path". Don't do it when the |
5376 | // letter is the same, that has already been counted. |
5377 | gc = PTR2CHAR(p); |
5378 | if (SPELL_ISUPPER(gc)) { |
5379 | bc = PTR2CHAR(su->su_badword); |
5380 | if (!SPELL_ISUPPER(bc) |
5381 | && SPELL_TOFOLD(bc) != SPELL_TOFOLD(gc)) |
5382 | goodscore += SCORE_ICASE / 2; |
5383 | } |
5384 | |
5385 | // Compute the score for the good word. This only does letter |
5386 | // insert/delete/swap/replace. REP items are not considered, |
5387 | // which may make the score a bit higher. |
5388 | // Use a limit for the score to make it work faster. Use |
5389 | // MAXSCORE(), because RESCORE() will change the score. |
5390 | // If the limit is very high then the iterative method is |
5391 | // inefficient, using an array is quicker. |
5392 | limit = MAXSCORE(su->su_sfmaxscore - goodscore, score); |
5393 | if (limit > SCORE_LIMITMAX) |
5394 | goodscore += spell_edit_score(slang, su->su_badword, p); |
5395 | else |
5396 | goodscore += spell_edit_score_limit(slang, su->su_badword, |
5397 | p, limit); |
5398 | |
5399 | // When going over the limit don't bother to do the rest. |
5400 | if (goodscore < SCORE_MAXMAX) { |
5401 | // Give a bonus to words seen before. |
5402 | goodscore = score_wordcount_adj(slang, goodscore, p, false); |
5403 | |
5404 | // Add the suggestion if the score isn't too bad. |
5405 | goodscore = RESCORE(goodscore, score); |
5406 | if (goodscore <= su->su_sfmaxscore) |
5407 | add_suggestion(su, &su->su_ga, p, su->su_badlen, |
5408 | goodscore, score, true, slang, true); |
5409 | } |
5410 | } |
5411 | } |
5412 | } |
5413 | } |
5414 | |
5415 | // Find word "word" in fold-case tree for "slang" and return the word number. |
5416 | static int soundfold_find(slang_T *slang, char_u *word) |
5417 | { |
5418 | idx_T arridx = 0; |
5419 | int len; |
5420 | int wlen = 0; |
5421 | int c; |
5422 | char_u *ptr = word; |
5423 | char_u *byts; |
5424 | idx_T *idxs; |
5425 | int wordnr = 0; |
5426 | |
5427 | byts = slang->sl_sbyts; |
5428 | idxs = slang->sl_sidxs; |
5429 | |
5430 | for (;; ) { |
5431 | // First byte is the number of possible bytes. |
5432 | len = byts[arridx++]; |
5433 | |
5434 | // If the first possible byte is a zero the word could end here. |
5435 | // If the word ends we found the word. If not skip the NUL bytes. |
5436 | c = ptr[wlen]; |
5437 | if (byts[arridx] == NUL) { |
5438 | if (c == NUL) |
5439 | break; |
5440 | |
5441 | // Skip over the zeros, there can be several. |
5442 | while (len > 0 && byts[arridx] == NUL) { |
5443 | ++arridx; |
5444 | --len; |
5445 | } |
5446 | if (len == 0) |
5447 | return -1; // no children, word should have ended here |
5448 | ++wordnr; |
5449 | } |
5450 | |
5451 | // If the word ends we didn't find it. |
5452 | if (c == NUL) |
5453 | return -1; |
5454 | |
5455 | // Perform a binary search in the list of accepted bytes. |
5456 | if (c == TAB) // <Tab> is handled like <Space> |
5457 | c = ' '; |
5458 | while (byts[arridx] < c) { |
5459 | // The word count is in the first idxs[] entry of the child. |
5460 | wordnr += idxs[idxs[arridx]]; |
5461 | ++arridx; |
5462 | if (--len == 0) // end of the bytes, didn't find it |
5463 | return -1; |
5464 | } |
5465 | if (byts[arridx] != c) // didn't find the byte |
5466 | return -1; |
5467 | |
5468 | // Continue at the child (if there is one). |
5469 | arridx = idxs[arridx]; |
5470 | ++wlen; |
5471 | |
5472 | // One space in the good word may stand for several spaces in the |
5473 | // checked word. |
5474 | if (c == ' ') |
5475 | while (ptr[wlen] == ' ' || ptr[wlen] == TAB) |
5476 | ++wlen; |
5477 | } |
5478 | |
5479 | return wordnr; |
5480 | } |
5481 | |
5482 | // Copy "fword" to "cword", fixing case according to "flags". |
5483 | static void make_case_word(char_u *fword, char_u *cword, int flags) |
5484 | { |
5485 | if (flags & WF_ALLCAP) |
5486 | // Make it all upper-case |
5487 | allcap_copy(fword, cword); |
5488 | else if (flags & WF_ONECAP) |
5489 | // Make the first letter upper-case |
5490 | onecap_copy(fword, cword, true); |
5491 | else |
5492 | // Use goodword as-is. |
5493 | STRCPY(cword, fword); |
5494 | } |
5495 | |
5496 | // Returns true if "c1" and "c2" are similar characters according to the MAP |
5497 | // lines in the .aff file. |
5498 | static bool similar_chars(slang_T *slang, int c1, int c2) |
5499 | { |
5500 | int m1, m2; |
5501 | char_u buf[MB_MAXBYTES + 1]; |
5502 | hashitem_T *hi; |
5503 | |
5504 | if (c1 >= 256) { |
5505 | buf[utf_char2bytes(c1, buf)] = 0; |
5506 | hi = hash_find(&slang->sl_map_hash, buf); |
5507 | if (HASHITEM_EMPTY(hi)) { |
5508 | m1 = 0; |
5509 | } else { |
5510 | m1 = utf_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); |
5511 | } |
5512 | } else { |
5513 | m1 = slang->sl_map_array[c1]; |
5514 | } |
5515 | if (m1 == 0) { |
5516 | return false; |
5517 | } |
5518 | |
5519 | if (c2 >= 256) { |
5520 | buf[utf_char2bytes(c2, buf)] = 0; |
5521 | hi = hash_find(&slang->sl_map_hash, buf); |
5522 | if (HASHITEM_EMPTY(hi)) { |
5523 | m2 = 0; |
5524 | } else { |
5525 | m2 = utf_ptr2char(hi->hi_key + STRLEN(hi->hi_key) + 1); |
5526 | } |
5527 | } else { |
5528 | m2 = slang->sl_map_array[c2]; |
5529 | } |
5530 | |
5531 | return m1 == m2; |
5532 | } |
5533 | |
5534 | // Adds a suggestion to the list of suggestions. |
5535 | // For a suggestion that is already in the list the lowest score is remembered. |
5536 | static void |
5537 | add_suggestion ( |
5538 | suginfo_T *su, |
5539 | garray_T *gap, // either su_ga or su_sga |
5540 | const char_u *goodword, |
5541 | int badlenarg, // len of bad word replaced with "goodword" |
5542 | int score, |
5543 | int altscore, |
5544 | bool had_bonus, // value for st_had_bonus |
5545 | slang_T *slang, // language for sound folding |
5546 | bool maxsf // su_maxscore applies to soundfold score, |
5547 | // su_sfmaxscore to the total score. |
5548 | ) |
5549 | { |
5550 | int goodlen; // len of goodword changed |
5551 | int badlen; // len of bad word changed |
5552 | suggest_T *stp; |
5553 | suggest_T new_sug; |
5554 | |
5555 | // Minimize "badlen" for consistency. Avoids that changing "the the" to |
5556 | // "thee the" is added next to changing the first "the" the "thee". |
5557 | const char_u *pgood = goodword + STRLEN(goodword); |
5558 | char_u *pbad = su->su_badptr + badlenarg; |
5559 | for (;; ) { |
5560 | goodlen = (int)(pgood - goodword); |
5561 | badlen = (int)(pbad - su->su_badptr); |
5562 | if (goodlen <= 0 || badlen <= 0) |
5563 | break; |
5564 | MB_PTR_BACK(goodword, pgood); |
5565 | MB_PTR_BACK(su->su_badptr, pbad); |
5566 | if (utf_ptr2char(pgood) != utf_ptr2char(pbad)) { |
5567 | break; |
5568 | } |
5569 | } |
5570 | |
5571 | if (badlen == 0 && goodlen == 0) |
5572 | // goodword doesn't change anything; may happen for "the the" changing |
5573 | // the first "the" to itself. |
5574 | return; |
5575 | |
5576 | int i; |
5577 | if (GA_EMPTY(gap)) { |
5578 | i = -1; |
5579 | } else { |
5580 | // Check if the word is already there. Also check the length that is |
5581 | // being replaced "thes," -> "these" is a different suggestion from |
5582 | // "thes" -> "these". |
5583 | stp = &SUG(*gap, 0); |
5584 | for (i = gap->ga_len; --i >= 0; ++stp) { |
5585 | if (stp->st_wordlen == goodlen |
5586 | && stp->st_orglen == badlen |
5587 | && STRNCMP(stp->st_word, goodword, goodlen) == 0) { |
5588 | // Found it. Remember the word with the lowest score. |
5589 | if (stp->st_slang == NULL) |
5590 | stp->st_slang = slang; |
5591 | |
5592 | new_sug.st_score = score; |
5593 | new_sug.st_altscore = altscore; |
5594 | new_sug.st_had_bonus = had_bonus; |
5595 | |
5596 | if (stp->st_had_bonus != had_bonus) { |
5597 | // Only one of the two had the soundalike score computed. |
5598 | // Need to do that for the other one now, otherwise the |
5599 | // scores can't be compared. This happens because |
5600 | // suggest_try_change() doesn't compute the soundalike |
5601 | // word to keep it fast, while some special methods set |
5602 | // the soundalike score to zero. |
5603 | if (had_bonus) |
5604 | rescore_one(su, stp); |
5605 | else { |
5606 | new_sug.st_word = stp->st_word; |
5607 | new_sug.st_wordlen = stp->st_wordlen; |
5608 | new_sug.st_slang = stp->st_slang; |
5609 | new_sug.st_orglen = badlen; |
5610 | rescore_one(su, &new_sug); |
5611 | } |
5612 | } |
5613 | |
5614 | if (stp->st_score > new_sug.st_score) { |
5615 | stp->st_score = new_sug.st_score; |
5616 | stp->st_altscore = new_sug.st_altscore; |
5617 | stp->st_had_bonus = new_sug.st_had_bonus; |
5618 | } |
5619 | break; |
5620 | } |
5621 | } |
5622 | } |
5623 | |
5624 | if (i < 0) { |
5625 | // Add a suggestion. |
5626 | stp = GA_APPEND_VIA_PTR(suggest_T, gap); |
5627 | stp->st_word = vim_strnsave(goodword, goodlen); |
5628 | stp->st_wordlen = goodlen; |
5629 | stp->st_score = score; |
5630 | stp->st_altscore = altscore; |
5631 | stp->st_had_bonus = had_bonus; |
5632 | stp->st_orglen = badlen; |
5633 | stp->st_slang = slang; |
5634 | |
5635 | // If we have too many suggestions now, sort the list and keep |
5636 | // the best suggestions. |
5637 | if (gap->ga_len > SUG_MAX_COUNT(su)) { |
5638 | if (maxsf) |
5639 | su->su_sfmaxscore = cleanup_suggestions(gap, |
5640 | su->su_sfmaxscore, SUG_CLEAN_COUNT(su)); |
5641 | else |
5642 | su->su_maxscore = cleanup_suggestions(gap, |
5643 | su->su_maxscore, SUG_CLEAN_COUNT(su)); |
5644 | } |
5645 | } |
5646 | } |
5647 | |
5648 | // Suggestions may in fact be flagged as errors. Esp. for banned words and |
5649 | // for split words, such as "the the". Remove these from the list here. |
5650 | static void |
5651 | check_suggestions ( |
5652 | suginfo_T *su, |
5653 | garray_T *gap // either su_ga or su_sga |
5654 | ) |
5655 | { |
5656 | suggest_T *stp; |
5657 | char_u longword[MAXWLEN + 1]; |
5658 | int len; |
5659 | hlf_T attr; |
5660 | |
5661 | stp = &SUG(*gap, 0); |
5662 | for (int i = gap->ga_len - 1; i >= 0; --i) { |
5663 | // Need to append what follows to check for "the the". |
5664 | STRLCPY(longword, stp[i].st_word, MAXWLEN + 1); |
5665 | len = stp[i].st_wordlen; |
5666 | STRLCPY(longword + len, su->su_badptr + stp[i].st_orglen, |
5667 | MAXWLEN - len + 1); |
5668 | attr = HLF_COUNT; |
5669 | (void)spell_check(curwin, longword, &attr, NULL, false); |
5670 | if (attr != HLF_COUNT) { |
5671 | // Remove this entry. |
5672 | xfree(stp[i].st_word); |
5673 | --gap->ga_len; |
5674 | if (i < gap->ga_len) |
5675 | memmove(stp + i, stp + i + 1, |
5676 | sizeof(suggest_T) * (gap->ga_len - i)); |
5677 | } |
5678 | } |
5679 | } |
5680 | |
5681 | |
5682 | // Add a word to be banned. |
5683 | static void add_banned(suginfo_T *su, char_u *word) |
5684 | { |
5685 | char_u *s; |
5686 | hash_T hash; |
5687 | hashitem_T *hi; |
5688 | |
5689 | hash = hash_hash(word); |
5690 | const size_t word_len = STRLEN(word); |
5691 | hi = hash_lookup(&su->su_banned, (const char *)word, word_len, hash); |
5692 | if (HASHITEM_EMPTY(hi)) { |
5693 | s = xmemdupz(word, word_len); |
5694 | hash_add_item(&su->su_banned, hi, s, hash); |
5695 | } |
5696 | } |
5697 | |
5698 | // Recompute the score for all suggestions if sound-folding is possible. This |
5699 | // is slow, thus only done for the final results. |
5700 | static void rescore_suggestions(suginfo_T *su) |
5701 | { |
5702 | if (su->su_sallang != NULL) { |
5703 | for (int i = 0; i < su->su_ga.ga_len; ++i) { |
5704 | rescore_one(su, &SUG(su->su_ga, i)); |
5705 | } |
5706 | } |
5707 | } |
5708 | |
5709 | // Recompute the score for one suggestion if sound-folding is possible. |
5710 | static void rescore_one(suginfo_T *su, suggest_T *stp) |
5711 | { |
5712 | slang_T *slang = stp->st_slang; |
5713 | char_u sal_badword[MAXWLEN]; |
5714 | char_u *p; |
5715 | |
5716 | // Only rescore suggestions that have no sal score yet and do have a |
5717 | // language. |
5718 | if (slang != NULL && !GA_EMPTY(&slang->sl_sal) && !stp->st_had_bonus) { |
5719 | if (slang == su->su_sallang) |
5720 | p = su->su_sal_badword; |
5721 | else { |
5722 | spell_soundfold(slang, su->su_fbadword, true, sal_badword); |
5723 | p = sal_badword; |
5724 | } |
5725 | |
5726 | stp->st_altscore = stp_sal_score(stp, su, slang, p); |
5727 | if (stp->st_altscore == SCORE_MAXMAX) |
5728 | stp->st_altscore = SCORE_BIG; |
5729 | stp->st_score = RESCORE(stp->st_score, stp->st_altscore); |
5730 | stp->st_had_bonus = true; |
5731 | } |
5732 | } |
5733 | |
5734 | |
5735 | // Function given to qsort() to sort the suggestions on st_score. |
5736 | // First on "st_score", then "st_altscore" then alphabetically. |
5737 | static int sug_compare(const void *s1, const void *s2) |
5738 | { |
5739 | suggest_T *p1 = (suggest_T *)s1; |
5740 | suggest_T *p2 = (suggest_T *)s2; |
5741 | int n = p1->st_score - p2->st_score; |
5742 | |
5743 | if (n == 0) { |
5744 | n = p1->st_altscore - p2->st_altscore; |
5745 | if (n == 0) |
5746 | n = STRICMP(p1->st_word, p2->st_word); |
5747 | } |
5748 | return n; |
5749 | } |
5750 | |
5751 | // Cleanup the suggestions: |
5752 | // - Sort on score. |
5753 | // - Remove words that won't be displayed. |
5754 | // Returns the maximum score in the list or "maxscore" unmodified. |
5755 | static int |
5756 | cleanup_suggestions ( |
5757 | garray_T *gap, |
5758 | int maxscore, |
5759 | int keep // nr of suggestions to keep |
5760 | ) |
5761 | { |
5762 | suggest_T *stp = &SUG(*gap, 0); |
5763 | |
5764 | // Sort the list. |
5765 | qsort(gap->ga_data, (size_t)gap->ga_len, sizeof(suggest_T), sug_compare); |
5766 | |
5767 | // Truncate the list to the number of suggestions that will be displayed. |
5768 | if (gap->ga_len > keep) { |
5769 | for (int i = keep; i < gap->ga_len; ++i) { |
5770 | xfree(stp[i].st_word); |
5771 | } |
5772 | gap->ga_len = keep; |
5773 | return stp[keep - 1].st_score; |
5774 | } |
5775 | return maxscore; |
5776 | } |
5777 | |
5778 | /// Soundfold a string, for soundfold() |
5779 | /// |
5780 | /// @param[in] word Word to soundfold. |
5781 | /// |
5782 | /// @return [allocated] soundfolded string or NULL in case of error. May return |
5783 | /// copy of the input string if soundfolding is not |
5784 | /// supported by any of the languages in &spellang. |
5785 | char *eval_soundfold(const char *const word) |
5786 | FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_MALLOC FUNC_ATTR_NONNULL_ALL |
5787 | { |
5788 | if (curwin->w_p_spell && *curwin->w_s->b_p_spl != NUL) { |
5789 | // Use the sound-folding of the first language that supports it. |
5790 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; lpi++) { |
5791 | langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
5792 | if (!GA_EMPTY(&lp->lp_slang->sl_sal)) { |
5793 | // soundfold the word |
5794 | char_u sound[MAXWLEN]; |
5795 | spell_soundfold(lp->lp_slang, (char_u *)word, false, sound); |
5796 | return xstrdup((const char *)sound); |
5797 | } |
5798 | } |
5799 | } |
5800 | |
5801 | // No language with sound folding, return word as-is. |
5802 | return xstrdup(word); |
5803 | } |
5804 | |
5805 | /// Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". |
5806 | /// |
5807 | /// There are many ways to turn a word into a sound-a-like representation. The |
5808 | /// oldest is Soundex (1918!). A nice overview can be found in "Approximate |
5809 | /// swedish name matching - survey and test of different algorithms" by Klas |
5810 | /// Erikson. |
5811 | /// |
5812 | /// We support two methods: |
5813 | /// 1. SOFOFROM/SOFOTO do a simple character mapping. |
5814 | /// 2. SAL items define a more advanced sound-folding (and much slower). |
5815 | /// |
5816 | /// @param[in] slang |
5817 | /// @param[in] inword word to soundfold |
5818 | /// @param[in] folded whether inword is already case-folded |
5819 | /// @param[in,out] res destination for soundfolded word |
5820 | void spell_soundfold(slang_T *slang, char_u *inword, bool folded, char_u *res) |
5821 | { |
5822 | char_u fword[MAXWLEN]; |
5823 | char_u *word; |
5824 | |
5825 | if (slang->sl_sofo) |
5826 | // SOFOFROM and SOFOTO used |
5827 | spell_soundfold_sofo(slang, inword, res); |
5828 | else { |
5829 | // SAL items used. Requires the word to be case-folded. |
5830 | if (folded) |
5831 | word = inword; |
5832 | else { |
5833 | (void)spell_casefold(inword, (int)STRLEN(inword), fword, MAXWLEN); |
5834 | word = fword; |
5835 | } |
5836 | |
5837 | if (has_mbyte) |
5838 | spell_soundfold_wsal(slang, word, res); |
5839 | else |
5840 | spell_soundfold_sal(slang, word, res); |
5841 | } |
5842 | } |
5843 | |
5844 | // Perform sound folding of "inword" into "res" according to SOFOFROM and |
5845 | // SOFOTO lines. |
5846 | static void spell_soundfold_sofo(slang_T *slang, char_u *inword, char_u *res) |
5847 | { |
5848 | char_u *s; |
5849 | int ri = 0; |
5850 | int c; |
5851 | |
5852 | if (has_mbyte) { |
5853 | int prevc = 0; |
5854 | int *ip; |
5855 | |
5856 | // The sl_sal_first[] table contains the translation for chars up to |
5857 | // 255, sl_sal the rest. |
5858 | for (s = inword; *s != NUL; ) { |
5859 | c = mb_cptr2char_adv((const char_u **)&s); |
5860 | if (enc_utf8 ? utf_class(c) == 0 : ascii_iswhite(c)) { |
5861 | c = ' '; |
5862 | } else if (c < 256) { |
5863 | c = slang->sl_sal_first[c]; |
5864 | } else { |
5865 | ip = ((int **)slang->sl_sal.ga_data)[c & 0xff]; |
5866 | if (ip == NULL) // empty list, can't match |
5867 | c = NUL; |
5868 | else |
5869 | for (;; ) { // find "c" in the list |
5870 | if (*ip == 0) { // not found |
5871 | c = NUL; |
5872 | break; |
5873 | } |
5874 | if (*ip == c) { // match! |
5875 | c = ip[1]; |
5876 | break; |
5877 | } |
5878 | ip += 2; |
5879 | } |
5880 | } |
5881 | |
5882 | if (c != NUL && c != prevc) { |
5883 | ri += utf_char2bytes(c, res + ri); |
5884 | if (ri + MB_MAXBYTES > MAXWLEN) { |
5885 | break; |
5886 | } |
5887 | prevc = c; |
5888 | } |
5889 | } |
5890 | } else { |
5891 | // The sl_sal_first[] table contains the translation. |
5892 | for (s = inword; (c = *s) != NUL; ++s) { |
5893 | if (ascii_iswhite(c)) |
5894 | c = ' '; |
5895 | else |
5896 | c = slang->sl_sal_first[c]; |
5897 | if (c != NUL && (ri == 0 || res[ri - 1] != c)) |
5898 | res[ri++] = c; |
5899 | } |
5900 | } |
5901 | |
5902 | res[ri] = NUL; |
5903 | } |
5904 | |
5905 | static void spell_soundfold_sal(slang_T *slang, char_u *inword, char_u *res) |
5906 | { |
5907 | salitem_T *smp; |
5908 | char_u word[MAXWLEN]; |
5909 | char_u *s = inword; |
5910 | char_u *t; |
5911 | char_u *pf; |
5912 | int i, j, z; |
5913 | int reslen; |
5914 | int n, k = 0; |
5915 | int z0; |
5916 | int k0; |
5917 | int n0; |
5918 | int c; |
5919 | int pri; |
5920 | int p0 = -333; |
5921 | int c0; |
5922 | |
5923 | // Remove accents, if wanted. We actually remove all non-word characters. |
5924 | // But keep white space. We need a copy, the word may be changed here. |
5925 | if (slang->sl_rem_accents) { |
5926 | t = word; |
5927 | while (*s != NUL) { |
5928 | if (ascii_iswhite(*s)) { |
5929 | *t++ = ' '; |
5930 | s = skipwhite(s); |
5931 | } else { |
5932 | if (spell_iswordp_nmw(s, curwin)) |
5933 | *t++ = *s; |
5934 | ++s; |
5935 | } |
5936 | } |
5937 | *t = NUL; |
5938 | } else |
5939 | STRLCPY(word, s, MAXWLEN); |
5940 | |
5941 | smp = (salitem_T *)slang->sl_sal.ga_data; |
5942 | |
5943 | // This comes from Aspell phonet.cpp. Converted from C++ to C. |
5944 | // Changed to keep spaces. |
5945 | i = reslen = z = 0; |
5946 | while ((c = word[i]) != NUL) { |
5947 | // Start with the first rule that has the character in the word. |
5948 | n = slang->sl_sal_first[c]; |
5949 | z0 = 0; |
5950 | |
5951 | if (n >= 0) { |
5952 | // check all rules for the same letter |
5953 | for (; (s = smp[n].sm_lead)[0] == c; ++n) { |
5954 | // Quickly skip entries that don't match the word. Most |
5955 | // entries are less then three chars, optimize for that. |
5956 | k = smp[n].sm_leadlen; |
5957 | if (k > 1) { |
5958 | if (word[i + 1] != s[1]) |
5959 | continue; |
5960 | if (k > 2) { |
5961 | for (j = 2; j < k; ++j) |
5962 | if (word[i + j] != s[j]) |
5963 | break; |
5964 | if (j < k) |
5965 | continue; |
5966 | } |
5967 | } |
5968 | |
5969 | if ((pf = smp[n].sm_oneof) != NULL) { |
5970 | // Check for match with one of the chars in "sm_oneof". |
5971 | while (*pf != NUL && *pf != word[i + k]) |
5972 | ++pf; |
5973 | if (*pf == NUL) |
5974 | continue; |
5975 | ++k; |
5976 | } |
5977 | s = smp[n].sm_rules; |
5978 | pri = 5; // default priority |
5979 | |
5980 | p0 = *s; |
5981 | k0 = k; |
5982 | while (*s == '-' && k > 1) { |
5983 | k--; |
5984 | s++; |
5985 | } |
5986 | if (*s == '<') |
5987 | s++; |
5988 | if (ascii_isdigit(*s)) { |
5989 | // determine priority |
5990 | pri = *s - '0'; |
5991 | s++; |
5992 | } |
5993 | if (*s == '^' && *(s + 1) == '^') |
5994 | s++; |
5995 | |
5996 | if (*s == NUL |
5997 | || (*s == '^' |
5998 | && (i == 0 || !(word[i - 1] == ' ' |
5999 | || spell_iswordp(word + i - 1, curwin))) |
6000 | && (*(s + 1) != '$' |
6001 | || (!spell_iswordp(word + i + k0, curwin)))) |
6002 | || (*s == '$' && i > 0 |
6003 | && spell_iswordp(word + i - 1, curwin) |
6004 | && (!spell_iswordp(word + i + k0, curwin)))) { |
6005 | // search for followup rules, if: |
6006 | // followup and k > 1 and NO '-' in searchstring |
6007 | c0 = word[i + k - 1]; |
6008 | n0 = slang->sl_sal_first[c0]; |
6009 | |
6010 | if (slang->sl_followup && k > 1 && n0 >= 0 |
6011 | && p0 != '-' && word[i + k] != NUL) { |
6012 | // test follow-up rule for "word[i + k]" |
6013 | for (; (s = smp[n0].sm_lead)[0] == c0; ++n0) { |
6014 | // Quickly skip entries that don't match the word. |
6015 | k0 = smp[n0].sm_leadlen; |
6016 | if (k0 > 1) { |
6017 | if (word[i + k] != s[1]) |
6018 | continue; |
6019 | if (k0 > 2) { |
6020 | pf = word + i + k + 1; |
6021 | for (j = 2; j < k0; ++j) |
6022 | if (*pf++ != s[j]) |
6023 | break; |
6024 | if (j < k0) |
6025 | continue; |
6026 | } |
6027 | } |
6028 | k0 += k - 1; |
6029 | |
6030 | if ((pf = smp[n0].sm_oneof) != NULL) { |
6031 | // Check for match with one of the chars in |
6032 | // "sm_oneof". |
6033 | while (*pf != NUL && *pf != word[i + k0]) |
6034 | ++pf; |
6035 | if (*pf == NUL) |
6036 | continue; |
6037 | ++k0; |
6038 | } |
6039 | |
6040 | p0 = 5; |
6041 | s = smp[n0].sm_rules; |
6042 | while (*s == '-') { |
6043 | // "k0" gets NOT reduced because |
6044 | // "if (k0 == k)" |
6045 | s++; |
6046 | } |
6047 | if (*s == '<') |
6048 | s++; |
6049 | if (ascii_isdigit(*s)) { |
6050 | p0 = *s - '0'; |
6051 | s++; |
6052 | } |
6053 | |
6054 | if (*s == NUL |
6055 | // *s == '^' cuts |
6056 | || (*s == '$' |
6057 | && !spell_iswordp(word + i + k0, |
6058 | curwin))) { |
6059 | if (k0 == k) |
6060 | // this is just a piece of the string |
6061 | continue; |
6062 | |
6063 | if (p0 < pri) |
6064 | // priority too low |
6065 | continue; |
6066 | // rule fits; stop search |
6067 | break; |
6068 | } |
6069 | } |
6070 | |
6071 | if (p0 >= pri && smp[n0].sm_lead[0] == c0) |
6072 | continue; |
6073 | } |
6074 | |
6075 | // replace string |
6076 | s = smp[n].sm_to; |
6077 | if (s == NULL) |
6078 | s = (char_u *)"" ; |
6079 | pf = smp[n].sm_rules; |
6080 | p0 = (vim_strchr(pf, '<') != NULL) ? 1 : 0; |
6081 | if (p0 == 1 && z == 0) { |
6082 | // rule with '<' is used |
6083 | if (reslen > 0 && *s != NUL && (res[reslen - 1] == c |
6084 | || res[reslen - 1] == *s)) |
6085 | reslen--; |
6086 | z0 = 1; |
6087 | z = 1; |
6088 | k0 = 0; |
6089 | while (*s != NUL && word[i + k0] != NUL) { |
6090 | word[i + k0] = *s; |
6091 | k0++; |
6092 | s++; |
6093 | } |
6094 | if (k > k0) |
6095 | STRMOVE(word + i + k0, word + i + k); |
6096 | |
6097 | // new "actual letter" |
6098 | c = word[i]; |
6099 | } else { |
6100 | // no '<' rule used |
6101 | i += k - 1; |
6102 | z = 0; |
6103 | while (*s != NUL && s[1] != NUL && reslen < MAXWLEN) { |
6104 | if (reslen == 0 || res[reslen - 1] != *s) |
6105 | res[reslen++] = *s; |
6106 | s++; |
6107 | } |
6108 | // new "actual letter" |
6109 | c = *s; |
6110 | if (strstr((char *)pf, "^^" ) != NULL) { |
6111 | if (c != NUL) |
6112 | res[reslen++] = c; |
6113 | STRMOVE(word, word + i + 1); |
6114 | i = 0; |
6115 | z0 = 1; |
6116 | } |
6117 | } |
6118 | break; |
6119 | } |
6120 | } |
6121 | } else if (ascii_iswhite(c)) { |
6122 | c = ' '; |
6123 | k = 1; |
6124 | } |
6125 | |
6126 | if (z0 == 0) { |
6127 | if (k && !p0 && reslen < MAXWLEN && c != NUL |
6128 | && (!slang->sl_collapse || reslen == 0 |
6129 | || res[reslen - 1] != c)) |
6130 | // condense only double letters |
6131 | res[reslen++] = c; |
6132 | |
6133 | i++; |
6134 | z = 0; |
6135 | k = 0; |
6136 | } |
6137 | } |
6138 | |
6139 | res[reslen] = NUL; |
6140 | } |
6141 | |
6142 | // Turn "inword" into its sound-a-like equivalent in "res[MAXWLEN]". |
6143 | // Multi-byte version of spell_soundfold(). |
6144 | static void spell_soundfold_wsal(slang_T *slang, char_u *inword, char_u *res) |
6145 | { |
6146 | salitem_T *smp = (salitem_T *)slang->sl_sal.ga_data; |
6147 | int word[MAXWLEN]; |
6148 | int wres[MAXWLEN]; |
6149 | int l; |
6150 | int *ws; |
6151 | int *pf; |
6152 | int i, j, z; |
6153 | int reslen; |
6154 | int n, k = 0; |
6155 | int z0; |
6156 | int k0; |
6157 | int n0; |
6158 | int c; |
6159 | int pri; |
6160 | int p0 = -333; |
6161 | int c0; |
6162 | bool did_white = false; |
6163 | int wordlen; |
6164 | |
6165 | |
6166 | // Convert the multi-byte string to a wide-character string. |
6167 | // Remove accents, if wanted. We actually remove all non-word characters. |
6168 | // But keep white space. |
6169 | wordlen = 0; |
6170 | for (const char_u *s = inword; *s != NUL; ) { |
6171 | const char_u *t = s; |
6172 | c = mb_cptr2char_adv((const char_u **)&s); |
6173 | if (slang->sl_rem_accents) { |
6174 | if (enc_utf8 ? utf_class(c) == 0 : ascii_iswhite(c)) { |
6175 | if (did_white) |
6176 | continue; |
6177 | c = ' '; |
6178 | did_white = true; |
6179 | } else { |
6180 | did_white = false; |
6181 | if (!spell_iswordp_nmw(t, curwin)) { |
6182 | continue; |
6183 | } |
6184 | } |
6185 | } |
6186 | word[wordlen++] = c; |
6187 | } |
6188 | word[wordlen] = NUL; |
6189 | |
6190 | // This algorithm comes from Aspell phonet.cpp. |
6191 | // Converted from C++ to C. Added support for multi-byte chars. |
6192 | // Changed to keep spaces. |
6193 | i = reslen = z = 0; |
6194 | while ((c = word[i]) != NUL) { |
6195 | // Start with the first rule that has the character in the word. |
6196 | n = slang->sl_sal_first[c & 0xff]; |
6197 | z0 = 0; |
6198 | |
6199 | if (n >= 0) { |
6200 | // Check all rules for the same index byte. |
6201 | // If c is 0x300 need extra check for the end of the array, as |
6202 | // (c & 0xff) is NUL. |
6203 | for (; ((ws = smp[n].sm_lead_w)[0] & 0xff) == (c & 0xff) |
6204 | && ws[0] != NUL; ++n) { |
6205 | // Quickly skip entries that don't match the word. Most |
6206 | // entries are less then three chars, optimize for that. |
6207 | if (c != ws[0]) |
6208 | continue; |
6209 | k = smp[n].sm_leadlen; |
6210 | if (k > 1) { |
6211 | if (word[i + 1] != ws[1]) |
6212 | continue; |
6213 | if (k > 2) { |
6214 | for (j = 2; j < k; ++j) |
6215 | if (word[i + j] != ws[j]) |
6216 | break; |
6217 | if (j < k) |
6218 | continue; |
6219 | } |
6220 | } |
6221 | |
6222 | if ((pf = smp[n].sm_oneof_w) != NULL) { |
6223 | // Check for match with one of the chars in "sm_oneof". |
6224 | while (*pf != NUL && *pf != word[i + k]) |
6225 | ++pf; |
6226 | if (*pf == NUL) |
6227 | continue; |
6228 | ++k; |
6229 | } |
6230 | char_u *s = smp[n].sm_rules; |
6231 | pri = 5; // default priority |
6232 | |
6233 | p0 = *s; |
6234 | k0 = k; |
6235 | while (*s == '-' && k > 1) { |
6236 | k--; |
6237 | s++; |
6238 | } |
6239 | if (*s == '<') |
6240 | s++; |
6241 | if (ascii_isdigit(*s)) { |
6242 | // determine priority |
6243 | pri = *s - '0'; |
6244 | s++; |
6245 | } |
6246 | if (*s == '^' && *(s + 1) == '^') |
6247 | s++; |
6248 | |
6249 | if (*s == NUL |
6250 | || (*s == '^' |
6251 | && (i == 0 || !(word[i - 1] == ' ' |
6252 | || spell_iswordp_w(word + i - 1, curwin))) |
6253 | && (*(s + 1) != '$' |
6254 | || (!spell_iswordp_w(word + i + k0, curwin)))) |
6255 | || (*s == '$' && i > 0 |
6256 | && spell_iswordp_w(word + i - 1, curwin) |
6257 | && (!spell_iswordp_w(word + i + k0, curwin)))) { |
6258 | // search for followup rules, if: |
6259 | // followup and k > 1 and NO '-' in searchstring |
6260 | c0 = word[i + k - 1]; |
6261 | n0 = slang->sl_sal_first[c0 & 0xff]; |
6262 | |
6263 | if (slang->sl_followup && k > 1 && n0 >= 0 |
6264 | && p0 != '-' && word[i + k] != NUL) { |
6265 | // Test follow-up rule for "word[i + k]"; loop over |
6266 | // all entries with the same index byte. |
6267 | for (; ((ws = smp[n0].sm_lead_w)[0] & 0xff) |
6268 | == (c0 & 0xff); ++n0) { |
6269 | // Quickly skip entries that don't match the word. |
6270 | if (c0 != ws[0]) |
6271 | continue; |
6272 | k0 = smp[n0].sm_leadlen; |
6273 | if (k0 > 1) { |
6274 | if (word[i + k] != ws[1]) |
6275 | continue; |
6276 | if (k0 > 2) { |
6277 | pf = word + i + k + 1; |
6278 | for (j = 2; j < k0; ++j) |
6279 | if (*pf++ != ws[j]) |
6280 | break; |
6281 | if (j < k0) |
6282 | continue; |
6283 | } |
6284 | } |
6285 | k0 += k - 1; |
6286 | |
6287 | if ((pf = smp[n0].sm_oneof_w) != NULL) { |
6288 | // Check for match with one of the chars in |
6289 | // "sm_oneof". |
6290 | while (*pf != NUL && *pf != word[i + k0]) |
6291 | ++pf; |
6292 | if (*pf == NUL) |
6293 | continue; |
6294 | ++k0; |
6295 | } |
6296 | |
6297 | p0 = 5; |
6298 | s = smp[n0].sm_rules; |
6299 | while (*s == '-') { |
6300 | // "k0" gets NOT reduced because |
6301 | // "if (k0 == k)" |
6302 | s++; |
6303 | } |
6304 | if (*s == '<') |
6305 | s++; |
6306 | if (ascii_isdigit(*s)) { |
6307 | p0 = *s - '0'; |
6308 | s++; |
6309 | } |
6310 | |
6311 | if (*s == NUL |
6312 | // *s == '^' cuts |
6313 | || (*s == '$' |
6314 | && !spell_iswordp_w(word + i + k0, |
6315 | curwin))) { |
6316 | if (k0 == k) |
6317 | // this is just a piece of the string |
6318 | continue; |
6319 | |
6320 | if (p0 < pri) |
6321 | // priority too low |
6322 | continue; |
6323 | // rule fits; stop search |
6324 | break; |
6325 | } |
6326 | } |
6327 | |
6328 | if (p0 >= pri && (smp[n0].sm_lead_w[0] & 0xff) |
6329 | == (c0 & 0xff)) |
6330 | continue; |
6331 | } |
6332 | |
6333 | // replace string |
6334 | ws = smp[n].sm_to_w; |
6335 | s = smp[n].sm_rules; |
6336 | p0 = (vim_strchr(s, '<') != NULL) ? 1 : 0; |
6337 | if (p0 == 1 && z == 0) { |
6338 | // rule with '<' is used |
6339 | if (reslen > 0 && ws != NULL && *ws != NUL |
6340 | && (wres[reslen - 1] == c |
6341 | || wres[reslen - 1] == *ws)) |
6342 | reslen--; |
6343 | z0 = 1; |
6344 | z = 1; |
6345 | k0 = 0; |
6346 | if (ws != NULL) |
6347 | while (*ws != NUL && word[i + k0] != NUL) { |
6348 | word[i + k0] = *ws; |
6349 | k0++; |
6350 | ws++; |
6351 | } |
6352 | if (k > k0) |
6353 | memmove(word + i + k0, word + i + k, |
6354 | sizeof(int) * (wordlen - (i + k) + 1)); |
6355 | |
6356 | // new "actual letter" |
6357 | c = word[i]; |
6358 | } else { |
6359 | // no '<' rule used |
6360 | i += k - 1; |
6361 | z = 0; |
6362 | if (ws != NULL) |
6363 | while (*ws != NUL && ws[1] != NUL |
6364 | && reslen < MAXWLEN) { |
6365 | if (reslen == 0 || wres[reslen - 1] != *ws) |
6366 | wres[reslen++] = *ws; |
6367 | ws++; |
6368 | } |
6369 | // new "actual letter" |
6370 | if (ws == NULL) |
6371 | c = NUL; |
6372 | else |
6373 | c = *ws; |
6374 | if (strstr((char *)s, "^^" ) != NULL) { |
6375 | if (c != NUL) |
6376 | wres[reslen++] = c; |
6377 | memmove(word, word + i + 1, |
6378 | sizeof(int) * (wordlen - (i + 1) + 1)); |
6379 | i = 0; |
6380 | z0 = 1; |
6381 | } |
6382 | } |
6383 | break; |
6384 | } |
6385 | } |
6386 | } else if (ascii_iswhite(c)) { |
6387 | c = ' '; |
6388 | k = 1; |
6389 | } |
6390 | |
6391 | if (z0 == 0) { |
6392 | if (k && !p0 && reslen < MAXWLEN && c != NUL |
6393 | && (!slang->sl_collapse || reslen == 0 |
6394 | || wres[reslen - 1] != c)) |
6395 | // condense only double letters |
6396 | wres[reslen++] = c; |
6397 | |
6398 | i++; |
6399 | z = 0; |
6400 | k = 0; |
6401 | } |
6402 | } |
6403 | |
6404 | // Convert wide characters in "wres" to a multi-byte string in "res". |
6405 | l = 0; |
6406 | for (n = 0; n < reslen; n++) { |
6407 | l += utf_char2bytes(wres[n], res + l); |
6408 | if (l + MB_MAXBYTES > MAXWLEN) { |
6409 | break; |
6410 | } |
6411 | } |
6412 | res[l] = NUL; |
6413 | } |
6414 | |
6415 | // Compute a score for two sound-a-like words. |
6416 | // This permits up to two inserts/deletes/swaps/etc. to keep things fast. |
6417 | // Instead of a generic loop we write out the code. That keeps it fast by |
6418 | // avoiding checks that will not be possible. |
6419 | static int |
6420 | soundalike_score ( |
6421 | char_u *goodstart, // sound-folded good word |
6422 | char_u *badstart // sound-folded bad word |
6423 | ) |
6424 | { |
6425 | char_u *goodsound = goodstart; |
6426 | char_u *badsound = badstart; |
6427 | int goodlen; |
6428 | int badlen; |
6429 | int n; |
6430 | char_u *pl, *ps; |
6431 | char_u *pl2, *ps2; |
6432 | int score = 0; |
6433 | |
6434 | // Adding/inserting "*" at the start (word starts with vowel) shouldn't be |
6435 | // counted so much, vowels in the middle of the word aren't counted at all. |
6436 | if ((*badsound == '*' || *goodsound == '*') && *badsound != *goodsound) { |
6437 | if ((badsound[0] == NUL && goodsound[1] == NUL) |
6438 | || (goodsound[0] == NUL && badsound[1] == NUL)) |
6439 | // changing word with vowel to word without a sound |
6440 | return SCORE_DEL; |
6441 | if (badsound[0] == NUL || goodsound[0] == NUL) |
6442 | // more than two changes |
6443 | return SCORE_MAXMAX; |
6444 | |
6445 | if (badsound[1] == goodsound[1] |
6446 | || (badsound[1] != NUL |
6447 | && goodsound[1] != NUL |
6448 | && badsound[2] == goodsound[2])) { |
6449 | // handle like a substitute |
6450 | } else { |
6451 | score = 2 * SCORE_DEL / 3; |
6452 | if (*badsound == '*') |
6453 | ++badsound; |
6454 | else |
6455 | ++goodsound; |
6456 | } |
6457 | } |
6458 | |
6459 | goodlen = (int)STRLEN(goodsound); |
6460 | badlen = (int)STRLEN(badsound); |
6461 | |
6462 | // Return quickly if the lengths are too different to be fixed by two |
6463 | // changes. |
6464 | n = goodlen - badlen; |
6465 | if (n < -2 || n > 2) |
6466 | return SCORE_MAXMAX; |
6467 | |
6468 | if (n > 0) { |
6469 | pl = goodsound; // goodsound is longest |
6470 | ps = badsound; |
6471 | } else { |
6472 | pl = badsound; // badsound is longest |
6473 | ps = goodsound; |
6474 | } |
6475 | |
6476 | // Skip over the identical part. |
6477 | while (*pl == *ps && *pl != NUL) { |
6478 | ++pl; |
6479 | ++ps; |
6480 | } |
6481 | |
6482 | switch (n) { |
6483 | case -2: |
6484 | case 2: |
6485 | // Must delete two characters from "pl". |
6486 | ++pl; // first delete |
6487 | while (*pl == *ps) { |
6488 | ++pl; |
6489 | ++ps; |
6490 | } |
6491 | // strings must be equal after second delete |
6492 | if (STRCMP(pl + 1, ps) == 0) |
6493 | return score + SCORE_DEL * 2; |
6494 | |
6495 | // Failed to compare. |
6496 | break; |
6497 | |
6498 | case -1: |
6499 | case 1: |
6500 | // Minimal one delete from "pl" required. |
6501 | |
6502 | // 1: delete |
6503 | pl2 = pl + 1; |
6504 | ps2 = ps; |
6505 | while (*pl2 == *ps2) { |
6506 | if (*pl2 == NUL) // reached the end |
6507 | return score + SCORE_DEL; |
6508 | ++pl2; |
6509 | ++ps2; |
6510 | } |
6511 | |
6512 | // 2: delete then swap, then rest must be equal |
6513 | if (pl2[0] == ps2[1] && pl2[1] == ps2[0] |
6514 | && STRCMP(pl2 + 2, ps2 + 2) == 0) |
6515 | return score + SCORE_DEL + SCORE_SWAP; |
6516 | |
6517 | // 3: delete then substitute, then the rest must be equal |
6518 | if (STRCMP(pl2 + 1, ps2 + 1) == 0) |
6519 | return score + SCORE_DEL + SCORE_SUBST; |
6520 | |
6521 | // 4: first swap then delete |
6522 | if (pl[0] == ps[1] && pl[1] == ps[0]) { |
6523 | pl2 = pl + 2; // swap, skip two chars |
6524 | ps2 = ps + 2; |
6525 | while (*pl2 == *ps2) { |
6526 | ++pl2; |
6527 | ++ps2; |
6528 | } |
6529 | // delete a char and then strings must be equal |
6530 | if (STRCMP(pl2 + 1, ps2) == 0) |
6531 | return score + SCORE_SWAP + SCORE_DEL; |
6532 | } |
6533 | |
6534 | // 5: first substitute then delete |
6535 | pl2 = pl + 1; // substitute, skip one char |
6536 | ps2 = ps + 1; |
6537 | while (*pl2 == *ps2) { |
6538 | ++pl2; |
6539 | ++ps2; |
6540 | } |
6541 | // delete a char and then strings must be equal |
6542 | if (STRCMP(pl2 + 1, ps2) == 0) |
6543 | return score + SCORE_SUBST + SCORE_DEL; |
6544 | |
6545 | // Failed to compare. |
6546 | break; |
6547 | |
6548 | case 0: |
6549 | // Lengths are equal, thus changes must result in same length: An |
6550 | // insert is only possible in combination with a delete. |
6551 | // 1: check if for identical strings |
6552 | if (*pl == NUL) |
6553 | return score; |
6554 | |
6555 | // 2: swap |
6556 | if (pl[0] == ps[1] && pl[1] == ps[0]) { |
6557 | pl2 = pl + 2; // swap, skip two chars |
6558 | ps2 = ps + 2; |
6559 | while (*pl2 == *ps2) { |
6560 | if (*pl2 == NUL) // reached the end |
6561 | return score + SCORE_SWAP; |
6562 | ++pl2; |
6563 | ++ps2; |
6564 | } |
6565 | // 3: swap and swap again |
6566 | if (pl2[0] == ps2[1] && pl2[1] == ps2[0] |
6567 | && STRCMP(pl2 + 2, ps2 + 2) == 0) |
6568 | return score + SCORE_SWAP + SCORE_SWAP; |
6569 | |
6570 | // 4: swap and substitute |
6571 | if (STRCMP(pl2 + 1, ps2 + 1) == 0) |
6572 | return score + SCORE_SWAP + SCORE_SUBST; |
6573 | } |
6574 | |
6575 | // 5: substitute |
6576 | pl2 = pl + 1; |
6577 | ps2 = ps + 1; |
6578 | while (*pl2 == *ps2) { |
6579 | if (*pl2 == NUL) // reached the end |
6580 | return score + SCORE_SUBST; |
6581 | ++pl2; |
6582 | ++ps2; |
6583 | } |
6584 | |
6585 | // 6: substitute and swap |
6586 | if (pl2[0] == ps2[1] && pl2[1] == ps2[0] |
6587 | && STRCMP(pl2 + 2, ps2 + 2) == 0) |
6588 | return score + SCORE_SUBST + SCORE_SWAP; |
6589 | |
6590 | // 7: substitute and substitute |
6591 | if (STRCMP(pl2 + 1, ps2 + 1) == 0) |
6592 | return score + SCORE_SUBST + SCORE_SUBST; |
6593 | |
6594 | // 8: insert then delete |
6595 | pl2 = pl; |
6596 | ps2 = ps + 1; |
6597 | while (*pl2 == *ps2) { |
6598 | ++pl2; |
6599 | ++ps2; |
6600 | } |
6601 | if (STRCMP(pl2 + 1, ps2) == 0) |
6602 | return score + SCORE_INS + SCORE_DEL; |
6603 | |
6604 | // 9: delete then insert |
6605 | pl2 = pl + 1; |
6606 | ps2 = ps; |
6607 | while (*pl2 == *ps2) { |
6608 | ++pl2; |
6609 | ++ps2; |
6610 | } |
6611 | if (STRCMP(pl2, ps2 + 1) == 0) |
6612 | return score + SCORE_INS + SCORE_DEL; |
6613 | |
6614 | // Failed to compare. |
6615 | break; |
6616 | } |
6617 | |
6618 | return SCORE_MAXMAX; |
6619 | } |
6620 | |
6621 | // Compute the "edit distance" to turn "badword" into "goodword". The less |
6622 | // deletes/inserts/substitutes/swaps are required the lower the score. |
6623 | // |
6624 | // The algorithm is described by Du and Chang, 1992. |
6625 | // The implementation of the algorithm comes from Aspell editdist.cpp, |
6626 | // edit_distance(). It has been converted from C++ to C and modified to |
6627 | // support multi-byte characters. |
6628 | static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword) |
6629 | { |
6630 | int *cnt; |
6631 | int j, i; |
6632 | int t; |
6633 | int bc, gc; |
6634 | int pbc, pgc; |
6635 | int wbadword[MAXWLEN]; |
6636 | int wgoodword[MAXWLEN]; |
6637 | const bool l_has_mbyte = has_mbyte; |
6638 | |
6639 | // Lengths with NUL. |
6640 | int badlen; |
6641 | int goodlen; |
6642 | if (l_has_mbyte) { |
6643 | // Get the characters from the multi-byte strings and put them in an |
6644 | // int array for easy access. |
6645 | badlen = 0; |
6646 | for (const char_u *p = badword; *p != NUL; ) { |
6647 | wbadword[badlen++] = mb_cptr2char_adv(&p); |
6648 | } |
6649 | wbadword[badlen++] = 0; |
6650 | goodlen = 0; |
6651 | for (const char_u *p = goodword; *p != NUL; ) { |
6652 | wgoodword[goodlen++] = mb_cptr2char_adv(&p); |
6653 | } |
6654 | wgoodword[goodlen++] = 0; |
6655 | } else { |
6656 | badlen = (int)STRLEN(badword) + 1; |
6657 | goodlen = (int)STRLEN(goodword) + 1; |
6658 | } |
6659 | |
6660 | // We use "cnt" as an array: CNT(badword_idx, goodword_idx). |
6661 | #define CNT(a, b) cnt[(a) + (b) * (badlen + 1)] |
6662 | cnt = xmalloc(sizeof(int) * (badlen + 1) * (goodlen + 1)); |
6663 | |
6664 | CNT(0, 0) = 0; |
6665 | for (j = 1; j <= goodlen; ++j) |
6666 | CNT(0, j) = CNT(0, j - 1) + SCORE_INS; |
6667 | |
6668 | for (i = 1; i <= badlen; ++i) { |
6669 | CNT(i, 0) = CNT(i - 1, 0) + SCORE_DEL; |
6670 | for (j = 1; j <= goodlen; ++j) { |
6671 | if (l_has_mbyte) { |
6672 | bc = wbadword[i - 1]; |
6673 | gc = wgoodword[j - 1]; |
6674 | } else { |
6675 | bc = badword[i - 1]; |
6676 | gc = goodword[j - 1]; |
6677 | } |
6678 | if (bc == gc) |
6679 | CNT(i, j) = CNT(i - 1, j - 1); |
6680 | else { |
6681 | // Use a better score when there is only a case difference. |
6682 | if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc)) |
6683 | CNT(i, j) = SCORE_ICASE + CNT(i - 1, j - 1); |
6684 | else { |
6685 | // For a similar character use SCORE_SIMILAR. |
6686 | if (slang != NULL |
6687 | && slang->sl_has_map |
6688 | && similar_chars(slang, gc, bc)) |
6689 | CNT(i, j) = SCORE_SIMILAR + CNT(i - 1, j - 1); |
6690 | else |
6691 | CNT(i, j) = SCORE_SUBST + CNT(i - 1, j - 1); |
6692 | } |
6693 | |
6694 | if (i > 1 && j > 1) { |
6695 | if (l_has_mbyte) { |
6696 | pbc = wbadword[i - 2]; |
6697 | pgc = wgoodword[j - 2]; |
6698 | } else { |
6699 | pbc = badword[i - 2]; |
6700 | pgc = goodword[j - 2]; |
6701 | } |
6702 | if (bc == pgc && pbc == gc) { |
6703 | t = SCORE_SWAP + CNT(i - 2, j - 2); |
6704 | if (t < CNT(i, j)) |
6705 | CNT(i, j) = t; |
6706 | } |
6707 | } |
6708 | t = SCORE_DEL + CNT(i - 1, j); |
6709 | if (t < CNT(i, j)) |
6710 | CNT(i, j) = t; |
6711 | t = SCORE_INS + CNT(i, j - 1); |
6712 | if (t < CNT(i, j)) |
6713 | CNT(i, j) = t; |
6714 | } |
6715 | } |
6716 | } |
6717 | |
6718 | i = CNT(badlen - 1, goodlen - 1); |
6719 | xfree(cnt); |
6720 | return i; |
6721 | } |
6722 | |
// Like spell_edit_score(), but with a limit on the score to make it faster.
// May return SCORE_MAXMAX when the score is higher than "limit".
//
// This uses a stack for the edits still to be tried.
// The idea comes from Aspell leditdist.cpp.  Rewritten in C and added support
// for multi-byte characters.
//
// "slang" may be NULL; similar_chars() is only consulted when it is not NULL
// and has "sl_has_map" set.
// When "has_mbyte" is set all the work is done by spell_edit_score_limit_w().
// Returns the lowest score found when it does not exceed "limit", otherwise
// SCORE_MAXMAX.
static int spell_edit_score_limit(slang_T *slang, char_u *badword, char_u *goodword, int limit)
{
  // NOTE(review): "stackidx" is not checked against the size of "stack" in
  // this function; presumably "limit" keeps the number of pending
  // alternatives small enough - confirm against the callers.
  limitscore_T stack[10];           // allow for over 3 * 2 edits
  int stackidx;                     // number of items on "stack"
  int bi, gi;                       // current index in badword / goodword
  int bi2, gi2;                     // indexes used for the look-ahead compare
  int bc, gc;                       // current char of badword / goodword
  int score;                        // score of the alternative being tried
  int score_off;                    // "score" plus one delete or insert
  int minscore;                     // lowest score found so far
  int round;

  // Multi-byte characters require a bit more work, use a different function
  // to avoid testing "has_mbyte" quite often.
  if (has_mbyte)
    return spell_edit_score_limit_w(slang, badword, goodword, limit);

  // The idea is to go from start to end over the words.  So long as
  // characters are equal just continue, this always gives the lowest score.
  // When there is a difference try several alternatives.  Each alternative
  // increases "score" for the edit distance.  Some of the alternatives are
  // pushed onto a stack and tried later, some are tried right away.  At the
  // end of the word the score for one alternative is known.  The lowest
  // possible score is stored in "minscore".
  stackidx = 0;
  bi = 0;
  gi = 0;
  score = 0;
  // Start just above the limit, so that only results within the limit can
  // lower "minscore".
  minscore = limit + 1;

  for (;; ) {
    // Skip over an equal part, score remains the same.
    for (;; ) {
      bc = badword[bi];
      gc = goodword[gi];
      if (bc != gc)             // stop at a char that's different
        break;
      if (bc == NUL) {          // both words end
        if (score < minscore)
          minscore = score;
        goto pop;               // do next alternative
      }
      ++bi;
      ++gi;
    }

    if (gc == NUL) {      // goodword ends, delete badword chars
      do {
        if ((score += SCORE_DEL) >= minscore)
          goto pop;                 // do next alternative
      } while (badword[++bi] != NUL);
      minscore = score;
    } else if (bc == NUL) {   // badword ends, insert the remaining goodword chars
      do {
        if ((score += SCORE_INS) >= minscore)
          goto pop;                 // do next alternative
      } while (goodword[++gi] != NUL);
      minscore = score;
    } else {              // both words continue
      // If not close to the limit, perform a change.  Only try changes
      // that may lead to a lower score than "minscore".
      // round 0: try deleting a char from badword
      // round 1: try inserting a char in badword
      for (round = 0; round <= 1; ++round) {
        score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
        if (score_off < minscore) {
          if (score_off + SCORE_EDIT_MIN >= minscore) {
            // Near the limit, rest of the words must match.  We
            // can check that right now, no need to push an item
            // onto the stack.
            bi2 = bi + 1 - round;
            gi2 = gi + round;
            while (goodword[gi2] == badword[bi2]) {
              if (goodword[gi2] == NUL) {
                minscore = score_off;
                break;
              }
              ++bi2;
              ++gi2;
            }
          } else {
            // try deleting/inserting a character later
            stack[stackidx].badi = bi + 1 - round;
            stack[stackidx].goodi = gi + round;
            stack[stackidx].score = score_off;
            ++stackidx;
          }
        }
      }

      if (score + SCORE_SWAP < minscore) {
        // If swapping two characters makes a match then the
        // substitution is more expensive, thus there is no need to
        // try both.
        if (gc == badword[bi + 1] && bc == goodword[gi + 1]) {
          // Swap two characters, that is: skip them.
          gi += 2;
          bi += 2;
          score += SCORE_SWAP;
          continue;
        }
      }

      // Substitute one character for another which is the same
      // thing as deleting a character from both goodword and badword.
      // Use a better score when there is only a case difference.
      if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
        score += SCORE_ICASE;
      else {
        // For a similar character use SCORE_SIMILAR.
        if (slang != NULL
            && slang->sl_has_map
            && similar_chars(slang, gc, bc))
          score += SCORE_SIMILAR;
        else
          score += SCORE_SUBST;
      }

      if (score < minscore) {
        // Do the substitution.
        ++gi;
        ++bi;
        continue;
      }
      // Otherwise the substitution can't improve on "minscore": fall
      // through to try the next alternative.
    }
pop:
    // Get here to try the next alternative, pop it from the stack.
    if (stackidx == 0)                  // stack is empty, finished
      break;

    // pop an item from the stack
    --stackidx;
    gi = stack[stackidx].goodi;
    bi = stack[stackidx].badi;
    score = stack[stackidx].score;
  }

  // When the score goes over "limit" it may actually be much higher.
  // Return a very large number to avoid going below the limit when giving a
  // bonus.
  if (minscore > limit)
    return SCORE_MAXMAX;
  return minscore;
}
6873 | |
// Multi-byte version of spell_edit_score_limit().
// Keep it in sync with the above!
//
// Both words are first converted to arrays of character values, then the
// same search is done on those arrays.
// "slang" may be NULL; similar_chars() is only consulted when it is not NULL
// and has "sl_has_map" set.
// Returns the lowest score found when it does not exceed "limit", otherwise
// SCORE_MAXMAX.
static int spell_edit_score_limit_w(slang_T *slang, char_u *badword, char_u *goodword, int limit)
{
  // NOTE(review): "stackidx" is not checked against the size of "stack" in
  // this function; presumably "limit" keeps the number of pending
  // alternatives small enough - confirm against the callers.
  limitscore_T stack[10];           // allow for over 3 * 2 edits
  int stackidx;                     // number of items on "stack"
  int bi, gi;                       // current index in wbadword / wgoodword
  int bi2, gi2;                     // indexes used for the look-ahead compare
  int bc, gc;                       // current char of badword / goodword
  int score;                        // score of the alternative being tried
  int score_off;                    // "score" plus one delete or insert
  int minscore;                     // lowest score found so far
  int round;
  int wbadword[MAXWLEN];
  int wgoodword[MAXWLEN];

  // Get the characters from the multi-byte strings and put them in an
  // int array for easy access.
  // NOTE(review): the conversion loops do not check against MAXWLEN;
  // presumably callers only pass words that fit - confirm.
  bi = 0;
  for (const char_u *p = badword; *p != NUL; ) {
    wbadword[bi++] = mb_cptr2char_adv(&p);
  }
  wbadword[bi++] = 0;
  gi = 0;
  for (const char_u *p = goodword; *p != NUL; ) {
    wgoodword[gi++] = mb_cptr2char_adv(&p);
  }
  wgoodword[gi++] = 0;

  // The idea is to go from start to end over the words.  So long as
  // characters are equal just continue, this always gives the lowest score.
  // When there is a difference try several alternatives.  Each alternative
  // increases "score" for the edit distance.  Some of the alternatives are
  // pushed onto a stack and tried later, some are tried right away.  At the
  // end of the word the score for one alternative is known.  The lowest
  // possible score is stored in "minscore".
  stackidx = 0;
  bi = 0;
  gi = 0;
  score = 0;
  // Start just above the limit, so that only results within the limit can
  // lower "minscore".
  minscore = limit + 1;

  for (;; ) {
    // Skip over an equal part, score remains the same.
    for (;; ) {
      bc = wbadword[bi];
      gc = wgoodword[gi];

      if (bc != gc)             // stop at a char that's different
        break;
      if (bc == NUL) {          // both words end
        if (score < minscore)
          minscore = score;
        goto pop;               // do next alternative
      }
      ++bi;
      ++gi;
    }

    if (gc == NUL) {      // goodword ends, delete badword chars
      do {
        if ((score += SCORE_DEL) >= minscore)
          goto pop;                 // do next alternative
      } while (wbadword[++bi] != NUL);
      minscore = score;
    } else if (bc == NUL) {   // badword ends, insert the remaining goodword chars
      do {
        if ((score += SCORE_INS) >= minscore)
          goto pop;                 // do next alternative
      } while (wgoodword[++gi] != NUL);
      minscore = score;
    } else {              // both words continue
      // If not close to the limit, perform a change.  Only try changes
      // that may lead to a lower score than "minscore".
      // round 0: try deleting a char from badword
      // round 1: try inserting a char in badword
      for (round = 0; round <= 1; ++round) {
        score_off = score + (round == 0 ? SCORE_DEL : SCORE_INS);
        if (score_off < minscore) {
          if (score_off + SCORE_EDIT_MIN >= minscore) {
            // Near the limit, rest of the words must match.  We
            // can check that right now, no need to push an item
            // onto the stack.
            bi2 = bi + 1 - round;
            gi2 = gi + round;
            while (wgoodword[gi2] == wbadword[bi2]) {
              if (wgoodword[gi2] == NUL) {
                minscore = score_off;
                break;
              }
              ++bi2;
              ++gi2;
            }
          } else {
            // try deleting/inserting a character later
            stack[stackidx].badi = bi + 1 - round;
            stack[stackidx].goodi = gi + round;
            stack[stackidx].score = score_off;
            ++stackidx;
          }
        }
      }

      if (score + SCORE_SWAP < minscore) {
        // If swapping two characters makes a match then the
        // substitution is more expensive, thus there is no need to
        // try both.
        if (gc == wbadword[bi + 1] && bc == wgoodword[gi + 1]) {
          // Swap two characters, that is: skip them.
          gi += 2;
          bi += 2;
          score += SCORE_SWAP;
          continue;
        }
      }

      // Substitute one character for another which is the same
      // thing as deleting a character from both goodword and badword.
      // Use a better score when there is only a case difference.
      if (SPELL_TOFOLD(bc) == SPELL_TOFOLD(gc))
        score += SCORE_ICASE;
      else {
        // For a similar character use SCORE_SIMILAR.
        if (slang != NULL
            && slang->sl_has_map
            && similar_chars(slang, gc, bc))
          score += SCORE_SIMILAR;
        else
          score += SCORE_SUBST;
      }

      if (score < minscore) {
        // Do the substitution.
        ++gi;
        ++bi;
        continue;
      }
      // Otherwise the substitution can't improve on "minscore": fall
      // through to try the next alternative.
    }
pop:
    // Get here to try the next alternative, pop it from the stack.
    if (stackidx == 0)                  // stack is empty, finished
      break;

    // pop an item from the stack
    --stackidx;
    gi = stack[stackidx].goodi;
    bi = stack[stackidx].badi;
    score = stack[stackidx].score;
  }

  // When the score goes over "limit" it may actually be much higher.
  // Return a very large number to avoid going below the limit when giving a
  // bonus.
  if (minscore > limit)
    return SCORE_MAXMAX;
  return minscore;
}
7031 | |
7032 | // ":spellinfo" |
7033 | void ex_spellinfo(exarg_T *eap) |
7034 | { |
7035 | if (no_spell_checking(curwin)) { |
7036 | return; |
7037 | } |
7038 | |
7039 | msg_start(); |
7040 | for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len && !got_int; lpi++) { |
7041 | langp_T *const lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi); |
7042 | msg_puts("file: " ); |
7043 | msg_puts((const char *)lp->lp_slang->sl_fname); |
7044 | msg_putchar('\n'); |
7045 | const char *const p = (const char *)lp->lp_slang->sl_info; |
7046 | if (p != NULL) { |
7047 | msg_puts(p); |
7048 | msg_putchar('\n'); |
7049 | } |
7050 | } |
7051 | msg_end(); |
7052 | } |
7053 | |
// Bit flags for the "dumpflags" argument of spell_dump_compl(), dump_word()
// and dump_prefixes().
#define DUMPFLAG_KEEPCASE   1   // round 2: keep-case tree
#define DUMPFLAG_COUNT      2   // include word count
#define DUMPFLAG_ICASE      4   // ignore case when finding matches
#define DUMPFLAG_ONECAP     8   // pattern starts with capital
#define DUMPFLAG_ALLCAP     16  // pattern is all capitals
7059 | |
7060 | // ":spelldump" |
7061 | void ex_spelldump(exarg_T *eap) |
7062 | { |
7063 | char_u *spl; |
7064 | long dummy; |
7065 | |
7066 | if (no_spell_checking(curwin)) { |
7067 | return; |
7068 | } |
7069 | get_option_value((char_u *)"spl" , &dummy, &spl, OPT_LOCAL); |
7070 | |
7071 | // Create a new empty buffer in a new window. |
7072 | do_cmdline_cmd("new" ); |
7073 | |
7074 | // enable spelling locally in the new window |
7075 | set_option_value("spell" , true, "" , OPT_LOCAL); |
7076 | set_option_value("spl" , dummy, (char *)spl, OPT_LOCAL); |
7077 | xfree(spl); |
7078 | |
7079 | if (!BUFEMPTY()) { |
7080 | return; |
7081 | } |
7082 | |
7083 | spell_dump_compl(NULL, 0, NULL, eap->forceit ? DUMPFLAG_COUNT : 0); |
7084 | |
7085 | // Delete the empty line that we started with. |
7086 | if (curbuf->b_ml.ml_line_count > 1) { |
7087 | ml_delete(curbuf->b_ml.ml_line_count, false); |
7088 | } |
7089 | redraw_later(NOT_VALID); |
7090 | } |
7091 | |
// Go through all possible words and:
// 1. When "pat" is NULL: dump a list of all words in the current buffer.
//      "ic" and "dir" are not used.
// 2. When "pat" is not NULL: add matching words to insert mode completion.
//
// The word trees of every language of the current window are walked
// depth-first without recursion: "depth" is the current level, arridx[depth]
// the index of the node at that level and curi[depth] the next sibling to
// visit in that node.  word[] holds the bytes collected on the way down.
// The walk stops early on "got_int" and, when completing, on
// "compl_interrupted".
void
spell_dump_compl (
    char_u *pat,           // leading part of the word
    int ic,                     // ignore case
    int *dir,           // direction for adding matches
    int dumpflags_arg              // DUMPFLAG_*
)
{
  langp_T     *lp;
  slang_T     *slang;
  idx_T arridx[MAXWLEN];
  int curi[MAXWLEN];
  char_u word[MAXWLEN];
  int c;
  char_u      *byts;
  idx_T       *idxs;
  linenr_T lnum = 0;            // line after which the next line is appended
  int round;
  int depth;
  int n;
  int flags;
  char_u      *region_names = NULL;         // region names being used
  bool do_region = true;                    // dump region names and numbers
  char_u      *p;
  int dumpflags = dumpflags_arg;
  int patlen;

  // When ignoring case or when the pattern starts with capital pass this on
  // to dump_word().
  if (pat != NULL) {
    if (ic)
      dumpflags |= DUMPFLAG_ICASE;
    else {
      n = captype(pat, NULL);
      if (n == WF_ONECAP)
        dumpflags |= DUMPFLAG_ONECAP;
      else if (n == WF_ALLCAP
               && (int)STRLEN(pat) > mb_ptr2len(pat)
               )
        // all-caps only counts when "pat" is longer than one character
        dumpflags |= DUMPFLAG_ALLCAP;
    }
  }

  // Find out if we can support regions: All languages must support the same
  // regions or none at all.
  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
    lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
    p = lp->lp_slang->sl_regions;
    if (p[0] != 0) {
      if (region_names == NULL)             // first language with regions
        region_names = p;
      else if (STRCMP(region_names, p) != 0) {
        do_region = false;                  // region names are different
        break;
      }
    }
  }

  if (do_region && region_names != NULL) {
    // Only when dumping: start the list with the region names.
    if (pat == NULL) {
      vim_snprintf((char *)IObuff, IOSIZE, "/regions=%s", region_names);
      ml_append(lnum++, IObuff, (colnr_T)0, false);
    }
  } else
    do_region = false;

  // Loop over all files loaded for the entries in 'spelllang'.
  for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
    lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
    slang = lp->lp_slang;
    if (slang->sl_fbyts == NULL)            // reloading failed
      continue;

    if (pat == NULL) {
      vim_snprintf((char *)IObuff, IOSIZE, "# file: %s", slang->sl_fname);
      ml_append(lnum++, IObuff, (colnr_T)0, false);
    }

    // When matching with a pattern and there are no prefixes only use
    // parts of the tree that match "pat".
    if (pat != NULL && slang->sl_pbyts == NULL)
      patlen = (int)STRLEN(pat);
    else
      patlen = -1;              // never prune, see the check on "depth" below

    // round 1: case-folded tree
    // round 2: keep-case tree
    for (round = 1; round <= 2; ++round) {
      if (round == 1) {
        dumpflags &= ~DUMPFLAG_KEEPCASE;
        byts = slang->sl_fbyts;
        idxs = slang->sl_fidxs;
      } else {
        dumpflags |= DUMPFLAG_KEEPCASE;
        byts = slang->sl_kbyts;
        idxs = slang->sl_kidxs;
      }
      if (byts == NULL)
        continue;                       // array is empty

      // Start at the root node with its first sibling.
      depth = 0;
      arridx[0] = 0;
      curi[0] = 1;
      while (depth >= 0 && !got_int
             && (pat == NULL || !compl_interrupted)) {
        if (curi[depth] > byts[arridx[depth]]) {
          // Done all bytes at this node, go up one level.
          --depth;
          line_breakcheck();
          ins_compl_check_keys(50, false);
        } else {
          // Do one more byte at this node.
          n = arridx[depth] + curi[depth];
          ++curi[depth];
          c = byts[n];
          if (c == 0) {
            // End of word, deal with the word.
            // Don't use keep-case words in the fold-case tree,
            // they will appear in the keep-case tree.
            // Only use the word when the region matches.
            flags = (int)idxs[n];
            if ((round == 2 || (flags & WF_KEEPCAP) == 0)
                && (flags & WF_NEEDCOMP) == 0
                && (do_region
                    || (flags & WF_REGION) == 0
                    || (((unsigned)flags >> 16)
                        & lp->lp_region) != 0)) {
              word[depth] = NUL;
              if (!do_region)
                flags &= ~WF_REGION;

              // Dump the basic word if there is no prefix or
              // when it's the first one.
              c = (unsigned)flags >> 24;
              if (c == 0 || curi[depth] == 2) {
                dump_word(slang, word, pat, dir,
                    dumpflags, flags, lnum);
                if (pat == NULL)
                  ++lnum;
              }

              // Apply the prefix, if there is one.
              if (c != 0)
                lnum = dump_prefixes(slang, word, pat, dir,
                    dumpflags, flags, lnum);
            }
          } else {
            // Normal char, go one level deeper.
            word[depth++] = c;
            arridx[depth] = idxs[n];
            curi[depth] = 1;

            // Check if this characters matches with the pattern.
            // If not skip the whole tree below it.
            // Always ignore case here, dump_word() will check
            // proper case later.  This isn't exactly right when
            // length changes for multi-byte characters with
            // ignore case...
            assert(depth >= 0);
            if (depth <= patlen
                && mb_strnicmp(word, pat, (size_t)depth) != 0)
              --depth;
          }
        }
      }
    }
  }
}
7264 | |
7265 | // Dumps one word: apply case modifications and append a line to the buffer. |
7266 | // When "lnum" is zero add insert mode completion. |
7267 | static void dump_word(slang_T *slang, char_u *word, char_u *pat, int *dir, int dumpflags, int wordflags, linenr_T lnum) |
7268 | { |
7269 | bool keepcap = false; |
7270 | char_u *p; |
7271 | char_u *tw; |
7272 | char_u cword[MAXWLEN]; |
7273 | char_u badword[MAXWLEN + 10]; |
7274 | int i; |
7275 | int flags = wordflags; |
7276 | |
7277 | if (dumpflags & DUMPFLAG_ONECAP) |
7278 | flags |= WF_ONECAP; |
7279 | if (dumpflags & DUMPFLAG_ALLCAP) |
7280 | flags |= WF_ALLCAP; |
7281 | |
7282 | if ((dumpflags & DUMPFLAG_KEEPCASE) == 0 && (flags & WF_CAPMASK) != 0) { |
7283 | // Need to fix case according to "flags". |
7284 | make_case_word(word, cword, flags); |
7285 | p = cword; |
7286 | } else { |
7287 | p = word; |
7288 | if ((dumpflags & DUMPFLAG_KEEPCASE) |
7289 | && ((captype(word, NULL) & WF_KEEPCAP) == 0 |
7290 | || (flags & WF_FIXCAP) != 0)) |
7291 | keepcap = true; |
7292 | } |
7293 | tw = p; |
7294 | |
7295 | if (pat == NULL) { |
7296 | // Add flags and regions after a slash. |
7297 | if ((flags & (WF_BANNED | WF_RARE | WF_REGION)) || keepcap) { |
7298 | STRCPY(badword, p); |
7299 | STRCAT(badword, "/" ); |
7300 | if (keepcap) { |
7301 | STRCAT(badword, "=" ); |
7302 | } |
7303 | if (flags & WF_BANNED) { |
7304 | STRCAT(badword, "!" ); |
7305 | } else if (flags & WF_RARE) { |
7306 | STRCAT(badword, "?" ); |
7307 | } |
7308 | if (flags & WF_REGION) { |
7309 | for (i = 0; i < 7; i++) { |
7310 | if (flags & (0x10000 << i)) { |
7311 | const size_t badword_len = STRLEN(badword); |
7312 | snprintf((char *)badword + badword_len, |
7313 | sizeof(badword) - badword_len, |
7314 | "%d" , i + 1); |
7315 | } |
7316 | } |
7317 | } |
7318 | p = badword; |
7319 | } |
7320 | |
7321 | if (dumpflags & DUMPFLAG_COUNT) { |
7322 | hashitem_T *hi; |
7323 | |
7324 | // Include the word count for ":spelldump!". |
7325 | hi = hash_find(&slang->sl_wordcount, tw); |
7326 | if (!HASHITEM_EMPTY(hi)) { |
7327 | vim_snprintf((char *)IObuff, IOSIZE, "%s\t%d" , |
7328 | tw, HI2WC(hi)->wc_count); |
7329 | p = IObuff; |
7330 | } |
7331 | } |
7332 | |
7333 | ml_append(lnum, p, (colnr_T)0, false); |
7334 | } else if (((dumpflags & DUMPFLAG_ICASE) |
7335 | ? mb_strnicmp(p, pat, STRLEN(pat)) == 0 |
7336 | : STRNCMP(p, pat, STRLEN(pat)) == 0) |
7337 | && ins_compl_add_infercase(p, (int)STRLEN(p), |
7338 | p_ic, NULL, *dir, false) == OK) { |
7339 | // if dir was BACKWARD then honor it just once |
7340 | *dir = FORWARD; |
7341 | } |
7342 | } |
7343 | |
// For ":spelldump": Find matching prefixes for "word".  Prepend each to
// "word" and pass the result to dump_word() together with "pat", "dir" and
// "dumpflags".
// The prefix tree is walked depth-first without recursion: "depth" is the
// current level, arridx[depth] the index of the node at that level and
// curi[depth] the next sibling to visit in that node.
// The line number is only advanced when it is not zero.
// Return the updated line number.
static linenr_T
dump_prefixes (
    slang_T *slang,
    char_u *word,          // case-folded word
    char_u *pat,
    int *dir,
    int dumpflags,
    int flags,                  // flags with prefix ID
    linenr_T startlnum
)
{
  idx_T arridx[MAXWLEN];
  int curi[MAXWLEN];
  char_u prefix[MAXWLEN];
  char_u word_up[MAXWLEN];
  bool has_word_up = false;     // word_up[] has been filled
  int c;
  char_u      *byts;
  idx_T       *idxs;
  linenr_T lnum = startlnum;
  int depth;
  int n;
  int len;
  int i;

  // If the word starts with a lower-case letter make the word with an
  // upper-case letter in word_up[].
  c = PTR2CHAR(word);
  if (SPELL_TOUPPER(c) != c) {
    onecap_copy(word, word_up, true);
    has_word_up = true;
  }

  byts = slang->sl_pbyts;
  idxs = slang->sl_pidxs;
  if (byts != NULL) {           // array is not empty
    // Loop over all prefixes, building them byte-by-byte in prefix[].
    // When at the end of a prefix check that it supports "flags".
    depth = 0;
    arridx[0] = 0;
    curi[0] = 1;
    while (depth >= 0 && !got_int) {
      n = arridx[depth];
      len = byts[n];            // number of siblings in this node
      if (curi[depth] > len) {
        // Done all bytes at this node, go up one level.
        --depth;
        line_breakcheck();
      } else {
        // Do one more byte at this node.
        n += curi[depth];
        ++curi[depth];
        c = byts[n];
        if (c == 0) {
          // End of prefix, find out how many IDs there are.
          for (i = 1; i < len; ++i)
            if (byts[n + i] != 0)
              break;
          // Skip over the other NUL bytes, they are handled here.
          curi[depth] += i - 1;

          c = valid_word_prefix(i, n, flags, word, slang, false);
          if (c != 0) {
            // Put the word after the prefix, truncating it if needed.
            STRLCPY(prefix + depth, word, MAXWLEN - depth);
            dump_word(slang, prefix, pat, dir, dumpflags,
                (c & WF_RAREPFX) ? (flags | WF_RARE)
                : flags, lnum);
            if (lnum != 0)
              ++lnum;
          }

          // Check for prefix that matches the word when the
          // first letter is upper-case, but only if the prefix has
          // a condition.
          if (has_word_up) {
            c = valid_word_prefix(i, n, flags, word_up, slang,
                true);
            if (c != 0) {
              STRLCPY(prefix + depth, word_up, MAXWLEN - depth);
              dump_word(slang, prefix, pat, dir, dumpflags,
                  (c & WF_RAREPFX) ? (flags | WF_RARE)
                  : flags, lnum);
              if (lnum != 0)
                ++lnum;
            }
          }
        } else {
          // Normal char, go one level deeper.
          prefix[depth++] = c;
          arridx[depth] = idxs[n];
          curi[depth] = 1;
        }
      }
    }
  }

  return lnum;
}
7445 | |
7446 | // Move "p" to the end of word "start". |
7447 | // Uses the spell-checking word characters. |
7448 | char_u *spell_to_word_end(char_u *start, win_T *win) |
7449 | { |
7450 | char_u *p = start; |
7451 | |
7452 | while (*p != NUL && spell_iswordp(p, win)) { |
7453 | MB_PTR_ADV(p); |
7454 | } |
7455 | return p; |
7456 | } |
7457 | |
7458 | // For Insert mode completion CTRL-X s: |
7459 | // Find start of the word in front of column "startcol". |
7460 | // We don't check if it is badly spelled, with completion we can only change |
7461 | // the word in front of the cursor. |
7462 | // Returns the column number of the word. |
7463 | int spell_word_start(int startcol) |
7464 | { |
7465 | char_u *line; |
7466 | char_u *p; |
7467 | int col = 0; |
7468 | |
7469 | if (no_spell_checking(curwin)) { |
7470 | return startcol; |
7471 | } |
7472 | |
7473 | // Find a word character before "startcol". |
7474 | line = get_cursor_line_ptr(); |
7475 | for (p = line + startcol; p > line; ) { |
7476 | MB_PTR_BACK(line, p); |
7477 | if (spell_iswordp_nmw(p, curwin)) { |
7478 | break; |
7479 | } |
7480 | } |
7481 | |
7482 | // Go back to start of the word. |
7483 | while (p > line) { |
7484 | col = (int)(p - line); |
7485 | MB_PTR_BACK(line, p); |
7486 | if (!spell_iswordp(p, curwin)) { |
7487 | break; |
7488 | } |
7489 | col = 0; |
7490 | } |
7491 | |
7492 | return col; |
7493 | } |
7494 | |
7495 | // Need to check for 'spellcapcheck' now, the word is removed before |
7496 | // expand_spelling() is called. Therefore the ugly global variable. |
7497 | static bool spell_expand_need_cap; |
7498 | |
7499 | void spell_expand_check_cap(colnr_T col) |
7500 | { |
7501 | spell_expand_need_cap = check_need_cap(curwin->w_cursor.lnum, col); |
7502 | } |
7503 | |
7504 | // Get list of spelling suggestions. |
7505 | // Used for Insert mode completion CTRL-X ?. |
7506 | // Returns the number of matches. The matches are in "matchp[]", array of |
7507 | // allocated strings. |
7508 | int expand_spelling(linenr_T lnum, char_u *pat, char_u ***matchp) |
7509 | { |
7510 | garray_T ga; |
7511 | |
7512 | spell_suggest_list(&ga, pat, 100, spell_expand_need_cap, true); |
7513 | *matchp = ga.ga_data; |
7514 | return ga.ga_len; |
7515 | } |
7516 | |