1 | #ifndef NVIM_SPELL_DEFS_H |
2 | #define NVIM_SPELL_DEFS_H |
3 | |
4 | #include <stdbool.h> |
5 | #include <stdint.h> |
6 | |
7 | #include "nvim/buffer_defs.h" |
8 | #include "nvim/garray.h" |
9 | #include "nvim/regexp_defs.h" |
10 | #include "nvim/types.h" |
11 | |
/// Assumed maximum word length in bytes.  Some places assume a word length
/// fits in a byte, thus it can't be above 255.
#define MAXWLEN 254

/// Number of regions supported.
#define MAXREGIONS 8
18 | |
/// Type used for indexes in the word tree.  It needs to be at least 4 bytes.
/// If int is 8 bytes we could use something smaller, but what?
typedef int idx_T;
22 | |
/// printf-style template for a .spl file name; takes two string arguments.
#define SPL_FNAME_TMPL  "%s.%s.spl"
/// File name part ".add." (NOTE(review): presumably marks word-list
/// additions, compare "sl_add" in slang_T -- confirm against the users).
#define SPL_FNAME_ADD   ".add."
/// File name part ".ascii.".
#define SPL_FNAME_ASCII ".ascii."
26 | |
// Flags used for a word.  Only the lowest byte can be used, the region byte
// comes above it.
#define WF_REGION     0x01    ///< region byte follows
#define WF_ONECAP     0x02    ///< word with one capital (or all capitals)
#define WF_ALLCAP     0x04    ///< word must be all capitals
#define WF_RARE       0x08    ///< rare word
#define WF_BANNED     0x10    ///< bad word
#define WF_AFX        0x20    ///< affix ID follows
#define WF_FIXCAP     0x40    ///< keep-case word, allcap not allowed
#define WF_KEEPCAP    0x80    ///< keep-case word

// Flags for <flags2>, shifted up one byte to be used in wn_flags.
#define WF_HAS_AFF    0x0100  ///< word includes affix
#define WF_NEEDCOMP   0x0200  ///< word only valid in compound
#define WF_NOSUGGEST  0x0400  ///< word not to be suggested
#define WF_COMPROOT   0x0800  ///< already compounded word, COMPOUNDROOT
#define WF_NOCOMPBEF  0x1000  ///< no compounding before this word
#define WF_NOCOMPAFT  0x2000  ///< no compounding after this word
45 | |
// Flags for <pflags>.
#define WFP_RARE        0x01  ///< rare prefix
#define WFP_NC          0x02  ///< prefix is not combining
#define WFP_UP          0x04  ///< to-upper prefix
#define WFP_COMPPERMIT  0x08  ///< prefix with COMPOUNDPERMITFLAG
#define WFP_COMPFORBID  0x10  ///< prefix with COMPOUNDFORBIDFLAG

// Flags for postponed prefixes in "sl_pidxs".  Must be above affixID (one
// byte) and prefcondnr (two bytes), hence the <pflags> values shifted up by
// three bytes.
/// rare postponed prefix
#define WF_RAREPFX         (WFP_RARE << 24)
/// non-combining postponed prefix
#define WF_PFX_NC          (WFP_NC << 24)
/// to-upper postponed prefix
#define WF_PFX_UP          (WFP_UP << 24)
/// postponed prefix with COMPOUNDPERMITFLAG
#define WF_PFX_COMPPERMIT  (WFP_COMPPERMIT << 24)
/// postponed prefix with COMPOUNDFORBIDFLAG
#define WF_PFX_COMPFORBID  (WFP_COMPFORBID << 24)
62 | |
63 | |
// Flags for <compoptions>; each one is named after the item it stands for.
#define COMP_CHECKDUP     1  ///< CHECKCOMPOUNDDUP
#define COMP_CHECKREP     2  ///< CHECKCOMPOUNDREP
#define COMP_CHECKCASE    4  ///< CHECKCOMPOUNDCASE
#define COMP_CHECKTRIPLE  8  ///< CHECKCOMPOUNDTRIPLE
69 | |
70 | // Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep, |
71 | // si_repsal, sl_rep, and si_sal. Not for sl_sal! |
72 | // One replacement: from "ft_from" to "ft_to". |
73 | typedef struct fromto_S { |
74 | char_u *ft_from; |
75 | char_u *ft_to; |
76 | } fromto_T; |
77 | |
78 | // Info from "SAL" entries in ".aff" file used in sl_sal. |
79 | // The info is split for quick processing by spell_soundfold(). |
80 | // Note that "sm_oneof" and "sm_rules" point into sm_lead. |
81 | typedef struct salitem_S { |
82 | char_u *sm_lead; // leading letters |
83 | int sm_leadlen; // length of "sm_lead" |
84 | char_u *sm_oneof; // letters from () or NULL |
85 | char_u *sm_rules; // rules like ^, $, priority |
86 | char_u *sm_to; // replacement. |
87 | int *sm_lead_w; // wide character copy of "sm_lead" |
88 | int *sm_oneof_w; // wide character copy of "sm_oneof" |
89 | int *sm_to_w; // wide character copy of "sm_to" |
90 | } salitem_T; |
91 | |
/// Element type of the "sl_sal_first" table in slang_T.
typedef int salfirst_T;
93 | |
// Values for SP_*ERROR are negative, positive values are used by
// read_cnt_string().
// The replacement lists are parenthesized so that the leading minus sign
// cannot combine with neighbouring tokens where the macro is used.
#define SP_TRUNCERROR  (-1)  ///< spell file truncated error
#define SP_FORMERROR   (-2)  ///< format error in spell file
#define SP_OTHERERROR  (-3)  ///< other error while reading spell file
99 | |
100 | // Structure used to store words and other info for one language, loaded from |
101 | // a .spl file. |
102 | // The main access is through the tree in "sl_fbyts/sl_fidxs", storing the |
103 | // case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words. |
104 | // |
105 | // The "byts" array stores the possible bytes in each tree node, preceded by |
106 | // the number of possible bytes, sorted on byte value: |
107 | // <len> <byte1> <byte2> ... |
108 | // The "idxs" array stores the index of the child node corresponding to the |
109 | // byte in "byts". |
110 | // Exception: when the byte is zero, the word may end here and "idxs" holds |
111 | // the flags, region mask and affixID for the word. There may be several |
112 | // zeros in sequence for alternative flag/region/affixID combinations. |
113 | typedef struct slang_S slang_T; |
114 | |
115 | struct slang_S { |
116 | slang_T *sl_next; // next language |
117 | char_u *sl_name; // language name "en", "en.rare", "nl", etc. |
118 | char_u *sl_fname; // name of .spl file |
119 | bool sl_add; // true if it's a .add file. |
120 | |
121 | char_u *sl_fbyts; // case-folded word bytes |
122 | idx_T *sl_fidxs; // case-folded word indexes |
123 | char_u *sl_kbyts; // keep-case word bytes |
124 | idx_T *sl_kidxs; // keep-case word indexes |
125 | char_u *sl_pbyts; // prefix tree word bytes |
126 | idx_T *sl_pidxs; // prefix tree word indexes |
127 | |
128 | char_u *sl_info; // infotext string or NULL |
129 | |
130 | char_u sl_regions[MAXREGIONS * 2 + 1]; |
131 | // table with up to 8 region names plus NUL |
132 | |
133 | char_u *sl_midword; // MIDWORD string or NULL |
134 | |
135 | hashtab_T sl_wordcount; // hashtable with word count, wordcount_T |
136 | |
137 | int sl_compmax; // COMPOUNDWORDMAX (default: MAXWLEN) |
138 | int sl_compminlen; // COMPOUNDMIN (default: 0) |
139 | int sl_compsylmax; // COMPOUNDSYLMAX (default: MAXWLEN) |
140 | int sl_compoptions; // COMP_* flags |
141 | garray_T sl_comppat; // CHECKCOMPOUNDPATTERN items |
142 | regprog_T *sl_compprog; // COMPOUNDRULE turned into a regexp progrm |
143 | // (NULL when no compounding) |
144 | char_u *sl_comprules; // all COMPOUNDRULE concatenated (or NULL) |
145 | char_u *sl_compstartflags; // flags for first compound word |
146 | char_u *sl_compallflags; // all flags for compound words |
147 | bool sl_nobreak; // When true: no spaces between words |
148 | char_u *sl_syllable; // SYLLABLE repeatable chars or NULL |
149 | garray_T sl_syl_items; // syllable items |
150 | |
151 | int sl_prefixcnt; // number of items in "sl_prefprog" |
152 | regprog_T **sl_prefprog; // table with regprogs for prefixes |
153 | |
154 | garray_T sl_rep; // list of fromto_T entries from REP lines |
155 | int16_t sl_rep_first[256]; // indexes where byte first appears, -1 if |
156 | // there is none |
157 | garray_T sl_sal; // list of salitem_T entries from SAL lines |
158 | salfirst_T sl_sal_first[256]; // indexes where byte first appears, -1 if |
159 | // there is none |
160 | bool sl_followup; // SAL followup |
161 | bool sl_collapse; // SAL collapse_result |
162 | bool sl_rem_accents; // SAL remove_accents |
163 | bool sl_sofo; // SOFOFROM and SOFOTO instead of SAL items: |
164 | // "sl_sal_first" maps chars, when has_mbyte |
165 | // "sl_sal" is a list of wide char lists. |
166 | garray_T sl_repsal; // list of fromto_T entries from REPSAL lines |
167 | int16_t sl_repsal_first[256]; // sl_rep_first for REPSAL lines |
168 | bool sl_nosplitsugs; // don't suggest splitting a word |
169 | bool sl_nocompoundsugs; // don't suggest compounding |
170 | |
171 | // Info from the .sug file. Loaded on demand. |
172 | time_t sl_sugtime; // timestamp for .sug file |
173 | char_u *sl_sbyts; // soundfolded word bytes |
174 | idx_T *sl_sidxs; // soundfolded word indexes |
175 | buf_T *sl_sugbuf; // buffer with word number table |
176 | bool sl_sugloaded; // true when .sug file was loaded or failed to |
177 | // load |
178 | |
179 | bool sl_has_map; // true, if there is a MAP line |
180 | hashtab_T sl_map_hash; // MAP for multi-byte chars |
181 | int sl_map_array[256]; // MAP for first 256 chars |
182 | hashtab_T sl_sounddone; // table with soundfolded words that have |
183 | // handled, see add_sound_suggest() |
184 | }; |
185 | |
186 | // Structure used in "b_langp", filled from 'spelllang'. |
187 | typedef struct langp_S { |
188 | slang_T *lp_slang; // info for this language |
189 | slang_T *lp_sallang; // language used for sound folding or NULL |
190 | slang_T *lp_replang; // language used for REP items or NULL |
191 | int lp_region; // bitmask for region or REGION_ALL |
192 | } langp_T; |
193 | |
/// Pointer to the langp_T at index "i" in the array that the "ga_data" member
/// of "ga" points to.
#define LANGP_ENTRY(ga, i) (&((langp_T *)(ga).ga_data)[(i)])
195 | |
#define VIMSUGMAGIC    "VIMsug"  ///< string at start of Vim .sug file
#define VIMSUGMAGICL   6         ///< number of bytes in VIMSUGMAGIC
#define VIMSUGVERSION  1

#define REGION_ALL     0xff      ///< word valid in all regions
201 | |
202 | // The tables used for recognizing word characters according to spelling. |
203 | // These are only used for the first 256 characters of 'encoding'. |
204 | typedef struct { |
205 | bool st_isw[256]; // flags: is word char |
206 | bool st_isu[256]; // flags: is uppercase char |
207 | char_u st_fold[256]; // chars: folded case |
208 | char_u st_upper[256]; // chars: upper case |
209 | } spelltab_T; |
210 | |
/// For finding suggestions: At each node in the tree these states are tried:
typedef enum {
  /// At start of node check for NUL bytes (goodword ends); if badword ends
  /// there is a match, otherwise try splitting word.
  STATE_START = 0,
  STATE_NOPREFIX,   ///< try without prefix
  STATE_SPLITUNDO,  ///< Undo splitting.
  STATE_ENDNUL,     ///< Past NUL bytes at start of the node.
  STATE_PLAIN,      ///< Use each byte of the node.
  STATE_DEL,        ///< Delete a byte from the bad word.
  STATE_INS_PREP,   ///< Prepare for inserting bytes.
  STATE_INS,        ///< Insert a byte in the bad word.
  STATE_SWAP,       ///< Swap two bytes.
  STATE_UNSWAP,     ///< Undo swap two characters.
  STATE_SWAP3,      ///< Swap two characters over three.
  STATE_UNSWAP3,    ///< Undo Swap two characters over three.
  STATE_UNROT3L,    ///< Undo rotate three characters left
  STATE_UNROT3R,    ///< Undo rotate three characters right
  STATE_REP_INI,    ///< Prepare for using REP items.
  STATE_REP,        ///< Use matching REP items from the .aff file.
  STATE_REP_UNDO,   ///< Undo a REP item replacement.
  STATE_FINAL       ///< End of this node.
} state_T;
234 | |
235 | // Struct to keep the state at each level in suggest_try_change(). |
236 | typedef struct trystate_S { |
237 | state_T ts_state; // state at this level, STATE_ |
238 | int ts_score; // score |
239 | idx_T ts_arridx; // index in tree array, start of node |
240 | short ts_curi; // index in list of child nodes |
241 | char_u ts_fidx; // index in fword[], case-folded bad word |
242 | char_u ts_fidxtry; // ts_fidx at which bytes may be changed |
243 | char_u ts_twordlen; // valid length of tword[] |
244 | char_u ts_prefixdepth; // stack depth for end of prefix or |
245 | // PFD_PREFIXTREE or PFD_NOPREFIX |
246 | char_u ts_flags; // TSF_ flags |
247 | char_u ts_tcharlen; // number of bytes in tword character |
248 | char_u ts_tcharidx; // current byte index in tword character |
249 | char_u ts_isdiff; // DIFF_ values |
250 | char_u ts_fcharstart; // index in fword where badword char started |
251 | char_u ts_prewordlen; // length of word in "preword[]" |
252 | char_u ts_splitoff; // index in "tword" after last split |
253 | char_u ts_splitfidx; // "ts_fidx" at word split |
254 | char_u ts_complen; // nr of compound words used |
255 | char_u ts_compsplit; // index for "compflags" where word was spit |
256 | char_u ts_save_badflags; // su_badflags saved here |
257 | char_u ts_delidx; // index in fword for char that was deleted, |
258 | // valid when "ts_flags" has TSF_DIDDEL |
259 | } trystate_T; |
260 | |
// Use our own character-case definitions, because the current locale may
// differ from what the .spl file uses.
// These must not be called with negative number!
// Each macro evaluates its argument more than once, so the argument must not
// have side effects.
#include <wchar.h>
#include <wctype.h>  // for towupper(), towlower() and iswupper()
// Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
// that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
// the "w" library function for characters above 255.

/// Case-folded value of character "c".
#define SPELL_TOFOLD(c) \
  (enc_utf8 && (c) >= 128 ? utf_fold(c) \
   : (c) < 256 ? (int)spelltab.st_fold[c] : (int)towlower((wint_t)(c)))

/// Upper-case value of character "c".
#define SPELL_TOUPPER(c) \
  (enc_utf8 && (c) >= 128 ? mb_toupper(c) \
   : (c) < 256 ? (int)spelltab.st_upper[c] : (int)towupper((wint_t)(c)))

/// Non-zero when character "c" is an upper-case character.
#define SPELL_ISUPPER(c) \
  (enc_utf8 && (c) >= 128 ? mb_isupper(c) \
   : (c) < 256 ? spelltab.st_isu[c] : iswupper((wint_t)(c)))
278 | |
279 | // First language that is loaded, start of the linked list of loaded |
280 | // languages. |
281 | extern slang_T *first_lang; |
282 | |
283 | // file used for "zG" and "zW" |
284 | extern char_u *int_wordlist; |
285 | |
286 | extern spelltab_T spelltab; |
287 | extern int did_set_spelltab; |
288 | |
289 | extern char *e_format; |
290 | |
291 | #endif // NVIM_SPELL_DEFS_H |
292 | |