1 | // This is an open source non-commercial project. Dear PVS-Studio, please check |
2 | // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com |
3 | |
4 | /// @file arabic.c |
5 | /// |
6 | /// Functions for Arabic language. |
7 | /// |
8 | /// Arabic characters are categorized into following types: |
9 | /// |
10 | /// Isolated - iso-8859-6 form char denoted with a_* |
11 | /// Initial - unicode form-B start char denoted with a_i_* |
12 | /// Medial - unicode form-B middle char denoted with a_m_* |
13 | /// Final - unicode form-B final char denoted with a_f_* |
14 | /// Stand-Alone - unicode form-B isolated char denoted with a_s_* (NOT USED) |
15 | |
16 | #include <stdbool.h> |
17 | |
18 | #include "nvim/vim.h" |
19 | #include "nvim/ascii.h" |
20 | #include "nvim/arabic.h" |
21 | |
// Arabic ISO-10646-1 character set definition
//
// Every a_* macro in this file names one code point.  The shaping functions
// further down switch on / compare against these names.

// Arabic ISO-8859-6 (subset of 10646; 0600 - 06FF)
// Punctuation marks.
#define a_COMMA 0x060C
#define a_SEMICOLON 0x061B
#define a_QUESTION 0x061F
// Letters.  a_HAMZA .. a_GHAIN and a_TATWEEL .. a_YEH are exactly the
// characters A_is_a() accepts and the chg_c_a2*() functions map.
#define a_HAMZA 0x0621
#define a_ALEF_MADDA 0x0622
#define a_ALEF_HAMZA_ABOVE 0x0623
#define a_WAW_HAMZA 0x0624
#define a_ALEF_HAMZA_BELOW 0x0625
#define a_YEH_HAMZA 0x0626
#define a_ALEF 0x0627
#define a_BEH 0x0628
#define a_TEH_MARBUTA 0x0629
#define a_TEH 0x062a
#define a_THEH 0x062b
#define a_JEEM 0x062c
#define a_HAH 0x062d
#define a_KHAH 0x062e
#define a_DAL 0x062f
#define a_THAL 0x0630
#define a_REH 0x0631
#define a_ZAIN 0x0632
#define a_SEEN 0x0633
#define a_SHEEN 0x0634
#define a_SAD 0x0635
#define a_DAD 0x0636
#define a_TAH 0x0637
#define a_ZAH 0x0638
#define a_AIN 0x0639
#define a_GHAIN 0x063a
#define a_TATWEEL 0x0640
#define a_FEH 0x0641
#define a_QAF 0x0642
#define a_KAF 0x0643
#define a_LAM 0x0644
#define a_MEEM 0x0645
#define a_NOON 0x0646
#define a_HEH 0x0647
#define a_WAW 0x0648
#define a_ALEF_MAKSURA 0x0649
#define a_YEH 0x064a

// Harakat / tanween: a_FATHATAN .. a_SUKUN is the range A_is_harakat() tests.
#define a_FATHATAN 0x064b
#define a_DAMMATAN 0x064c
#define a_KASRATAN 0x064d
#define a_FATHA 0x064e
#define a_DAMMA 0x064f
#define a_KASRA 0x0650
#define a_SHADDA 0x0651
#define a_SUKUN 0x0652

// a_HAMZA_BELOW is the upper bound of the second range tested by A_is_iso().
#define a_MADDA_ABOVE 0x0653
#define a_HAMZA_ABOVE 0x0654
#define a_HAMZA_BELOW 0x0655

// Digits and numeric punctuation.  Apart from a_MINI_ALEF none of these is
// accepted by A_is_iso().
#define a_ZERO 0x0660
#define a_ONE 0x0661
#define a_TWO 0x0662
#define a_THREE 0x0663
#define a_FOUR 0x0664
#define a_FIVE 0x0665
#define a_SIX 0x0666
#define a_SEVEN 0x0667
#define a_EIGHT 0x0668
#define a_NINE 0x0669
#define a_PERCENT 0x066a
#define a_DECIMAL 0x066b
#define a_THOUSANDS 0x066c
#define a_STAR 0x066d
#define a_MINI_ALEF 0x0670
// Rest of 8859-6 does not relate to Arabic
95 | |
// Arabic Presentation Form-B (subset of 10646; FE70 - FEFF)
//
// s -> isolated
// i -> initial
// m -> medial
// f -> final
//
// The prefix after "a_" tells which positional shape the code point is.
// Letters that only have two shapes get only a_s_* and a_f_* names.

// Harakat shapes.  0xfe73 and 0xfe75 are not given a name here.
#define a_s_FATHATAN 0xfe70
#define a_m_TATWEEL_FATHATAN 0xfe71
#define a_s_DAMMATAN 0xfe72

#define a_s_KASRATAN 0xfe74

#define a_s_FATHA 0xfe76
#define a_m_FATHA 0xfe77
#define a_s_DAMMA 0xfe78
#define a_m_DAMMA 0xfe79
#define a_s_KASRA 0xfe7a
#define a_m_KASRA 0xfe7b
#define a_s_SHADDA 0xfe7c
#define a_m_SHADDA 0xfe7d
#define a_s_SUKUN 0xfe7e
#define a_m_SUKUN 0xfe7f

// Letter shapes; these are the values the chg_c_*() functions return.
#define a_s_HAMZA 0xfe80
#define a_s_ALEF_MADDA 0xfe81
#define a_f_ALEF_MADDA 0xfe82
#define a_s_ALEF_HAMZA_ABOVE 0xfe83
#define a_f_ALEF_HAMZA_ABOVE 0xfe84
#define a_s_WAW_HAMZA 0xfe85
#define a_f_WAW_HAMZA 0xfe86
#define a_s_ALEF_HAMZA_BELOW 0xfe87
#define a_f_ALEF_HAMZA_BELOW 0xfe88
#define a_s_YEH_HAMZA 0xfe89
#define a_f_YEH_HAMZA 0xfe8a
#define a_i_YEH_HAMZA 0xfe8b
#define a_m_YEH_HAMZA 0xfe8c
#define a_s_ALEF 0xfe8d
#define a_f_ALEF 0xfe8e
#define a_s_BEH 0xfe8f
#define a_f_BEH 0xfe90
#define a_i_BEH 0xfe91
#define a_m_BEH 0xfe92
#define a_s_TEH_MARBUTA 0xfe93
#define a_f_TEH_MARBUTA 0xfe94
#define a_s_TEH 0xfe95
#define a_f_TEH 0xfe96
#define a_i_TEH 0xfe97
#define a_m_TEH 0xfe98
#define a_s_THEH 0xfe99
#define a_f_THEH 0xfe9a
#define a_i_THEH 0xfe9b
#define a_m_THEH 0xfe9c
#define a_s_JEEM 0xfe9d
#define a_f_JEEM 0xfe9e
#define a_i_JEEM 0xfe9f
#define a_m_JEEM 0xfea0
#define a_s_HAH 0xfea1
#define a_f_HAH 0xfea2
#define a_i_HAH 0xfea3
#define a_m_HAH 0xfea4
#define a_s_KHAH 0xfea5
#define a_f_KHAH 0xfea6
#define a_i_KHAH 0xfea7
#define a_m_KHAH 0xfea8
#define a_s_DAL 0xfea9
#define a_f_DAL 0xfeaa
#define a_s_THAL 0xfeab
#define a_f_THAL 0xfeac
#define a_s_REH 0xfead
#define a_f_REH 0xfeae
#define a_s_ZAIN 0xfeaf
#define a_f_ZAIN 0xfeb0
#define a_s_SEEN 0xfeb1
#define a_f_SEEN 0xfeb2
#define a_i_SEEN 0xfeb3
#define a_m_SEEN 0xfeb4
#define a_s_SHEEN 0xfeb5
#define a_f_SHEEN 0xfeb6
#define a_i_SHEEN 0xfeb7
#define a_m_SHEEN 0xfeb8
#define a_s_SAD 0xfeb9
#define a_f_SAD 0xfeba
#define a_i_SAD 0xfebb
#define a_m_SAD 0xfebc
#define a_s_DAD 0xfebd
#define a_f_DAD 0xfebe
#define a_i_DAD 0xfebf
#define a_m_DAD 0xfec0
#define a_s_TAH 0xfec1
#define a_f_TAH 0xfec2
#define a_i_TAH 0xfec3
#define a_m_TAH 0xfec4
#define a_s_ZAH 0xfec5
#define a_f_ZAH 0xfec6
#define a_i_ZAH 0xfec7
#define a_m_ZAH 0xfec8
#define a_s_AIN 0xfec9
#define a_f_AIN 0xfeca
#define a_i_AIN 0xfecb
#define a_m_AIN 0xfecc
#define a_s_GHAIN 0xfecd
#define a_f_GHAIN 0xfece
#define a_i_GHAIN 0xfecf
#define a_m_GHAIN 0xfed0
#define a_s_FEH 0xfed1
#define a_f_FEH 0xfed2
#define a_i_FEH 0xfed3
#define a_m_FEH 0xfed4
#define a_s_QAF 0xfed5
#define a_f_QAF 0xfed6
#define a_i_QAF 0xfed7
#define a_m_QAF 0xfed8
#define a_s_KAF 0xfed9
#define a_f_KAF 0xfeda
#define a_i_KAF 0xfedb
#define a_m_KAF 0xfedc
#define a_s_LAM 0xfedd
#define a_f_LAM 0xfede
#define a_i_LAM 0xfedf
#define a_m_LAM 0xfee0
#define a_s_MEEM 0xfee1
#define a_f_MEEM 0xfee2
#define a_i_MEEM 0xfee3
#define a_m_MEEM 0xfee4
#define a_s_NOON 0xfee5
#define a_f_NOON 0xfee6
#define a_i_NOON 0xfee7
#define a_m_NOON 0xfee8
#define a_s_HEH 0xfee9
#define a_f_HEH 0xfeea
#define a_i_HEH 0xfeeb
#define a_m_HEH 0xfeec
#define a_s_WAW 0xfeed
#define a_f_WAW 0xfeee
#define a_s_ALEF_MAKSURA 0xfeef
#define a_f_ALEF_MAKSURA 0xfef0
#define a_s_YEH 0xfef1
#define a_f_YEH 0xfef2
#define a_i_YEH 0xfef3
#define a_m_YEH 0xfef4
// LAM + ALEF ligatures, returned by chg_c_laa2i() and chg_c_laa2f().
#define a_s_LAM_ALEF_MADDA_ABOVE 0xfef5
#define a_f_LAM_ALEF_MADDA_ABOVE 0xfef6
#define a_s_LAM_ALEF_HAMZA_ABOVE 0xfef7
#define a_f_LAM_ALEF_HAMZA_ABOVE 0xfef8
#define a_s_LAM_ALEF_HAMZA_BELOW 0xfef9
#define a_f_LAM_ALEF_HAMZA_BELOW 0xfefa
#define a_s_LAM_ALEF 0xfefb
#define a_f_LAM_ALEF 0xfefc

// Accepted by A_is_formb() in addition to the shape ranges above.
#define a_BYTE_ORDER_MARK 0xfeff
246 | |
247 | |
248 | #ifdef INCLUDE_GENERATED_DECLARATIONS |
249 | # include "arabic.c.generated.h" |
250 | #endif |
251 | |
252 | // Returns true if c is an ISO-8859-6 shaped ARABIC letter (user entered). |
253 | static bool A_is_a(int cur_c) |
254 | { |
255 | switch (cur_c) { |
256 | case a_HAMZA: |
257 | case a_ALEF_MADDA: |
258 | case a_ALEF_HAMZA_ABOVE: |
259 | case a_WAW_HAMZA: |
260 | case a_ALEF_HAMZA_BELOW: |
261 | case a_YEH_HAMZA: |
262 | case a_ALEF: |
263 | case a_BEH: |
264 | case a_TEH_MARBUTA: |
265 | case a_TEH: |
266 | case a_THEH: |
267 | case a_JEEM: |
268 | case a_HAH: |
269 | case a_KHAH: |
270 | case a_DAL: |
271 | case a_THAL: |
272 | case a_REH: |
273 | case a_ZAIN: |
274 | case a_SEEN: |
275 | case a_SHEEN: |
276 | case a_SAD: |
277 | case a_DAD: |
278 | case a_TAH: |
279 | case a_ZAH: |
280 | case a_AIN: |
281 | case a_GHAIN: |
282 | case a_TATWEEL: |
283 | case a_FEH: |
284 | case a_QAF: |
285 | case a_KAF: |
286 | case a_LAM: |
287 | case a_MEEM: |
288 | case a_NOON: |
289 | case a_HEH: |
290 | case a_WAW: |
291 | case a_ALEF_MAKSURA: |
292 | case a_YEH: |
293 | return true; |
294 | } |
295 | |
296 | return false; |
297 | } |
298 | |
299 | // Returns true if c is an Isolated Form-B ARABIC letter |
300 | static bool A_is_s(int cur_c) |
301 | { |
302 | switch (cur_c) { |
303 | case a_s_HAMZA: |
304 | case a_s_ALEF_MADDA: |
305 | case a_s_ALEF_HAMZA_ABOVE: |
306 | case a_s_WAW_HAMZA: |
307 | case a_s_ALEF_HAMZA_BELOW: |
308 | case a_s_YEH_HAMZA: |
309 | case a_s_ALEF: |
310 | case a_s_BEH: |
311 | case a_s_TEH_MARBUTA: |
312 | case a_s_TEH: |
313 | case a_s_THEH: |
314 | case a_s_JEEM: |
315 | case a_s_HAH: |
316 | case a_s_KHAH: |
317 | case a_s_DAL: |
318 | case a_s_THAL: |
319 | case a_s_REH: |
320 | case a_s_ZAIN: |
321 | case a_s_SEEN: |
322 | case a_s_SHEEN: |
323 | case a_s_SAD: |
324 | case a_s_DAD: |
325 | case a_s_TAH: |
326 | case a_s_ZAH: |
327 | case a_s_AIN: |
328 | case a_s_GHAIN: |
329 | case a_s_FEH: |
330 | case a_s_QAF: |
331 | case a_s_KAF: |
332 | case a_s_LAM: |
333 | case a_s_MEEM: |
334 | case a_s_NOON: |
335 | case a_s_HEH: |
336 | case a_s_WAW: |
337 | case a_s_ALEF_MAKSURA: |
338 | case a_s_YEH: |
339 | return true; |
340 | } |
341 | |
342 | return false; |
343 | } |
344 | |
345 | // Returns true if c is a Final shape of an ARABIC letter |
346 | static bool A_is_f(int cur_c) |
347 | { |
348 | switch (cur_c) { |
349 | case a_f_ALEF_MADDA: |
350 | case a_f_ALEF_HAMZA_ABOVE: |
351 | case a_f_WAW_HAMZA: |
352 | case a_f_ALEF_HAMZA_BELOW: |
353 | case a_f_YEH_HAMZA: |
354 | case a_f_ALEF: |
355 | case a_f_BEH: |
356 | case a_f_TEH_MARBUTA: |
357 | case a_f_TEH: |
358 | case a_f_THEH: |
359 | case a_f_JEEM: |
360 | case a_f_HAH: |
361 | case a_f_KHAH: |
362 | case a_f_DAL: |
363 | case a_f_THAL: |
364 | case a_f_REH: |
365 | case a_f_ZAIN: |
366 | case a_f_SEEN: |
367 | case a_f_SHEEN: |
368 | case a_f_SAD: |
369 | case a_f_DAD: |
370 | case a_f_TAH: |
371 | case a_f_ZAH: |
372 | case a_f_AIN: |
373 | case a_f_GHAIN: |
374 | case a_f_FEH: |
375 | case a_f_QAF: |
376 | case a_f_KAF: |
377 | case a_f_LAM: |
378 | case a_f_MEEM: |
379 | case a_f_NOON: |
380 | case a_f_HEH: |
381 | case a_f_WAW: |
382 | case a_f_ALEF_MAKSURA: |
383 | case a_f_YEH: |
384 | case a_f_LAM_ALEF_MADDA_ABOVE: |
385 | case a_f_LAM_ALEF_HAMZA_ABOVE: |
386 | case a_f_LAM_ALEF_HAMZA_BELOW: |
387 | case a_f_LAM_ALEF: |
388 | return true; |
389 | } |
390 | return false; |
391 | } |
392 | |
393 | // Change shape - from ISO-8859-6/Isolated to Form-B Isolated |
394 | static int chg_c_a2s(int cur_c) |
395 | { |
396 | switch (cur_c) { |
397 | case a_HAMZA: return a_s_HAMZA; |
398 | case a_ALEF_MADDA: return a_s_ALEF_MADDA; |
399 | case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE; |
400 | case a_WAW_HAMZA: return a_s_WAW_HAMZA; |
401 | case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW; |
402 | case a_YEH_HAMZA: return a_s_YEH_HAMZA; |
403 | case a_ALEF: return a_s_ALEF; |
404 | case a_TEH_MARBUTA: return a_s_TEH_MARBUTA; |
405 | case a_DAL: return a_s_DAL; |
406 | case a_THAL: return a_s_THAL; |
407 | case a_REH: return a_s_REH; |
408 | case a_ZAIN: return a_s_ZAIN; |
409 | case a_TATWEEL: return cur_c; // exceptions |
410 | case a_WAW: return a_s_WAW; |
411 | case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA; |
412 | case a_BEH: return a_s_BEH; |
413 | case a_TEH: return a_s_TEH; |
414 | case a_THEH: return a_s_THEH; |
415 | case a_JEEM: return a_s_JEEM; |
416 | case a_HAH: return a_s_HAH; |
417 | case a_KHAH: return a_s_KHAH; |
418 | case a_SEEN: return a_s_SEEN; |
419 | case a_SHEEN: return a_s_SHEEN; |
420 | case a_SAD: return a_s_SAD; |
421 | case a_DAD: return a_s_DAD; |
422 | case a_TAH: return a_s_TAH; |
423 | case a_ZAH: return a_s_ZAH; |
424 | case a_AIN: return a_s_AIN; |
425 | case a_GHAIN: return a_s_GHAIN; |
426 | case a_FEH: return a_s_FEH; |
427 | case a_QAF: return a_s_QAF; |
428 | case a_KAF: return a_s_KAF; |
429 | case a_LAM: return a_s_LAM; |
430 | case a_MEEM: return a_s_MEEM; |
431 | case a_NOON: return a_s_NOON; |
432 | case a_HEH: return a_s_HEH; |
433 | case a_YEH: return a_s_YEH; |
434 | } |
435 | return 0; |
436 | } |
437 | |
438 | // Change shape - from ISO-8859-6/Isolated to Initial |
439 | static int chg_c_a2i(int cur_c) |
440 | { |
441 | switch (cur_c) { |
442 | case a_YEH_HAMZA: return a_i_YEH_HAMZA; |
443 | case a_HAMZA: return a_s_HAMZA; // exceptions |
444 | case a_ALEF_MADDA: return a_s_ALEF_MADDA; // exceptions |
445 | case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE; // exceptions |
446 | case a_WAW_HAMZA: return a_s_WAW_HAMZA; // exceptions |
447 | case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW; // exceptions |
448 | case a_ALEF: return a_s_ALEF; // exceptions |
449 | case a_TEH_MARBUTA: return a_s_TEH_MARBUTA; // exceptions |
450 | case a_DAL: return a_s_DAL; // exceptions |
451 | case a_THAL: return a_s_THAL; // exceptions |
452 | case a_REH: return a_s_REH; // exceptions |
453 | case a_ZAIN: return a_s_ZAIN; // exceptions |
454 | case a_TATWEEL: return cur_c; // exceptions |
455 | case a_WAW: return a_s_WAW; // exceptions |
456 | case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA; // exceptions |
457 | case a_BEH: return a_i_BEH; |
458 | case a_TEH: return a_i_TEH; |
459 | case a_THEH: return a_i_THEH; |
460 | case a_JEEM: return a_i_JEEM; |
461 | case a_HAH: return a_i_HAH; |
462 | case a_KHAH: return a_i_KHAH; |
463 | case a_SEEN: return a_i_SEEN; |
464 | case a_SHEEN: return a_i_SHEEN; |
465 | case a_SAD: return a_i_SAD; |
466 | case a_DAD: return a_i_DAD; |
467 | case a_TAH: return a_i_TAH; |
468 | case a_ZAH: return a_i_ZAH; |
469 | case a_AIN: return a_i_AIN; |
470 | case a_GHAIN: return a_i_GHAIN; |
471 | case a_FEH: return a_i_FEH; |
472 | case a_QAF: return a_i_QAF; |
473 | case a_KAF: return a_i_KAF; |
474 | case a_LAM: return a_i_LAM; |
475 | case a_MEEM: return a_i_MEEM; |
476 | case a_NOON: return a_i_NOON; |
477 | case a_HEH: return a_i_HEH; |
478 | case a_YEH: return a_i_YEH; |
479 | } |
480 | return 0; |
481 | } |
482 | |
483 | // Change shape - from ISO-8859-6/Isolated to Medial |
484 | static int chg_c_a2m(int cur_c) |
485 | { |
486 | switch (cur_c) { |
487 | case a_HAMZA: return a_s_HAMZA; // exception |
488 | case a_ALEF_MADDA: return a_f_ALEF_MADDA; // exception |
489 | case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE; // exception |
490 | case a_WAW_HAMZA: return a_f_WAW_HAMZA; // exception |
491 | case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW; // exception |
492 | case a_YEH_HAMZA: return a_m_YEH_HAMZA; |
493 | case a_ALEF: return a_f_ALEF; // exception |
494 | case a_BEH: return a_m_BEH; |
495 | case a_TEH_MARBUTA: return a_f_TEH_MARBUTA; // exception |
496 | case a_TEH: return a_m_TEH; |
497 | case a_THEH: return a_m_THEH; |
498 | case a_JEEM: return a_m_JEEM; |
499 | case a_HAH: return a_m_HAH; |
500 | case a_KHAH: return a_m_KHAH; |
501 | case a_DAL: return a_f_DAL; // exception |
502 | case a_THAL: return a_f_THAL; // exception |
503 | case a_REH: return a_f_REH; // exception |
504 | case a_ZAIN: return a_f_ZAIN; // exception |
505 | case a_SEEN: return a_m_SEEN; |
506 | case a_SHEEN: return a_m_SHEEN; |
507 | case a_SAD: return a_m_SAD; |
508 | case a_DAD: return a_m_DAD; |
509 | case a_TAH: return a_m_TAH; |
510 | case a_ZAH: return a_m_ZAH; |
511 | case a_AIN: return a_m_AIN; |
512 | case a_GHAIN: return a_m_GHAIN; |
513 | case a_TATWEEL: return cur_c; // exception |
514 | case a_FEH: return a_m_FEH; |
515 | case a_QAF: return a_m_QAF; |
516 | case a_KAF: return a_m_KAF; |
517 | case a_LAM: return a_m_LAM; |
518 | case a_MEEM: return a_m_MEEM; |
519 | case a_NOON: return a_m_NOON; |
520 | case a_HEH: return a_m_HEH; |
521 | case a_WAW: return a_f_WAW; // exception |
522 | case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA; // exception |
523 | case a_YEH: return a_m_YEH; |
524 | } |
525 | return 0; |
526 | } |
527 | |
528 | // Change shape - from ISO-8859-6/Isolated to final |
529 | static int chg_c_a2f(int cur_c) |
530 | { |
531 | // NOTE: these encodings need to be accounted for |
532 | // |
533 | // a_f_ALEF_MADDA; |
534 | // a_f_ALEF_HAMZA_ABOVE; |
535 | // a_f_ALEF_HAMZA_BELOW; |
536 | // a_f_LAM_ALEF_MADDA_ABOVE; |
537 | // a_f_LAM_ALEF_HAMZA_ABOVE; |
538 | // a_f_LAM_ALEF_HAMZA_BELOW; |
539 | |
540 | switch (cur_c) { |
541 | case a_HAMZA: return a_s_HAMZA; // exception |
542 | case a_ALEF_MADDA: return a_f_ALEF_MADDA; |
543 | case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE; |
544 | case a_WAW_HAMZA: return a_f_WAW_HAMZA; |
545 | case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW; |
546 | case a_YEH_HAMZA: return a_f_YEH_HAMZA; |
547 | case a_ALEF: return a_f_ALEF; |
548 | case a_BEH: return a_f_BEH; |
549 | case a_TEH_MARBUTA: return a_f_TEH_MARBUTA; |
550 | case a_TEH: return a_f_TEH; |
551 | case a_THEH: return a_f_THEH; |
552 | case a_JEEM: return a_f_JEEM; |
553 | case a_HAH: return a_f_HAH; |
554 | case a_KHAH: return a_f_KHAH; |
555 | case a_DAL: return a_f_DAL; |
556 | case a_THAL: return a_f_THAL; |
557 | case a_REH: return a_f_REH; |
558 | case a_ZAIN: return a_f_ZAIN; |
559 | case a_SEEN: return a_f_SEEN; |
560 | case a_SHEEN: return a_f_SHEEN; |
561 | case a_SAD: return a_f_SAD; |
562 | case a_DAD: return a_f_DAD; |
563 | case a_TAH: return a_f_TAH; |
564 | case a_ZAH: return a_f_ZAH; |
565 | case a_AIN: return a_f_AIN; |
566 | case a_GHAIN: return a_f_GHAIN; |
567 | case a_TATWEEL: return cur_c; // exception |
568 | case a_FEH: return a_f_FEH; |
569 | case a_QAF: return a_f_QAF; |
570 | case a_KAF: return a_f_KAF; |
571 | case a_LAM: return a_f_LAM; |
572 | case a_MEEM: return a_f_MEEM; |
573 | case a_NOON: return a_f_NOON; |
574 | case a_HEH: return a_f_HEH; |
575 | case a_WAW: return a_f_WAW; |
576 | case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA; |
577 | case a_YEH: return a_f_YEH; |
578 | } |
579 | return 0; |
580 | } |
581 | |
// Change shape - from Initial to Medial.
// Returns the medial shape for the initial shape "cur_c", or 0 when "cur_c"
// is not in the table.
// This code is unreachable, because for the relevant characters ARABIC_CHAR()
// is FALSE; it is therefore compiled out.
#if 0
static int chg_c_i2m(int cur_c)
{
  static const struct {
    int initial;  // initial shape
    int medial;   // matching medial shape
  } shapes[] = {
    { a_i_YEH_HAMZA, a_m_YEH_HAMZA },
    { a_i_BEH, a_m_BEH },
    { a_i_TEH, a_m_TEH },
    { a_i_THEH, a_m_THEH },
    { a_i_JEEM, a_m_JEEM },
    { a_i_HAH, a_m_HAH },
    { a_i_KHAH, a_m_KHAH },
    { a_i_SEEN, a_m_SEEN },
    { a_i_SHEEN, a_m_SHEEN },
    { a_i_SAD, a_m_SAD },
    { a_i_DAD, a_m_DAD },
    { a_i_TAH, a_m_TAH },
    { a_i_ZAH, a_m_ZAH },
    { a_i_AIN, a_m_AIN },
    { a_i_GHAIN, a_m_GHAIN },
    { a_i_FEH, a_m_FEH },
    { a_i_QAF, a_m_QAF },
    { a_i_KAF, a_m_KAF },
    { a_i_LAM, a_m_LAM },
    { a_i_MEEM, a_m_MEEM },
    { a_i_NOON, a_m_NOON },
    { a_i_HEH, a_m_HEH },
    { a_i_YEH, a_m_YEH },
  };

  for (unsigned i = 0; i < sizeof(shapes) / sizeof(shapes[0]); i++) {
    if (shapes[i].initial == cur_c) {
      return shapes[i].medial;
    }
  }
  return 0;
}
#endif
616 | |
617 | // Change shape - from Final to Medial |
618 | static int chg_c_f2m(int cur_c) |
619 | { |
620 | switch (cur_c) { |
621 | // NOTE: these encodings are multi-positional, no ? |
622 | // case a_f_ALEF_MADDA: |
623 | // case a_f_ALEF_HAMZA_ABOVE: |
624 | // case a_f_ALEF_HAMZA_BELOW: |
625 | case a_f_YEH_HAMZA: return a_m_YEH_HAMZA; |
626 | case a_f_WAW_HAMZA: // exceptions |
627 | case a_f_ALEF: |
628 | case a_f_TEH_MARBUTA: |
629 | case a_f_DAL: |
630 | case a_f_THAL: |
631 | case a_f_REH: |
632 | case a_f_ZAIN: |
633 | case a_f_WAW: |
634 | case a_f_ALEF_MAKSURA: |
635 | return cur_c; |
636 | case a_f_BEH: return a_m_BEH; |
637 | case a_f_TEH: return a_m_TEH; |
638 | case a_f_THEH: return a_m_THEH; |
639 | case a_f_JEEM: return a_m_JEEM; |
640 | case a_f_HAH: return a_m_HAH; |
641 | case a_f_KHAH: return a_m_KHAH; |
642 | case a_f_SEEN: return a_m_SEEN; |
643 | case a_f_SHEEN: return a_m_SHEEN; |
644 | case a_f_SAD: return a_m_SAD; |
645 | case a_f_DAD: return a_m_DAD; |
646 | case a_f_TAH: return a_m_TAH; |
647 | case a_f_ZAH: return a_m_ZAH; |
648 | case a_f_AIN: return a_m_AIN; |
649 | case a_f_GHAIN: return a_m_GHAIN; |
650 | case a_f_FEH: return a_m_FEH; |
651 | case a_f_QAF: return a_m_QAF; |
652 | case a_f_KAF: return a_m_KAF; |
653 | case a_f_LAM: return a_m_LAM; |
654 | case a_f_MEEM: return a_m_MEEM; |
655 | case a_f_NOON: return a_m_NOON; |
656 | case a_f_HEH: return a_m_HEH; |
657 | case a_f_YEH: return a_m_YEH; |
658 | // NOTE: these encodings are multi-positional, no ? |
659 | // case a_f_LAM_ALEF_MADDA_ABOVE: |
660 | // case a_f_LAM_ALEF_HAMZA_ABOVE: |
661 | // case a_f_LAM_ALEF_HAMZA_BELOW: |
662 | // case a_f_LAM_ALEF: |
663 | } |
664 | return 0; |
665 | } |
666 | |
667 | // Change shape - from Combination (2 char) to an Isolated. |
668 | static int chg_c_laa2i(int hid_c) |
669 | { |
670 | switch (hid_c) { |
671 | case a_ALEF_MADDA: return a_s_LAM_ALEF_MADDA_ABOVE; |
672 | case a_ALEF_HAMZA_ABOVE: return a_s_LAM_ALEF_HAMZA_ABOVE; |
673 | case a_ALEF_HAMZA_BELOW: return a_s_LAM_ALEF_HAMZA_BELOW; |
674 | case a_ALEF: return a_s_LAM_ALEF; |
675 | } |
676 | return 0; |
677 | } |
678 | |
679 | // Change shape - from Combination-Isolated to Final. |
680 | static int chg_c_laa2f(int hid_c) |
681 | { |
682 | switch (hid_c) { |
683 | case a_ALEF_MADDA: return a_f_LAM_ALEF_MADDA_ABOVE; |
684 | case a_ALEF_HAMZA_ABOVE: return a_f_LAM_ALEF_HAMZA_ABOVE; |
685 | case a_ALEF_HAMZA_BELOW: return a_f_LAM_ALEF_HAMZA_BELOW; |
686 | case a_ALEF: return a_f_LAM_ALEF; |
687 | } |
688 | return 0; |
689 | } |
690 | |
// Do "half-shaping" on character "c": a base letter gets its initial shape,
// a valid final shape gets its medial shape.
// Returns zero if no shaping was done.
static int half_shape(int c)
{
  int shaped = 0;

  if (A_is_a(c)) {
    shaped = chg_c_a2i(c);
  } else if (A_is_valid(c) && A_is_f(c)) {
    shaped = chg_c_f2m(c);
  }
  return shaped;
}
703 | |
704 | // Do Arabic shaping on character "c". Returns the shaped character. |
705 | // out: "ccp" points to the first byte of the character to be shaped. |
706 | // in/out: "c1p" points to the first composing char for "c". |
707 | // in: "prev_c" is the previous character (not shaped) |
708 | // in: "prev_c1" is the first composing char for the previous char |
709 | // (not shaped) |
710 | // in: "next_c" is the next character (not shaped). |
711 | int arabic_shape(int c, int *ccp, int *c1p, int prev_c, int prev_c1, |
712 | int next_c) |
713 | { |
714 | // Deal only with Arabic character, pass back all others |
715 | if (!A_is_ok(c)) { |
716 | return c; |
717 | } |
718 | |
719 | // half-shape current and previous character |
720 | int shape_c = half_shape(prev_c); |
721 | |
722 | // Save away current character |
723 | int curr_c = c; |
724 | |
725 | int curr_laa = A_firstc_laa(c, *c1p); |
726 | int prev_laa = A_firstc_laa(prev_c, prev_c1); |
727 | |
728 | if (curr_laa) { |
729 | if (A_is_valid(prev_c) && !A_is_f(shape_c) && !A_is_s(shape_c) |
730 | && !prev_laa) { |
731 | curr_c = chg_c_laa2f(curr_laa); |
732 | } else { |
733 | curr_c = chg_c_laa2i(curr_laa); |
734 | } |
735 | |
736 | // Remove the composing character |
737 | *c1p = 0; |
738 | } else if (!A_is_valid(prev_c) && A_is_valid(next_c)) { |
739 | curr_c = chg_c_a2i(c); |
740 | } else if (!shape_c || A_is_f(shape_c) || A_is_s(shape_c) || prev_laa) { |
741 | curr_c = A_is_valid(next_c) ? chg_c_a2i(c) : chg_c_a2s(c); |
742 | } else if (A_is_valid(next_c)) { |
743 | #if 0 |
744 | curr_c = A_is_iso(c) ? chg_c_a2m(c) : chg_c_i2m(c); |
745 | #else |
746 | curr_c = A_is_iso(c) ? chg_c_a2m(c) : 0; |
747 | #endif |
748 | } else if (A_is_valid(prev_c)) { |
749 | curr_c = chg_c_a2f(c); |
750 | } else { |
751 | curr_c = chg_c_a2s(c); |
752 | } |
753 | |
754 | // Sanity check -- curr_c should, in the future, never be 0. |
755 | // We should, in the future, insert a fatal error here. |
756 | if (curr_c == NUL) { |
757 | curr_c = c; |
758 | } |
759 | |
760 | if ((curr_c != c) && (ccp != NULL)) { |
761 | char_u buf[MB_MAXBYTES + 1]; |
762 | |
763 | // Update the first byte of the character |
764 | utf_char2bytes(curr_c, buf); |
765 | *ccp = buf[0]; |
766 | } |
767 | |
768 | // Return the shaped character |
769 | return curr_c; |
770 | } |
771 | |
772 | /// Check whether we are dealing with Arabic combining characters. |
773 | /// Note: these are NOT really composing characters! |
774 | /// |
775 | /// @param one First character. |
776 | /// @param two Character just after "one". |
777 | bool arabic_combine(int one, int two) |
778 | { |
779 | if (one == a_LAM) { |
780 | return arabic_maycombine(two); |
781 | } |
782 | return false; |
783 | } |
784 | |
785 | /// Check whether we are dealing with a character that could be regarded as an |
786 | /// Arabic combining character, need to check the character before this. |
787 | bool arabic_maycombine(int two) |
788 | { |
789 | if (p_arshape && !p_tbidi) { |
790 | return two == a_ALEF_MADDA |
791 | || two == a_ALEF_HAMZA_ABOVE |
792 | || two == a_ALEF_HAMZA_BELOW |
793 | || two == a_ALEF; |
794 | } |
795 | return false; |
796 | } |
797 | |
798 | // A_firstc_laa returns first character of LAA combination if it ex.ists |
799 | // in: "c" base character |
800 | // in: "c1" first composing character |
801 | static int A_firstc_laa(int c, int c1) |
802 | { |
803 | if ((c1 != NUL) && (c == a_LAM) && !A_is_harakat(c1)) { |
804 | return c1; |
805 | } |
806 | return 0; |
807 | } |
808 | |
809 | // A_is_harakat returns true if 'c' is an Arabic Harakat character. |
810 | // (harakat/tanween) |
811 | static bool A_is_harakat(int c) |
812 | { |
813 | return c >= a_FATHATAN && c <= a_SUKUN; |
814 | } |
815 | |
816 | // A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character. |
817 | // (alphabet/number/punctuation) |
818 | static bool A_is_iso(int c) |
819 | { |
820 | return ((c >= a_HAMZA && c <= a_GHAIN) |
821 | || (c >= a_TATWEEL && c <= a_HAMZA_BELOW) |
822 | || c == a_MINI_ALEF); |
823 | } |
824 | |
825 | // A_is_formb returns true if 'c' is an Arabic 10646-1 FormB character. |
826 | // (alphabet/number/punctuation) |
827 | static bool A_is_formb(int c) |
828 | { |
829 | return ((c >= a_s_FATHATAN && c <= a_s_DAMMATAN) |
830 | || c == a_s_KASRATAN |
831 | || (c >= a_s_FATHA && c <= a_f_LAM_ALEF) |
832 | || c == a_BYTE_ORDER_MARK); |
833 | } |
834 | |
// A_is_ok returns true if 'c' is an Arabic 10646 character, either from the
// 8859-6 set (A_is_iso()) or from Form-B (A_is_formb()).
static bool A_is_ok(int c)
{
  if (A_is_iso(c)) {
    return true;
  }
  return A_is_formb(c);
}
840 | |
// A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
// character, excluding the specials reported by A_is_special().
static bool A_is_valid(int c)
{
  if (!A_is_ok(c)) {
    return false;
  }
  return !A_is_special(c);
}
847 | |
848 | // A_is_special returns true if 'c' is not a special Arabic character. |
849 | // Specials don't adhere to most of the rules. |
850 | static bool A_is_special(int c) |
851 | { |
852 | return c == a_HAMZA || c == a_s_HAMZA; |
853 | } |
854 | |