1// This is an open source non-commercial project. Dear PVS-Studio, please check
2// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4/// @file arabic.c
5///
6/// Functions for Arabic language.
7///
8/// Arabic characters are categorized into following types:
9///
10/// Isolated - iso-8859-6 form char denoted with a_*
11/// Initial - unicode form-B start char denoted with a_i_*
12/// Medial - unicode form-B middle char denoted with a_m_*
13/// Final - unicode form-B final char denoted with a_f_*
14/// Stand-Alone - unicode form-B isolated char denoted with a_s_* (NOT USED)
15
16#include <stdbool.h>
17
18#include "nvim/vim.h"
19#include "nvim/ascii.h"
20#include "nvim/arabic.h"
21
// Arabic ISO-10646-1 character set definition

// Arabic ISO-8859-6 (subset of 10646; 0600 - 06FF)
//
// Code points of the Arabic characters in the form denoted a_* (the
// ISO-8859-6 form).  They are enum constants rather than macros: the names
// stay visible to a debugger and cannot be altered by macro expansion, while
// still being integer constant expressions usable as `case` labels and array
// bounds.
enum {
  // Punctuation
  a_COMMA = 0x060C,
  a_SEMICOLON = 0x061B,
  a_QUESTION = 0x061F,

  // Letters, first contiguous run (a_HAMZA .. a_GHAIN)
  a_HAMZA = 0x0621,
  a_ALEF_MADDA = 0x0622,
  a_ALEF_HAMZA_ABOVE = 0x0623,
  a_WAW_HAMZA = 0x0624,
  a_ALEF_HAMZA_BELOW = 0x0625,
  a_YEH_HAMZA = 0x0626,
  a_ALEF = 0x0627,
  a_BEH = 0x0628,
  a_TEH_MARBUTA = 0x0629,
  a_TEH = 0x062a,
  a_THEH = 0x062b,
  a_JEEM = 0x062c,
  a_HAH = 0x062d,
  a_KHAH = 0x062e,
  a_DAL = 0x062f,
  a_THAL = 0x0630,
  a_REH = 0x0631,
  a_ZAIN = 0x0632,
  a_SEEN = 0x0633,
  a_SHEEN = 0x0634,
  a_SAD = 0x0635,
  a_DAD = 0x0636,
  a_TAH = 0x0637,
  a_ZAH = 0x0638,
  a_AIN = 0x0639,
  a_GHAIN = 0x063a,

  // Letters, second contiguous run (a_TATWEEL .. a_YEH)
  a_TATWEEL = 0x0640,
  a_FEH = 0x0641,
  a_QAF = 0x0642,
  a_KAF = 0x0643,
  a_LAM = 0x0644,
  a_MEEM = 0x0645,
  a_NOON = 0x0646,
  a_HEH = 0x0647,
  a_WAW = 0x0648,
  a_ALEF_MAKSURA = 0x0649,
  a_YEH = 0x064a,

  // Harakat / tanween (a_FATHATAN .. a_SUKUN)
  a_FATHATAN = 0x064b,
  a_DAMMATAN = 0x064c,
  a_KASRATAN = 0x064d,
  a_FATHA = 0x064e,
  a_DAMMA = 0x064f,
  a_KASRA = 0x0650,
  a_SHADDA = 0x0651,
  a_SUKUN = 0x0652,

  a_MADDA_ABOVE = 0x0653,
  a_HAMZA_ABOVE = 0x0654,
  a_HAMZA_BELOW = 0x0655,

  // Digits and numeric punctuation
  a_ZERO = 0x0660,
  a_ONE = 0x0661,
  a_TWO = 0x0662,
  a_THREE = 0x0663,
  a_FOUR = 0x0664,
  a_FIVE = 0x0665,
  a_SIX = 0x0666,
  a_SEVEN = 0x0667,
  a_EIGHT = 0x0668,
  a_NINE = 0x0669,
  a_PERCENT = 0x066a,
  a_DECIMAL = 0x066b,
  a_THOUSANDS = 0x066c,
  a_STAR = 0x066d,
  a_MINI_ALEF = 0x0670,
};
// Rest of 8859-6 does not relate to Arabic
95
// Arabic Presentation Form-B (subset of 10646; FE70 - FEFF)
//
// s -> isolated
// i -> initial
// m -> medial
// f -> final
//
// Declared as enum constants rather than macros: the names stay visible to a
// debugger and cannot be altered by macro expansion, while still being
// integer constant expressions usable as `case` labels and array bounds.
enum {
  a_s_FATHATAN = 0xfe70,
  a_m_TATWEEL_FATHATAN = 0xfe71,
  a_s_DAMMATAN = 0xfe72,

  a_s_KASRATAN = 0xfe74,

  a_s_FATHA = 0xfe76,
  a_m_FATHA = 0xfe77,
  a_s_DAMMA = 0xfe78,
  a_m_DAMMA = 0xfe79,
  a_s_KASRA = 0xfe7a,
  a_m_KASRA = 0xfe7b,
  a_s_SHADDA = 0xfe7c,
  a_m_SHADDA = 0xfe7d,
  a_s_SUKUN = 0xfe7e,
  a_m_SUKUN = 0xfe7f,

  a_s_HAMZA = 0xfe80,
  a_s_ALEF_MADDA = 0xfe81,
  a_f_ALEF_MADDA = 0xfe82,
  a_s_ALEF_HAMZA_ABOVE = 0xfe83,
  a_f_ALEF_HAMZA_ABOVE = 0xfe84,
  a_s_WAW_HAMZA = 0xfe85,
  a_f_WAW_HAMZA = 0xfe86,
  a_s_ALEF_HAMZA_BELOW = 0xfe87,
  a_f_ALEF_HAMZA_BELOW = 0xfe88,
  a_s_YEH_HAMZA = 0xfe89,
  a_f_YEH_HAMZA = 0xfe8a,
  a_i_YEH_HAMZA = 0xfe8b,
  a_m_YEH_HAMZA = 0xfe8c,
  a_s_ALEF = 0xfe8d,
  a_f_ALEF = 0xfe8e,
  a_s_BEH = 0xfe8f,
  a_f_BEH = 0xfe90,
  a_i_BEH = 0xfe91,
  a_m_BEH = 0xfe92,
  a_s_TEH_MARBUTA = 0xfe93,
  a_f_TEH_MARBUTA = 0xfe94,
  a_s_TEH = 0xfe95,
  a_f_TEH = 0xfe96,
  a_i_TEH = 0xfe97,
  a_m_TEH = 0xfe98,
  a_s_THEH = 0xfe99,
  a_f_THEH = 0xfe9a,
  a_i_THEH = 0xfe9b,
  a_m_THEH = 0xfe9c,
  a_s_JEEM = 0xfe9d,
  a_f_JEEM = 0xfe9e,
  a_i_JEEM = 0xfe9f,
  a_m_JEEM = 0xfea0,
  a_s_HAH = 0xfea1,
  a_f_HAH = 0xfea2,
  a_i_HAH = 0xfea3,
  a_m_HAH = 0xfea4,
  a_s_KHAH = 0xfea5,
  a_f_KHAH = 0xfea6,
  a_i_KHAH = 0xfea7,
  a_m_KHAH = 0xfea8,
  a_s_DAL = 0xfea9,
  a_f_DAL = 0xfeaa,
  a_s_THAL = 0xfeab,
  a_f_THAL = 0xfeac,
  a_s_REH = 0xfead,
  a_f_REH = 0xfeae,
  a_s_ZAIN = 0xfeaf,
  a_f_ZAIN = 0xfeb0,
  a_s_SEEN = 0xfeb1,
  a_f_SEEN = 0xfeb2,
  a_i_SEEN = 0xfeb3,
  a_m_SEEN = 0xfeb4,
  a_s_SHEEN = 0xfeb5,
  a_f_SHEEN = 0xfeb6,
  a_i_SHEEN = 0xfeb7,
  a_m_SHEEN = 0xfeb8,
  a_s_SAD = 0xfeb9,
  a_f_SAD = 0xfeba,
  a_i_SAD = 0xfebb,
  a_m_SAD = 0xfebc,
  a_s_DAD = 0xfebd,
  a_f_DAD = 0xfebe,
  a_i_DAD = 0xfebf,
  a_m_DAD = 0xfec0,
  a_s_TAH = 0xfec1,
  a_f_TAH = 0xfec2,
  a_i_TAH = 0xfec3,
  a_m_TAH = 0xfec4,
  a_s_ZAH = 0xfec5,
  a_f_ZAH = 0xfec6,
  a_i_ZAH = 0xfec7,
  a_m_ZAH = 0xfec8,
  a_s_AIN = 0xfec9,
  a_f_AIN = 0xfeca,
  a_i_AIN = 0xfecb,
  a_m_AIN = 0xfecc,
  a_s_GHAIN = 0xfecd,
  a_f_GHAIN = 0xfece,
  a_i_GHAIN = 0xfecf,
  a_m_GHAIN = 0xfed0,
  a_s_FEH = 0xfed1,
  a_f_FEH = 0xfed2,
  a_i_FEH = 0xfed3,
  a_m_FEH = 0xfed4,
  a_s_QAF = 0xfed5,
  a_f_QAF = 0xfed6,
  a_i_QAF = 0xfed7,
  a_m_QAF = 0xfed8,
  a_s_KAF = 0xfed9,
  a_f_KAF = 0xfeda,
  a_i_KAF = 0xfedb,
  a_m_KAF = 0xfedc,
  a_s_LAM = 0xfedd,
  a_f_LAM = 0xfede,
  a_i_LAM = 0xfedf,
  a_m_LAM = 0xfee0,
  a_s_MEEM = 0xfee1,
  a_f_MEEM = 0xfee2,
  a_i_MEEM = 0xfee3,
  a_m_MEEM = 0xfee4,
  a_s_NOON = 0xfee5,
  a_f_NOON = 0xfee6,
  a_i_NOON = 0xfee7,
  a_m_NOON = 0xfee8,
  a_s_HEH = 0xfee9,
  a_f_HEH = 0xfeea,
  a_i_HEH = 0xfeeb,
  a_m_HEH = 0xfeec,
  a_s_WAW = 0xfeed,
  a_f_WAW = 0xfeee,
  a_s_ALEF_MAKSURA = 0xfeef,
  a_f_ALEF_MAKSURA = 0xfef0,
  a_s_YEH = 0xfef1,
  a_f_YEH = 0xfef2,
  a_i_YEH = 0xfef3,
  a_m_YEH = 0xfef4,
  a_s_LAM_ALEF_MADDA_ABOVE = 0xfef5,
  a_f_LAM_ALEF_MADDA_ABOVE = 0xfef6,
  a_s_LAM_ALEF_HAMZA_ABOVE = 0xfef7,
  a_f_LAM_ALEF_HAMZA_ABOVE = 0xfef8,
  a_s_LAM_ALEF_HAMZA_BELOW = 0xfef9,
  a_f_LAM_ALEF_HAMZA_BELOW = 0xfefa,
  a_s_LAM_ALEF = 0xfefb,
  a_f_LAM_ALEF = 0xfefc,

  a_BYTE_ORDER_MARK = 0xfeff,
};
246
247
248#ifdef INCLUDE_GENERATED_DECLARATIONS
249# include "arabic.c.generated.h"
250#endif
251
252// Returns true if c is an ISO-8859-6 shaped ARABIC letter (user entered).
253static bool A_is_a(int cur_c)
254{
255 switch (cur_c) {
256 case a_HAMZA:
257 case a_ALEF_MADDA:
258 case a_ALEF_HAMZA_ABOVE:
259 case a_WAW_HAMZA:
260 case a_ALEF_HAMZA_BELOW:
261 case a_YEH_HAMZA:
262 case a_ALEF:
263 case a_BEH:
264 case a_TEH_MARBUTA:
265 case a_TEH:
266 case a_THEH:
267 case a_JEEM:
268 case a_HAH:
269 case a_KHAH:
270 case a_DAL:
271 case a_THAL:
272 case a_REH:
273 case a_ZAIN:
274 case a_SEEN:
275 case a_SHEEN:
276 case a_SAD:
277 case a_DAD:
278 case a_TAH:
279 case a_ZAH:
280 case a_AIN:
281 case a_GHAIN:
282 case a_TATWEEL:
283 case a_FEH:
284 case a_QAF:
285 case a_KAF:
286 case a_LAM:
287 case a_MEEM:
288 case a_NOON:
289 case a_HEH:
290 case a_WAW:
291 case a_ALEF_MAKSURA:
292 case a_YEH:
293 return true;
294 }
295
296 return false;
297}
298
299// Returns true if c is an Isolated Form-B ARABIC letter
300static bool A_is_s(int cur_c)
301{
302 switch (cur_c) {
303 case a_s_HAMZA:
304 case a_s_ALEF_MADDA:
305 case a_s_ALEF_HAMZA_ABOVE:
306 case a_s_WAW_HAMZA:
307 case a_s_ALEF_HAMZA_BELOW:
308 case a_s_YEH_HAMZA:
309 case a_s_ALEF:
310 case a_s_BEH:
311 case a_s_TEH_MARBUTA:
312 case a_s_TEH:
313 case a_s_THEH:
314 case a_s_JEEM:
315 case a_s_HAH:
316 case a_s_KHAH:
317 case a_s_DAL:
318 case a_s_THAL:
319 case a_s_REH:
320 case a_s_ZAIN:
321 case a_s_SEEN:
322 case a_s_SHEEN:
323 case a_s_SAD:
324 case a_s_DAD:
325 case a_s_TAH:
326 case a_s_ZAH:
327 case a_s_AIN:
328 case a_s_GHAIN:
329 case a_s_FEH:
330 case a_s_QAF:
331 case a_s_KAF:
332 case a_s_LAM:
333 case a_s_MEEM:
334 case a_s_NOON:
335 case a_s_HEH:
336 case a_s_WAW:
337 case a_s_ALEF_MAKSURA:
338 case a_s_YEH:
339 return true;
340 }
341
342 return false;
343}
344
345// Returns true if c is a Final shape of an ARABIC letter
346static bool A_is_f(int cur_c)
347{
348 switch (cur_c) {
349 case a_f_ALEF_MADDA:
350 case a_f_ALEF_HAMZA_ABOVE:
351 case a_f_WAW_HAMZA:
352 case a_f_ALEF_HAMZA_BELOW:
353 case a_f_YEH_HAMZA:
354 case a_f_ALEF:
355 case a_f_BEH:
356 case a_f_TEH_MARBUTA:
357 case a_f_TEH:
358 case a_f_THEH:
359 case a_f_JEEM:
360 case a_f_HAH:
361 case a_f_KHAH:
362 case a_f_DAL:
363 case a_f_THAL:
364 case a_f_REH:
365 case a_f_ZAIN:
366 case a_f_SEEN:
367 case a_f_SHEEN:
368 case a_f_SAD:
369 case a_f_DAD:
370 case a_f_TAH:
371 case a_f_ZAH:
372 case a_f_AIN:
373 case a_f_GHAIN:
374 case a_f_FEH:
375 case a_f_QAF:
376 case a_f_KAF:
377 case a_f_LAM:
378 case a_f_MEEM:
379 case a_f_NOON:
380 case a_f_HEH:
381 case a_f_WAW:
382 case a_f_ALEF_MAKSURA:
383 case a_f_YEH:
384 case a_f_LAM_ALEF_MADDA_ABOVE:
385 case a_f_LAM_ALEF_HAMZA_ABOVE:
386 case a_f_LAM_ALEF_HAMZA_BELOW:
387 case a_f_LAM_ALEF:
388 return true;
389 }
390 return false;
391}
392
393// Change shape - from ISO-8859-6/Isolated to Form-B Isolated
394static int chg_c_a2s(int cur_c)
395{
396 switch (cur_c) {
397 case a_HAMZA: return a_s_HAMZA;
398 case a_ALEF_MADDA: return a_s_ALEF_MADDA;
399 case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE;
400 case a_WAW_HAMZA: return a_s_WAW_HAMZA;
401 case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW;
402 case a_YEH_HAMZA: return a_s_YEH_HAMZA;
403 case a_ALEF: return a_s_ALEF;
404 case a_TEH_MARBUTA: return a_s_TEH_MARBUTA;
405 case a_DAL: return a_s_DAL;
406 case a_THAL: return a_s_THAL;
407 case a_REH: return a_s_REH;
408 case a_ZAIN: return a_s_ZAIN;
409 case a_TATWEEL: return cur_c; // exceptions
410 case a_WAW: return a_s_WAW;
411 case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA;
412 case a_BEH: return a_s_BEH;
413 case a_TEH: return a_s_TEH;
414 case a_THEH: return a_s_THEH;
415 case a_JEEM: return a_s_JEEM;
416 case a_HAH: return a_s_HAH;
417 case a_KHAH: return a_s_KHAH;
418 case a_SEEN: return a_s_SEEN;
419 case a_SHEEN: return a_s_SHEEN;
420 case a_SAD: return a_s_SAD;
421 case a_DAD: return a_s_DAD;
422 case a_TAH: return a_s_TAH;
423 case a_ZAH: return a_s_ZAH;
424 case a_AIN: return a_s_AIN;
425 case a_GHAIN: return a_s_GHAIN;
426 case a_FEH: return a_s_FEH;
427 case a_QAF: return a_s_QAF;
428 case a_KAF: return a_s_KAF;
429 case a_LAM: return a_s_LAM;
430 case a_MEEM: return a_s_MEEM;
431 case a_NOON: return a_s_NOON;
432 case a_HEH: return a_s_HEH;
433 case a_YEH: return a_s_YEH;
434 }
435 return 0;
436}
437
438// Change shape - from ISO-8859-6/Isolated to Initial
439static int chg_c_a2i(int cur_c)
440{
441 switch (cur_c) {
442 case a_YEH_HAMZA: return a_i_YEH_HAMZA;
443 case a_HAMZA: return a_s_HAMZA; // exceptions
444 case a_ALEF_MADDA: return a_s_ALEF_MADDA; // exceptions
445 case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE; // exceptions
446 case a_WAW_HAMZA: return a_s_WAW_HAMZA; // exceptions
447 case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW; // exceptions
448 case a_ALEF: return a_s_ALEF; // exceptions
449 case a_TEH_MARBUTA: return a_s_TEH_MARBUTA; // exceptions
450 case a_DAL: return a_s_DAL; // exceptions
451 case a_THAL: return a_s_THAL; // exceptions
452 case a_REH: return a_s_REH; // exceptions
453 case a_ZAIN: return a_s_ZAIN; // exceptions
454 case a_TATWEEL: return cur_c; // exceptions
455 case a_WAW: return a_s_WAW; // exceptions
456 case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA; // exceptions
457 case a_BEH: return a_i_BEH;
458 case a_TEH: return a_i_TEH;
459 case a_THEH: return a_i_THEH;
460 case a_JEEM: return a_i_JEEM;
461 case a_HAH: return a_i_HAH;
462 case a_KHAH: return a_i_KHAH;
463 case a_SEEN: return a_i_SEEN;
464 case a_SHEEN: return a_i_SHEEN;
465 case a_SAD: return a_i_SAD;
466 case a_DAD: return a_i_DAD;
467 case a_TAH: return a_i_TAH;
468 case a_ZAH: return a_i_ZAH;
469 case a_AIN: return a_i_AIN;
470 case a_GHAIN: return a_i_GHAIN;
471 case a_FEH: return a_i_FEH;
472 case a_QAF: return a_i_QAF;
473 case a_KAF: return a_i_KAF;
474 case a_LAM: return a_i_LAM;
475 case a_MEEM: return a_i_MEEM;
476 case a_NOON: return a_i_NOON;
477 case a_HEH: return a_i_HEH;
478 case a_YEH: return a_i_YEH;
479 }
480 return 0;
481}
482
483// Change shape - from ISO-8859-6/Isolated to Medial
484static int chg_c_a2m(int cur_c)
485{
486 switch (cur_c) {
487 case a_HAMZA: return a_s_HAMZA; // exception
488 case a_ALEF_MADDA: return a_f_ALEF_MADDA; // exception
489 case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE; // exception
490 case a_WAW_HAMZA: return a_f_WAW_HAMZA; // exception
491 case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW; // exception
492 case a_YEH_HAMZA: return a_m_YEH_HAMZA;
493 case a_ALEF: return a_f_ALEF; // exception
494 case a_BEH: return a_m_BEH;
495 case a_TEH_MARBUTA: return a_f_TEH_MARBUTA; // exception
496 case a_TEH: return a_m_TEH;
497 case a_THEH: return a_m_THEH;
498 case a_JEEM: return a_m_JEEM;
499 case a_HAH: return a_m_HAH;
500 case a_KHAH: return a_m_KHAH;
501 case a_DAL: return a_f_DAL; // exception
502 case a_THAL: return a_f_THAL; // exception
503 case a_REH: return a_f_REH; // exception
504 case a_ZAIN: return a_f_ZAIN; // exception
505 case a_SEEN: return a_m_SEEN;
506 case a_SHEEN: return a_m_SHEEN;
507 case a_SAD: return a_m_SAD;
508 case a_DAD: return a_m_DAD;
509 case a_TAH: return a_m_TAH;
510 case a_ZAH: return a_m_ZAH;
511 case a_AIN: return a_m_AIN;
512 case a_GHAIN: return a_m_GHAIN;
513 case a_TATWEEL: return cur_c; // exception
514 case a_FEH: return a_m_FEH;
515 case a_QAF: return a_m_QAF;
516 case a_KAF: return a_m_KAF;
517 case a_LAM: return a_m_LAM;
518 case a_MEEM: return a_m_MEEM;
519 case a_NOON: return a_m_NOON;
520 case a_HEH: return a_m_HEH;
521 case a_WAW: return a_f_WAW; // exception
522 case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA; // exception
523 case a_YEH: return a_m_YEH;
524 }
525 return 0;
526}
527
528// Change shape - from ISO-8859-6/Isolated to final
529static int chg_c_a2f(int cur_c)
530{
531 // NOTE: these encodings need to be accounted for
532 //
533 // a_f_ALEF_MADDA;
534 // a_f_ALEF_HAMZA_ABOVE;
535 // a_f_ALEF_HAMZA_BELOW;
536 // a_f_LAM_ALEF_MADDA_ABOVE;
537 // a_f_LAM_ALEF_HAMZA_ABOVE;
538 // a_f_LAM_ALEF_HAMZA_BELOW;
539
540 switch (cur_c) {
541 case a_HAMZA: return a_s_HAMZA; // exception
542 case a_ALEF_MADDA: return a_f_ALEF_MADDA;
543 case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE;
544 case a_WAW_HAMZA: return a_f_WAW_HAMZA;
545 case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW;
546 case a_YEH_HAMZA: return a_f_YEH_HAMZA;
547 case a_ALEF: return a_f_ALEF;
548 case a_BEH: return a_f_BEH;
549 case a_TEH_MARBUTA: return a_f_TEH_MARBUTA;
550 case a_TEH: return a_f_TEH;
551 case a_THEH: return a_f_THEH;
552 case a_JEEM: return a_f_JEEM;
553 case a_HAH: return a_f_HAH;
554 case a_KHAH: return a_f_KHAH;
555 case a_DAL: return a_f_DAL;
556 case a_THAL: return a_f_THAL;
557 case a_REH: return a_f_REH;
558 case a_ZAIN: return a_f_ZAIN;
559 case a_SEEN: return a_f_SEEN;
560 case a_SHEEN: return a_f_SHEEN;
561 case a_SAD: return a_f_SAD;
562 case a_DAD: return a_f_DAD;
563 case a_TAH: return a_f_TAH;
564 case a_ZAH: return a_f_ZAH;
565 case a_AIN: return a_f_AIN;
566 case a_GHAIN: return a_f_GHAIN;
567 case a_TATWEEL: return cur_c; // exception
568 case a_FEH: return a_f_FEH;
569 case a_QAF: return a_f_QAF;
570 case a_KAF: return a_f_KAF;
571 case a_LAM: return a_f_LAM;
572 case a_MEEM: return a_f_MEEM;
573 case a_NOON: return a_f_NOON;
574 case a_HEH: return a_f_HEH;
575 case a_WAW: return a_f_WAW;
576 case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA;
577 case a_YEH: return a_f_YEH;
578 }
579 return 0;
580}
581
// Change shape - from Initial to Medial.
//
// Compiled out: this code is unreachable, because for the relevant
// characters ARABIC_CHAR() is FALSE.
#if 0
static int chg_c_i2m(int cur_c)
{
  // Indexed by the offset from a_i_YEH_HAMZA; slots without an initializer
  // are 0.
  static const int medial_form[a_i_YEH - a_i_YEH_HAMZA + 1] = {
    [a_i_YEH_HAMZA - a_i_YEH_HAMZA] = a_m_YEH_HAMZA,
    [a_i_BEH - a_i_YEH_HAMZA] = a_m_BEH,
    [a_i_TEH - a_i_YEH_HAMZA] = a_m_TEH,
    [a_i_THEH - a_i_YEH_HAMZA] = a_m_THEH,
    [a_i_JEEM - a_i_YEH_HAMZA] = a_m_JEEM,
    [a_i_HAH - a_i_YEH_HAMZA] = a_m_HAH,
    [a_i_KHAH - a_i_YEH_HAMZA] = a_m_KHAH,
    [a_i_SEEN - a_i_YEH_HAMZA] = a_m_SEEN,
    [a_i_SHEEN - a_i_YEH_HAMZA] = a_m_SHEEN,
    [a_i_SAD - a_i_YEH_HAMZA] = a_m_SAD,
    [a_i_DAD - a_i_YEH_HAMZA] = a_m_DAD,
    [a_i_TAH - a_i_YEH_HAMZA] = a_m_TAH,
    [a_i_ZAH - a_i_YEH_HAMZA] = a_m_ZAH,
    [a_i_AIN - a_i_YEH_HAMZA] = a_m_AIN,
    [a_i_GHAIN - a_i_YEH_HAMZA] = a_m_GHAIN,
    [a_i_FEH - a_i_YEH_HAMZA] = a_m_FEH,
    [a_i_QAF - a_i_YEH_HAMZA] = a_m_QAF,
    [a_i_KAF - a_i_YEH_HAMZA] = a_m_KAF,
    [a_i_LAM - a_i_YEH_HAMZA] = a_m_LAM,
    [a_i_MEEM - a_i_YEH_HAMZA] = a_m_MEEM,
    [a_i_NOON - a_i_YEH_HAMZA] = a_m_NOON,
    [a_i_HEH - a_i_YEH_HAMZA] = a_m_HEH,
    [a_i_YEH - a_i_YEH_HAMZA] = a_m_YEH,
  };

  if (cur_c < a_i_YEH_HAMZA || cur_c > a_i_YEH) {
    return 0;
  }
  return medial_form[cur_c - a_i_YEH_HAMZA];
}
#endif
616
617// Change shape - from Final to Medial
618static int chg_c_f2m(int cur_c)
619{
620 switch (cur_c) {
621 // NOTE: these encodings are multi-positional, no ?
622 // case a_f_ALEF_MADDA:
623 // case a_f_ALEF_HAMZA_ABOVE:
624 // case a_f_ALEF_HAMZA_BELOW:
625 case a_f_YEH_HAMZA: return a_m_YEH_HAMZA;
626 case a_f_WAW_HAMZA: // exceptions
627 case a_f_ALEF:
628 case a_f_TEH_MARBUTA:
629 case a_f_DAL:
630 case a_f_THAL:
631 case a_f_REH:
632 case a_f_ZAIN:
633 case a_f_WAW:
634 case a_f_ALEF_MAKSURA:
635 return cur_c;
636 case a_f_BEH: return a_m_BEH;
637 case a_f_TEH: return a_m_TEH;
638 case a_f_THEH: return a_m_THEH;
639 case a_f_JEEM: return a_m_JEEM;
640 case a_f_HAH: return a_m_HAH;
641 case a_f_KHAH: return a_m_KHAH;
642 case a_f_SEEN: return a_m_SEEN;
643 case a_f_SHEEN: return a_m_SHEEN;
644 case a_f_SAD: return a_m_SAD;
645 case a_f_DAD: return a_m_DAD;
646 case a_f_TAH: return a_m_TAH;
647 case a_f_ZAH: return a_m_ZAH;
648 case a_f_AIN: return a_m_AIN;
649 case a_f_GHAIN: return a_m_GHAIN;
650 case a_f_FEH: return a_m_FEH;
651 case a_f_QAF: return a_m_QAF;
652 case a_f_KAF: return a_m_KAF;
653 case a_f_LAM: return a_m_LAM;
654 case a_f_MEEM: return a_m_MEEM;
655 case a_f_NOON: return a_m_NOON;
656 case a_f_HEH: return a_m_HEH;
657 case a_f_YEH: return a_m_YEH;
658 // NOTE: these encodings are multi-positional, no ?
659 // case a_f_LAM_ALEF_MADDA_ABOVE:
660 // case a_f_LAM_ALEF_HAMZA_ABOVE:
661 // case a_f_LAM_ALEF_HAMZA_BELOW:
662 // case a_f_LAM_ALEF:
663 }
664 return 0;
665}
666
667// Change shape - from Combination (2 char) to an Isolated.
668static int chg_c_laa2i(int hid_c)
669{
670 switch (hid_c) {
671 case a_ALEF_MADDA: return a_s_LAM_ALEF_MADDA_ABOVE;
672 case a_ALEF_HAMZA_ABOVE: return a_s_LAM_ALEF_HAMZA_ABOVE;
673 case a_ALEF_HAMZA_BELOW: return a_s_LAM_ALEF_HAMZA_BELOW;
674 case a_ALEF: return a_s_LAM_ALEF;
675 }
676 return 0;
677}
678
679// Change shape - from Combination-Isolated to Final.
680static int chg_c_laa2f(int hid_c)
681{
682 switch (hid_c) {
683 case a_ALEF_MADDA: return a_f_LAM_ALEF_MADDA_ABOVE;
684 case a_ALEF_HAMZA_ABOVE: return a_f_LAM_ALEF_HAMZA_ABOVE;
685 case a_ALEF_HAMZA_BELOW: return a_f_LAM_ALEF_HAMZA_BELOW;
686 case a_ALEF: return a_f_LAM_ALEF;
687 }
688 return 0;
689}
690
// Do "half-shaping" on character "c": an ISO-8859-6 letter is converted with
// chg_c_a2i(), a valid final form with chg_c_f2m().
// Returns zero if no shaping was done.
static int half_shape(int c)
{
  int shaped = 0;

  if (A_is_a(c)) {
    shaped = chg_c_a2i(c);
  } else if (A_is_f(c) && A_is_valid(c)) {
    shaped = chg_c_f2m(c);
  }

  return shaped;
}
703
704// Do Arabic shaping on character "c". Returns the shaped character.
705// out: "ccp" points to the first byte of the character to be shaped.
706// in/out: "c1p" points to the first composing char for "c".
707// in: "prev_c" is the previous character (not shaped)
708// in: "prev_c1" is the first composing char for the previous char
709// (not shaped)
710// in: "next_c" is the next character (not shaped).
711int arabic_shape(int c, int *ccp, int *c1p, int prev_c, int prev_c1,
712 int next_c)
713{
714 // Deal only with Arabic character, pass back all others
715 if (!A_is_ok(c)) {
716 return c;
717 }
718
719 // half-shape current and previous character
720 int shape_c = half_shape(prev_c);
721
722 // Save away current character
723 int curr_c = c;
724
725 int curr_laa = A_firstc_laa(c, *c1p);
726 int prev_laa = A_firstc_laa(prev_c, prev_c1);
727
728 if (curr_laa) {
729 if (A_is_valid(prev_c) && !A_is_f(shape_c) && !A_is_s(shape_c)
730 && !prev_laa) {
731 curr_c = chg_c_laa2f(curr_laa);
732 } else {
733 curr_c = chg_c_laa2i(curr_laa);
734 }
735
736 // Remove the composing character
737 *c1p = 0;
738 } else if (!A_is_valid(prev_c) && A_is_valid(next_c)) {
739 curr_c = chg_c_a2i(c);
740 } else if (!shape_c || A_is_f(shape_c) || A_is_s(shape_c) || prev_laa) {
741 curr_c = A_is_valid(next_c) ? chg_c_a2i(c) : chg_c_a2s(c);
742 } else if (A_is_valid(next_c)) {
743#if 0
744 curr_c = A_is_iso(c) ? chg_c_a2m(c) : chg_c_i2m(c);
745#else
746 curr_c = A_is_iso(c) ? chg_c_a2m(c) : 0;
747#endif
748 } else if (A_is_valid(prev_c)) {
749 curr_c = chg_c_a2f(c);
750 } else {
751 curr_c = chg_c_a2s(c);
752 }
753
754 // Sanity check -- curr_c should, in the future, never be 0.
755 // We should, in the future, insert a fatal error here.
756 if (curr_c == NUL) {
757 curr_c = c;
758 }
759
760 if ((curr_c != c) && (ccp != NULL)) {
761 char_u buf[MB_MAXBYTES + 1];
762
763 // Update the first byte of the character
764 utf_char2bytes(curr_c, buf);
765 *ccp = buf[0];
766 }
767
768 // Return the shaped character
769 return curr_c;
770}
771
772/// Check whether we are dealing with Arabic combining characters.
773/// Note: these are NOT really composing characters!
774///
775/// @param one First character.
776/// @param two Character just after "one".
777bool arabic_combine(int one, int two)
778{
779 if (one == a_LAM) {
780 return arabic_maycombine(two);
781 }
782 return false;
783}
784
785/// Check whether we are dealing with a character that could be regarded as an
786/// Arabic combining character, need to check the character before this.
787bool arabic_maycombine(int two)
788{
789 if (p_arshape && !p_tbidi) {
790 return two == a_ALEF_MADDA
791 || two == a_ALEF_HAMZA_ABOVE
792 || two == a_ALEF_HAMZA_BELOW
793 || two == a_ALEF;
794 }
795 return false;
796}
797
798// A_firstc_laa returns first character of LAA combination if it ex.ists
799// in: "c" base character
800// in: "c1" first composing character
801static int A_firstc_laa(int c, int c1)
802{
803 if ((c1 != NUL) && (c == a_LAM) && !A_is_harakat(c1)) {
804 return c1;
805 }
806 return 0;
807}
808
809// A_is_harakat returns true if 'c' is an Arabic Harakat character.
810// (harakat/tanween)
811static bool A_is_harakat(int c)
812{
813 return c >= a_FATHATAN && c <= a_SUKUN;
814}
815
816// A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character.
817// (alphabet/number/punctuation)
818static bool A_is_iso(int c)
819{
820 return ((c >= a_HAMZA && c <= a_GHAIN)
821 || (c >= a_TATWEEL && c <= a_HAMZA_BELOW)
822 || c == a_MINI_ALEF);
823}
824
825// A_is_formb returns true if 'c' is an Arabic 10646-1 FormB character.
826// (alphabet/number/punctuation)
827static bool A_is_formb(int c)
828{
829 return ((c >= a_s_FATHATAN && c <= a_s_DAMMATAN)
830 || c == a_s_KASRATAN
831 || (c >= a_s_FATHA && c <= a_f_LAM_ALEF)
832 || c == a_BYTE_ORDER_MARK);
833}
834
// A_is_ok returns true if 'c' is an Arabic 10646 character, either in
// 8859-6 form (A_is_iso) or in Form-B (A_is_formb).
static bool A_is_ok(int c)
{
  if (A_is_iso(c)) {
    return true;
  }
  return A_is_formb(c);
}
840
// A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B)
// character, excluding the ones A_is_special() reports as special.
static bool A_is_valid(int c)
{
  if (!A_is_ok(c)) {
    return false;
  }
  return !A_is_special(c);
}
847
848// A_is_special returns true if 'c' is not a special Arabic character.
849// Specials don't adhere to most of the rules.
850static bool A_is_special(int c)
851{
852 return c == a_HAMZA || c == a_s_HAMZA;
853}
854