bidi.c source code [MuPDF/source/fitz/bidi.c]

1	/*
2	* Bidirectional text processing.
3	*
4	* Processes unicode text by arranging the characters into an order suitable
5	* for display. E.g. Hebrew text will be arranged from right-to-left and
6	* any English within the text will remain in the left-to-right order.
7	* Characters such as parentheses will be substituted for their mirrored
8	* equivalents if they are part of text which must be reversed.
9	*
10	* This is an implementation of the unicode Bidirectional Algorithm which
11	* can be found here: http://www.unicode.org/reports/tr9/ and is based
12	* on the reference implementation of the algorithm found on that page.
13	*
14	* For a nice overview of how it works, read this...
15	* http://www.w3.org/TR/REC-html40/struct/dirlang.html
16	*
17	* Extracted from the SmartOffice code, where it was modified by Ian
18	* Beveridge.
19	*
20	* Copyright (C) Picsel, 2004. All Rights Reserved.
21	*/
22
23	/*
24	* Original copyright notice from unicode reference implementation.
25	* ----------------------------------------------------------------
26	* Written by: Asmus Freytag
27	* C++ and Windows dependencies removed, and
28	* command line interface added by: Rick McGowan
29	*
30	* Copyright (C) 1999, ASMUS, Inc. All Rights Reserved
31	*/
32
33	/*
34	* Includes...
35	*/
36
37	#include "mupdf/fitz.h"
38	#include "mupdf/ucdn.h"
39	#include "bidi-imp.h" /* standard bidi code interface */
40	#include <assert.h>
41
42	/*
43	* Macros...
44	*/
45
/* Non-zero when x is odd. In this file an odd embedding level denotes
 * right-to-left text and an even one left-to-right text. */
#define ODD(x) ((x) & 1)

/* True when t is one of the bidi types ES, ET, CS, NSM, PDF, BN, S, WS
 * or N. The argument is evaluated several times, so it must not have
 * side effects. */
#define REPLACEABLE_TYPE(t) ( \
	((t)==BDI_ES) || ((t)==BDI_ET) || ((t)==BDI_CS) || \
	((t)==BDI_NSM) || ((t)==BDI_PDF) || ((t)==BDI_BN) || \
	((t)==BDI_S) || ((t)==BDI_WS) || ((t)==BDI_N) )
52
/* Verbose tracing. The argument is a complete parenthesised fz_warn
 * argument list, e.g. DBUGVF((ctx, "fmt", value)). It is only forwarded
 * when DEBUG_BIDI_VERBOSE is defined; otherwise the macro expands to an
 * empty statement and the arguments are never evaluated. */
#if defined(DEBUG_BIDI_VERBOSE)
#define DBUGVF(params) do { fz_warn params; } while (0)
#else
#define DBUGVF(params) do {} while (0)
#endif

/* Outline tracing. Same contract as DBUGVF, but switched on by
 * DEBUG_BIDI_OUTLINE. */
#if defined(DEBUG_BIDI_OUTLINE)
#define DBUGH(params) do { fz_warn params; } while (0)
#else
#define DBUGH(params) do {} while (0)
#endif
64
/* Named Unicode code points, listed in ascending code point order. */
#define UNICODE_EOS                              0
#define UNICODE_DIGIT_ZERO                       0x0030
#define UNICODE_DIGIT_NINE                       0x0039
#define UNICODE_SUPERSCRIPT_TWO                  0x00B2
#define UNICODE_SUPERSCRIPT_THREE                0x00B3
#define UNICODE_SUPERSCRIPT_ONE                  0x00B9
#define UNICODE_RTL_START                        0x0590
#define UNICODE_RTL_END                          0x07BF
#define UNICODE_ARABIC_INDIC_DIGIT_ZERO          0x0660
#define UNICODE_ARABIC_INDIC_DIGIT_NINE          0x0669
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_ZERO 0x06F0
#define UNICODE_EXTENDED_ARABIC_INDIC_DIGIT_NINE 0x06F9
#define UNICODE_ZERO_WIDTH_NON_JOINER            0x200C
#define UNICODE_SUPERSCRIPT_ZERO                 0x2070
#define UNICODE_SUPERSCRIPT_FOUR                 0x2074
#define UNICODE_SUPERSCRIPT_NINE                 0x2079
#define UNICODE_SUBSCRIPT_ZERO                   0x2080
#define UNICODE_SUBSCRIPT_NINE                   0x2089
#define UNICODE_CIRCLED_DIGIT_ONE                0x2460
#define UNICODE_NUMBER_TWENTY_FULL_STOP          0x249B
#define UNICODE_CIRCLED_DIGIT_ZERO               0x24EA
#define UNICODE_FULLWIDTH_DIGIT_ZERO             0xFF10
#define UNICODE_FULLWIDTH_DIGIT_NINE             0xFF19

/* Boolean constants, defined only when an earlier header has not
 * already supplied them. */
#if !defined(TRUE)
#define TRUE (1)
#endif
#if !defined(FALSE)
#define FALSE (0)
#endif
95
96	/*
97	* Enumerations...
98	*/
99
#ifdef DEBUG_BIDI_VERBOSE
/* Display support: one printable character per bidi character type,
 * indexed by the type value. classify_characters uses it to print the
 * "Types:" trace line. The label on each entry names the type that the
 * entry's position stands for. */
static const char char_from_types[] =
{
	' ', /* ON */
	'>', /* L */
	'<', /* R */
	'9', /* AN */
	'1', /* EN */
	'a', /* AL */
	'@', /* NSM */
	'.', /* CS */
	',', /* ES */
	'$', /* ET */
	':', /* BN */
	'X', /* S */
	'_', /* WS */
	'B', /* B */
	'+', /* RLO */
	'+', /* RLE */
	'+', /* LRO */
	'+', /* LRE */
	'-', /* PDF */
	'=' /* LS */
};
#endif
126
127	/*
128	* Functions and static functions...
129	*/
130
131	/ UCDN uses a different ordering than Bidi does. We cannot*
132	* change to the UCDN ordering, as the bidi-std.c code relies
133	* on the exact ordering (at least that N = ON = 0). We
134	* therefore map between the two using this small table. It
135	* also takes care of fudging LRI, RLI, FSI and PDI, that this
136	* code does not currently support. */
137	static const uint8_t ucdn_to_bidi[] =
138	{
139	BDI_L, / UCDN_BIDI_CLASS_L = 0 /
140	BDI_LRE, / UCDN_BIDI_CLASS_LRE = 1 /
141	BDI_LRO, / UCDN_BIDI_CLASS_LRO = 2 /
142	BDI_R, / UCDN_BIDI_CLASS_R = 3 /
143	BDI_AL, / UCDN_BIDI_CLASS_AL = 4 /
144	BDI_RLE, / UCDN_BIDI_CLASS_RLE = 5 /
145	BDI_RLO, / UCDN_BIDI_CLASS_RLO = 6 /
146	BDI_PDF, / UCDN_BIDI_CLASS_PDF = 7 /
147	BDI_EN, / UCDN_BIDI_CLASS_EN = 8 /
148	BDI_ES, / UCDN_BIDI_CLASS_ES = 9 /
149	BDI_ET, / UCDN_BIDI_CLASS_ET = 10 /
150	BDI_AN, / UCDN_BIDI_CLASS_AN = 11 /
151	BDI_CS, / UCDN_BIDI_CLASS_CS = 12 /
152	BDI_NSM, / UCDN_BIDI_CLASS_NSM = 13 /
153	BDI_BN, / UCDN_BIDI_CLASS_BN = 14 /
154	BDI_B, / UCDN_BIDI_CLASS_B = 15 /
155	BDI_S, / UCDN_BIDI_CLASS_S = 16 /
156	BDI_WS, / UCDN_BIDI_CLASS_WS = 17 /
157	BDI_ON, / UCDN_BIDI_CLASS_ON = 18 /
158	BDI_LRE, / UCDN_BIDI_CLASS_LRI = 19 /
159	BDI_RLE, / UCDN_BIDI_CLASS_RLI = 20 /
160	BDI_N, / UCDN_BIDI_CLASS_FSI = 21 /
161	BDI_N, / UCDN_BIDI_CLASS_PDI = 22 /
162	};
163
164	#define class_from_ch_ws(ch) (ucdn_to_bidi[ucdn_get_bidi_class(ch)])
165
166	/ Return a direction for white-space on the second pass of the algorithm. /
167	static fz_bidi_chartype class_from_ch_n(uint32_t ch)
168	{
169	fz_bidi_chartype from_ch_ws = class_from_ch_ws(ch);
170	if (from_ch_ws == BDI_S \|\| from_ch_ws == BDI_WS)
171	return BDI_N;
172	return from_ch_ws;
173	}
174
175	/ Split fragments into single scripts (or punctuation + single script) /
176	static void
177	split_at_script(const uint32_t *fragment,
178	size_t fragment_len,
179	int level,
180	void *arg,
181	fz_bidi_fragment_fn *callback)
182	{
183	int script = UCDN_SCRIPT_COMMON;
184	size_t script_start, i;
185
186	script_start = `0`;
187	for (i = `0`; i < fragment_len; i++)
188	{
189	int s = ucdn_get_script(fragment[i]);
190	if (s == UCDN_SCRIPT_COMMON \|\| s == UCDN_SCRIPT_INHERITED)
191	{
192	/ Punctuation etc. This is fine. /
193	}
194	else if (s == script)
195	{
196	/ Same script. Still fine. /
197	}
198	else if (script == UCDN_SCRIPT_COMMON \|\| script == UCDN_SCRIPT_INHERITED)
199	{
200	/ First non punctuation thing. Set the script. /
201	script = s;
202	}
203	else
204	{
205	/ Change of script. Break the fragment. /
206	(*callback)(&fragment[script_start], i - script_start, level, script, arg);
207	script_start = i;
208	script = s;
209	}
210	}
211	if (script_start != fragment_len)
212	{
213	(*callback)(&fragment[script_start], fragment_len - script_start, level, script, arg);
214	}
215	}
216
217	/ Determines the character classes for all following*
218	* passes of the algorithm. A character class is basically the type of Bidi
219	* behaviour that the character exhibits.
220	*/
221	static void
222	classify_characters(const uint32_t *text,
223	fz_bidi_chartype *types,
224	size_t len,
225	fz_bidi_flags flags)
226	{
227	size_t i;
228
229	if ((flags & FZ_BIDI_CLASSIFY_WHITE_SPACE)!=`0`)
230	{
231	for (i = `0`; i < len; i++)
232	{
233	types[i] = class_from_ch_ws(text[i]);
234	}
235	}
236	else
237	{
238	#ifdef DEBUG_BIDI_VERBOSE
239	fprintf(stderr, "Text: ");
240	for (i = `0`; i < len; i++)
241	{
242	/ So that we can actually sort of read the debug string, any*
243	* non-ascii characters are replaced with a 1-digit hash
244	* value from 0-9, making non-english characters appear
245	* as numbers
246	*/
247	fprintf(stderr, "%c", (text[i] <= `127` && text[i] >= `32`) ?
248	text[i] : text[i] % `9` + `'0'`);
249	}
250	fprintf(stderr, "\nTypes: ");
251	#endif
252	for (i = `0`; i < len; i++)
253	{
254	types[i] = class_from_ch_n(text[i]);
255	#ifdef DEBUG_BIDI_VERBOSE
256	fprintf(stderr, "%c", char_from_types[(int)types[i]]);
257	#endif
258	}
259	#ifdef DEBUG_BIDI_VERBOSE
260	fprintf(stderr, "\n");
261	#endif
262	}
263	}
264
265	/ Determines the base level of the text.*
266	* Implements rule P2 of the Unicode Bidi Algorithm.
267	* Note: Ignores explicit embeddings
268	*/
269	static fz_bidi_level base_level_from_text(fz_bidi_chartype *types, size_t len)
270	{
271	size_t i;
272
273	for (i = `0`; i < len; i++)
274	{
275	switch (types[i])
276	{
277	/ strong left /
278	case BDI_L:
279	return FZ_BIDI_LTR;
280
281	/ strong right /
282	case BDI_R:
283	case BDI_AL:
284	return FZ_BIDI_RTL;
285	}
286	}
287	return FZ_BIDI_LTR;
288	}
289
290	static fz_bidi_direction direction_from_type(fz_bidi_chartype type)
291	{
292	switch (type)
293	{
294	case BDI_L:
295	case BDI_EN:
296	return FZ_BIDI_LTR;
297
298	case BDI_R:
299	case BDI_AL:
300	return FZ_BIDI_RTL;
301
302	default:
303	return FZ_BIDI_NEUTRAL;
304	}
305	}
306
307	static void
308	classify_quoted_blocks(const uint32_t *text,
309	fz_bidi_chartype *types,
310	size_t len)
311	{
312	size_t i;
313	int inQuote = FALSE;
314	int pdfNeeded = FALSE;
315	int ltrFound = FALSE;
316	int rtlFound = FALSE;
317
318	/ Only do anything special here if there is mixed content*
319	* (LTR and RTL) in the text.
320	*/
321	for (i = `0`; i < len; i++)
322	{
323	switch (direction_from_type(types[i]))
324	{
325	case FZ_BIDI_LTR:
326	ltrFound = TRUE;
327	break;
328
329	case FZ_BIDI_RTL:
330	rtlFound = TRUE;
331	break;
332
333	default:
334	break;
335	}
336	}
337
338	/ Only make any changes if both LTR and RTL characters exist*
339	* in this text.
340	*/
341	if (!ltrFound \|\| !rtlFound)
342	{
343	return;
344	}
345
346	for (i = `0`; i < len; i++)
347	{
348	if (text[i]==`'"'`)
349	{
350	/ If we're already in a quote then terminate it,*
351	* else start a new block.
352	*/
353	if (inQuote)
354	{
355	inQuote = FALSE;
356	if (pdfNeeded)
357	{
358	pdfNeeded = FALSE;
359	types[i] = BDI_PDF;
360	}
361	}
362	else
363	{
364	size_t j;
365	int done = FALSE;
366
367	inQuote = TRUE;
368
369	/ Find the first strong right or left type and*
370	* use that to determine whether we should classify
371	* the quote as LRE or RLE. Or neither, if we
372	* hit another quote before any strongly-directional
373	* character.
374	*/
375	for (j = i + `1`; !done && (j < len) && text[j] != `'"'`; ++j)
376	{
377	switch(types[j])
378	{
379	case BDI_RLE:
380	case BDI_LRE:
381	done = TRUE;
382	break;
383
384	case BDI_L:
385	case BDI_EN:
386	types[i] = BDI_LRE;
387	pdfNeeded = TRUE;
388	done = TRUE;
389	break;
390
391	case BDI_R:
392	case BDI_AL:
393	types[i] = BDI_RLE;
394	pdfNeeded = TRUE;
395	done = TRUE;
396	break;
397
398	default:
399	break;
400	}
401	}
402	}
403	}
404	}
405	}
406
407	/ Creates a buffer with an embedding level for every character in the*
408	* given text. Also determines the base level and returns it in
409	* baseDir if baseDir does not initially contain a valid direction.
410	*/
411	static fz_bidi_level *
412	create_levels(fz_context *ctx,
413	const uint32_t *text,
414	size_t len,
415	fz_bidi_direction *baseDir,
416	int resolveWhiteSpace,
417	int flags)
418	{
419	fz_bidi_level levels, plevels;
420	fz_bidi_chartype *types = NULL;
421	fz_bidi_chartype *ptypes;
422	fz_bidi_level baseLevel;
423	const uint32_t *ptext;
424	size_t plen, remaining;
425
426	levels = fz_malloc(ctx, len * sizeof(*levels));
427
428	fz_var(types);
429
430	fz_try(ctx)
431	{
432	types = fz_malloc(ctx, len * sizeof(fz_bidi_chartype));
433
434	classify_characters(text, types, len, flags);
435
436	if (baseDir != FZ_BIDI_LTR && baseDir != FZ_BIDI_RTL)
437	{
438	/ Derive the base level from the text and*
439	* update *baseDir in case the caller wants to know.
440	*/
441	baseLevel = base_level_from_text(types, len);
442	*baseDir = ODD(baseLevel)==`1` ? FZ_BIDI_RTL : FZ_BIDI_LTR;
443	}
444	else
445	{
446	baseLevel = (fz_bidi_level)*baseDir;
447	}
448
449	{
450	/ Replace tab with base direction, i.e. make tab appear as*
451	* 'strong left' if the base direction is left-to-right and
452	* 'strong right' if base direction is right-to-left. This
453	* allows Layout to implicitly treat tabs as 'segment separators'.
454	*/
455	size_t i;
456
457	for (i = `0u`; i < len; i++)
458	{
459	if (text[i]==`'\t'`)
460	{
461	types[i] = (*baseDir == FZ_BIDI_RTL) ? BDI_R : BDI_L;
462	}
463	}
464	}
465
466	/ Look for quotation marks. Classify them as RLE or LRE*
467	* or leave them alone, depending on what follows them.
468	*/
469	classify_quoted_blocks(text, types, len);
470
471	/ Work one paragraph at a time. /
472	plevels = levels;
473	ptypes = types;
474	ptext = text;
475	remaining = len;
476	while (remaining)
477	{
478	plen = fz_bidi_resolve_paragraphs(ptypes, remaining);
479
480	/ Work out the levels and character types... /
481	(void)fz_bidi_resolve_explicit(baseLevel, BDI_N, ptypes, plevels, plen, `0`);
482	fz_bidi_resolve_weak(ctx, baseLevel, ptypes, plevels, plen);
483	fz_bidi_resolve_neutrals(baseLevel, ptypes, plevels, plen);
484	fz_bidi_resolve_implicit(ptypes, plevels, plen);
485
486	classify_characters(ptext, ptypes, plen, FZ_BIDI_CLASSIFY_WHITE_SPACE);
487
488	if (resolveWhiteSpace)
489	{
490	/ resolve whitespace /
491	fz_bidi_resolve_whitespace(baseLevel, ptypes, plevels, plen);
492	}
493
494	plevels += plen;
495	ptypes += plen;
496	ptext += plen;
497	remaining -= plen;
498	}
499
500	/ The levels buffer now has odd and even numbers indicating*
501	* rtl or ltr characters, respectively.
502	*/
503	#ifdef DEBUG_BIDI_VERBOSE
504	fprintf(stderr, "Levels: ");
505	{
506	size_t i;
507	for (i = `0`; i < len; i++)
508	{
509	fprintf(stderr, "%d", levels[i]>`9`?`0`:levels[i]);
510	}
511	fprintf(stderr, "\n");
512	}
513	#endif
514	}
515	fz_always(ctx)
516	{
517	fz_free(ctx, types);
518	}
519	fz_catch(ctx)
520	{
521	fz_free(ctx, levels);
522	fz_rethrow(ctx);
523	}
524	return levels;
525	}
526
527	/ Partitions the given character sequence into one or more unidirectional*
528	* fragments and invokes the given callback function for each fragment.
529	*/
530	void fz_bidi_fragment_text(fz_context *ctx,
531	const uint32_t *text,
532	size_t textlen,
533	fz_bidi_direction *baseDir,
534	fz_bidi_fragment_fn *callback,
535	void *arg,
536	int flags)
537	{
538	size_t startOfFragment;
539	size_t i;
540	fz_bidi_level *levels;
541
542	if (text == NULL \|\| callback == NULL \|\| textlen == `0`)
543	return;
544
545	DBUGH(("fz_bidi_fragment_text('%S', len = %d)\n", text, textlen));
546
547	levels = create_levels(ctx, text, textlen, baseDir, FALSE, flags);
548
549	/ We now have an array with an embedding level*
550	* for each character in text.
551	*/
552	assert(levels != NULL);
553
554	fz_try(ctx)
555	{
556	startOfFragment = `0`;
557	for (i = `1`; i < textlen; i++)
558	{
559	if (levels[i] != levels[i-`1`])
560	{
561	/ We've gone past the end of the fragment.*
562	* Create a text object for it, then start
563	* a new fragment.
564	*/
565	split_at_script(&text[startOfFragment],
566	i - startOfFragment,
567	levels[startOfFragment],
568	arg,
569	callback);
570	startOfFragment = i;
571	}
572	}
573	/ Now i == textlen. Deal with the final (or maybe only) fragment. /
574	/ otherwise create 1 fragment /
575	split_at_script(&text[startOfFragment],
576	i - startOfFragment,
577	levels[startOfFragment],
578	arg,
579	callback);
580	}
581	fz_always(ctx)
582	{
583	fz_free(ctx, levels);
584	}
585	fz_catch(ctx)
586	{
587	fz_rethrow(ctx);
588	}
589	}
590

Browse the source code of MuPDF/source/fitz/bidi.c