pdf-lex.c source code [MuPDF/source/pdf/pdf-lex.c]

1	#include "mupdf/fitz.h"
2	#include "mupdf/pdf.h"
3
4	#include <string.h>
5
/*
 * Character-class helpers meant to be used as switch labels.
 * Each macro expands to a run of case labels with the leading "case"
 * and the trailing ":" left off, so it is written "case IS_WHITE:".
 */
#define IS_NUMBER \
	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
	case'4':case'5':case'6':case'7':case'8':case'9'
#define IS_WHITE \
	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
#define IS_HEX \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
	case'7':case'8':case'9':case'A':case'B':case'C':\
	case'D':case'E':case'F':case'a':case'b':case'c':\
	case'd':case'e':case'f'
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'

#define RANGE_0_9 \
	'0':case'1':case'2':case'3':case'4':case'5':\
	case'6':case'7':case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
29
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
/*
 * Debug variant of lex_byte: read one byte from stm and echo it to the
 * stdout output returned by fz_stdout. Bytes in 32..127 are echoed
 * verbatim, EOF as "<EOF>", anything else as "<xx>" in hex.
 * Returns the byte read (or EOF).
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int c = fz_read_byte(ctx, stm);

	if (c == EOF)
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
	else if (c >= 32 && c < 128)
		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
	return c;
}
#else
/* Normal build: lex_byte is a plain alias for fz_read_byte. */
#define lex_byte(C,S) fz_read_byte(C,S)
#endif
47
/*
 * Return non-zero if ch is one of the six bytes this lexer treats as
 * whitespace: NUL, TAB, LF, FF, CR or SPACE.
 */
static inline int iswhite(int ch)
{
	return
		ch == '\000' ||
		ch == '\011' ||
		ch == '\012' ||
		ch == '\014' ||
		ch == '\015' ||
		ch == '\040';
}
58
/* Return non-zero if ch lies in the range ' ' through '~' inclusive. */
static inline int fz_isprint(int ch)
{
	return ch >= ' ' && ch <= '~';
}
63
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
	return 0;
}
71
72	static void
73	lex_white(fz_context ctx, fz_stream f)
74	{
75	int c;
76	do {
77	c = lex_byte(ctx, f);
78	} while ((c <= `32`) && (iswhite(c)));
79	if (c != EOF)
80	fz_unread_byte(ctx, f);
81	}
82
83	static void
84	lex_comment(fz_context ctx, fz_stream f)
85	{
86	int c;
87	do {
88	c = lex_byte(ctx, f);
89	} while ((c != `'\012'`) && (c != `'\015'`) && (c != EOF));
90	}
91
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepts any number of leading '-' signs (one or more makes the result
 * negative), then any number of '+' signs, then decimal digits, then an
 * optional '.' followed by more digits. Parsing stops at the first
 * character that does not fit; the rest of the string is ignored.
 *
 * s: NUL-terminated text to parse.
 * Returns the parsed value as a float.
 */
static float acrobat_compatible_atof(const char *s)
{
	int neg = 0;
	/* Accumulate in unsigned so that overflow wraps instead of being
	 * undefined behaviour, as it would be for a signed int. */
	unsigned int u = 0;
	int i;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here.
		 * Tests show that Acrobat handles overflows in exactly the same way we do:
		 * 123450000000000000000678 is read as 678.
		 */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}
	i = (int)u;

	if (*s == '.')
	{
		float v = (float)i;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}
	else
	{
		/* Negate in unsigned arithmetic so the most negative int is safe. */
		return (float)(neg ? (int)(0u - u) : i);
	}
}
138
/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of leading '-' signs (one or more makes the result
 * negative), then any number of '+' signs, then decimal digits. Parsing
 * stops at the first non-digit. Values too large for an int wrap around.
 *
 * s: NUL-terminated text to parse.
 * Returns the parsed integer.
 */
static int fast_atoi(const char *s)
{
	int neg = 0;
	/* Unsigned accumulator: wrap-around is well defined, whereas signed
	 * overflow would be undefined behaviour. */
	unsigned int u = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Negate in unsigned arithmetic so the most negative int is safe. */
	return (int)(neg ? 0u - u : u);
}
164
165	static int
166	lex_number(fz_context ctx, fz_stream f, pdf_lexbuf buf, int* c)
167	{
168	char *s = buf->scratch;
169	char e = buf->scratch + buf->size - `1`; /* leave space for zero terminator /
170	char *isreal = (c == `'.'` ? s : NULL);
171	int neg = (c == `'-'`);
172	int isbad = `0`;
173
174	*s++ = c;
175
176	c = lex_byte(ctx, f);
177
178	/ skip extra '-' signs at start of number /
179	if (neg)
180	{
181	while (c == `'-'`)
182	c = lex_byte(ctx, f);
183	}
184
185	while (s < e)
186	{
187	switch (c)
188	{
189	case IS_WHITE:
190	case IS_DELIM:
191	fz_unread_byte(ctx, f);
192	goto end;
193	case EOF:
194	goto end;
195	case `'.'`:
196	if (isreal)
197	isbad = `1`;
198	isreal = s;
199	*s++ = c;
200	break;
201	case RANGE_0_9:
202	*s++ = c;
203	break;
204	default:
205	isbad = `1`;
206	*s++ = c;
207	break;
208	}
209	c = lex_byte(ctx, f);
210	}
211
212	end:
213	*s = `'\0'`;
214	if (isbad)
215	return PDF_TOK_ERROR;
216	if (isreal)
217	{
218	/ We'd like to use the fastest possible atof*
219	* routine, but we'd rather match acrobats
220	* handling of broken numbers. As such, we
221	* spot common broken cases and call an
222	* acrobat compatible routine where required. */
223	if (neg > `1` \|\| isreal - buf->scratch >= `10`)
224	buf->f = acrobat_compatible_atof(buf->scratch);
225	else
226	buf->f = fz_atof(buf->scratch);
227	return PDF_TOK_REAL;
228	}
229	else
230	{
231	buf->i = fast_atoi(buf->scratch);
232	return PDF_TOK_INT;
233	}
234	}
235
236	static void
237	lex_name(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
238	{
239	char *s = lb->scratch;
240	char *e = s + fz_mini(`127`, lb->size);
241	int c;
242
243	while (`1`)
244	{
245	if (s == e)
246	{
247	if (e - lb->scratch < `127`)
248	{
249	s += pdf_lexbuf_grow(ctx, lb);
250	e = lb->scratch + fz_mini(`127`, lb->size);
251	}
252	else
253	{
254	/ truncate names that are too long /
255	fz_warn(ctx, "name is too long");
256	*s = `0`;
257	lb->len = s - lb->scratch;
258	s = NULL;
259	}
260	}
261	c = lex_byte(ctx, f);
262	switch (c)
263	{
264	case IS_WHITE:
265	case IS_DELIM:
266	fz_unread_byte(ctx, f);
267	goto end;
268	case EOF:
269	goto end;
270	case `'#'`:
271	{
272	int hex[`2`];
273	int i;
274	for (i = `0`; i < `2`; i++)
275	{
276	c = fz_peek_byte(ctx, f);
277	switch (c)
278	{
279	case RANGE_0_9:
280	if (i == `1` && c == `'0'` && hex[`0`] == `0`)
281	goto illegal;
282	hex[i] = lex_byte(ctx, f) - `'0'`;
283	break;
284	case RANGE_a_f:
285	hex[i] = lex_byte(ctx, f) - `'a'` + `10`;
286	break;
287	case RANGE_A_F:
288	hex[i] = lex_byte(ctx, f) - `'A'` + `10`;
289	break;
290	default:
291	case EOF:
292	goto illegal;
293	}
294	}
295	if (s) *s++ = (hex[`0`] << `4`) + hex[`1`];
296	break;
297	illegal:
298	if (i == `1`)
299	fz_unread_byte(ctx, f);
300	if (s) *s++ = `'#'`;
301	continue;
302	}
303	default:
304	if (s) *s++ = c;
305	break;
306	}
307	}
308	end:
309	if (s)
310	{
311	*s = `'\0'`;
312	lb->len = s - lb->scratch;
313	}
314	}
315
316	static int
317	lex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
318	{
319	char *s = lb->scratch;
320	char *e = s + lb->size;
321	int bal = `1`;
322	int oct;
323	int c;
324
325	while (`1`)
326	{
327	if (s == e)
328	{
329	s += pdf_lexbuf_grow(ctx, lb);
330	e = lb->scratch + lb->size;
331	}
332	c = lex_byte(ctx, f);
333	switch (c)
334	{
335	case EOF:
336	return PDF_TOK_ERROR;
337	case `'('`:
338	bal++;
339	*s++ = c;
340	break;
341	case `')'`:
342	bal --;
343	if (bal == `0`)
344	goto end;
345	*s++ = c;
346	break;
347	case `'\\'`:
348	c = lex_byte(ctx, f);
349	switch (c)
350	{
351	case EOF:
352	return PDF_TOK_ERROR;
353	case `'n'`:
354	*s++ = `'\n'`;
355	break;
356	case `'r'`:
357	*s++ = `'\r'`;
358	break;
359	case `'t'`:
360	*s++ = `'\t'`;
361	break;
362	case `'b'`:
363	*s++ = `'\b'`;
364	break;
365	case `'f'`:
366	*s++ = `'\f'`;
367	break;
368	case `'('`:
369	*s++ = `'('`;
370	break;
371	case `')'`:
372	*s++ = `')'`;
373	break;
374	case `'\\'`:
375	*s++ = `'\\'`;
376	break;
377	case RANGE_0_7:
378	oct = c - `'0'`;
379	c = lex_byte(ctx, f);
380	if (c >= `'0'` && c <= `'7'`)
381	{
382	oct = oct * `8` + (c - `'0'`);
383	c = lex_byte(ctx, f);
384	if (c >= `'0'` && c <= `'7'`)
385	oct = oct * `8` + (c - `'0'`);
386	else if (c != EOF)
387	fz_unread_byte(ctx, f);
388	}
389	else if (c != EOF)
390	fz_unread_byte(ctx, f);
391	*s++ = oct;
392	break;
393	case `'\n'`:
394	break;
395	case `'\r'`:
396	c = lex_byte(ctx, f);
397	if ((c != `'\n'`) && (c != EOF))
398	fz_unread_byte(ctx, f);
399	break;
400	default:
401	*s++ = c;
402	}
403	break;
404	default:
405	*s++ = c;
406	break;
407	}
408	}
409	end:
410	lb->len = s - lb->scratch;
411	return PDF_TOK_STRING;
412	}
413
414	static int
415	lex_hex_string(fz_context ctx, fz_stream f, pdf_lexbuf *lb)
416	{
417	char *s = lb->scratch;
418	char *e = s + lb->size;
419	int a = `0`, x = `0`;
420	int c;
421
422	while (`1`)
423	{
424	if (s == e)
425	{
426	s += pdf_lexbuf_grow(ctx, lb);
427	e = lb->scratch + lb->size;
428	}
429	c = lex_byte(ctx, f);
430	switch (c)
431	{
432	case IS_WHITE:
433	break;
434	default:
435	fz_warn(ctx, "invalid character in hex string");
436	/ fall through /
437	case IS_HEX:
438	if (x)
439	{
440	s++ = a `16` + unhex(c);
441	x = !x;
442	}
443	else
444	{
445	a = unhex(c);
446	x = !x;
447	}
448	break;
449	case `'>'`:
450	if (x)
451	{
452	s++ = a `16`; / pad truncated string with '0' /
453	}
454	goto end;
455	case EOF:
456	return PDF_TOK_ERROR;
457	}
458	}
459	end:
460	lb->len = s - lb->scratch;
461	return PDF_TOK_STRING;
462	}
463
464	static pdf_token
465	pdf_token_from_keyword(char *key)
466	{
467	switch (*key)
468	{
469	case `'R'`:
470	if (!strcmp(key, "R")) return PDF_TOK_R;
471	break;
472	case `'t'`:
473	if (!strcmp(key, "true")) return PDF_TOK_TRUE;
474	if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
475	break;
476	case `'f'`:
477	if (!strcmp(key, "false")) return PDF_TOK_FALSE;
478	break;
479	case `'n'`:
480	if (!strcmp(key, "null")) return PDF_TOK_NULL;
481	break;
482	case `'o'`:
483	if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
484	break;
485	case `'e'`:
486	if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
487	if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
488	break;
489	case `'s'`:
490	if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
491	if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
492	break;
493	case `'x'`:
494	if (!strcmp(key, "xref")) return PDF_TOK_XREF;
495	break;
496	}
497
498	while (*key)
499	{
500	if (!fz_isprint(*key))
501	return PDF_TOK_ERROR;
502	++key;
503	}
504
505	return PDF_TOK_KEYWORD;
506	}
507
508	void pdf_lexbuf_init(fz_context ctx, pdf_lexbuf lb, int size)
509	{
510	lb->size = lb->base_size = size;
511	lb->len = `0`;
512	lb->scratch = &lb->buffer[`0`];
513	}
514
515	void pdf_lexbuf_fin(fz_context ctx, pdf_lexbuf lb)
516	{
517	if (lb && lb->size != lb->base_size)
518	fz_free(ctx, lb->scratch);
519	}
520
521	ptrdiff_t pdf_lexbuf_grow(fz_context ctx, pdf_lexbuf lb)
522	{
523	char *old = lb->scratch;
524	int newsize = lb->size * `2`;
525	if (lb->size == lb->base_size)
526	{
527	lb->scratch = fz_malloc(ctx, newsize);
528	memcpy(lb->scratch, lb->buffer, lb->size);
529	}
530	else
531	{
532	lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
533	}
534	lb->size = newsize;
535	return lb->scratch - old;
536	}
537
538	pdf_token
539	pdf_lex(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
540	{
541	while (`1`)
542	{
543	int c = lex_byte(ctx, f);
544	switch (c)
545	{
546	case EOF:
547	return PDF_TOK_EOF;
548	case IS_WHITE:
549	lex_white(ctx, f);
550	break;
551	case `'%'`:
552	lex_comment(ctx, f);
553	break;
554	case `'/'`:
555	lex_name(ctx, f, buf);
556	return PDF_TOK_NAME;
557	case `'('`:
558	return lex_string(ctx, f, buf);
559	case `')'`:
560	return PDF_TOK_ERROR;
561	case `'<'`:
562	c = lex_byte(ctx, f);
563	if (c == `'<'`)
564	return PDF_TOK_OPEN_DICT;
565	if (c != EOF)
566	fz_unread_byte(ctx, f);
567	return lex_hex_string(ctx, f, buf);
568	case `'>'`:
569	c = lex_byte(ctx, f);
570	if (c == `'>'`)
571	return PDF_TOK_CLOSE_DICT;
572	if (c != EOF)
573	fz_unread_byte(ctx, f);
574	return PDF_TOK_ERROR;
575	case `'['`:
576	return PDF_TOK_OPEN_ARRAY;
577	case `']'`:
578	return PDF_TOK_CLOSE_ARRAY;
579	case `'{'`:
580	return PDF_TOK_OPEN_BRACE;
581	case `'}'`:
582	return PDF_TOK_CLOSE_BRACE;
583	case IS_NUMBER:
584	return lex_number(ctx, f, buf, c);
585	default: / isregular: !isdelim && !iswhite && c != EOF /
586	fz_unread_byte(ctx, f);
587	lex_name(ctx, f, buf);
588	return pdf_token_from_keyword(buf->scratch);
589	}
590	}
591	}
592
593	pdf_token
594	pdf_lex_no_string(fz_context ctx, fz_stream f, pdf_lexbuf *buf)
595	{
596	while (`1`)
597	{
598	int c = lex_byte(ctx, f);
599	switch (c)
600	{
601	case EOF:
602	return PDF_TOK_EOF;
603	case IS_WHITE:
604	lex_white(ctx, f);
605	break;
606	case `'%'`:
607	lex_comment(ctx, f);
608	break;
609	case `'/'`:
610	lex_name(ctx, f, buf);
611	return PDF_TOK_NAME;
612	case `'('`:
613	return PDF_TOK_ERROR; / no strings allowed /
614	case `')'`:
615	return PDF_TOK_ERROR; / no strings allowed /
616	case `'<'`:
617	c = lex_byte(ctx, f);
618	if (c == `'<'`)
619	return PDF_TOK_OPEN_DICT;
620	if (c != EOF)
621	fz_unread_byte(ctx, f);
622	return PDF_TOK_ERROR; / no strings allowed /
623	case `'>'`:
624	c = lex_byte(ctx, f);
625	if (c == `'>'`)
626	return PDF_TOK_CLOSE_DICT;
627	if (c != EOF)
628	fz_unread_byte(ctx, f);
629	return PDF_TOK_ERROR;
630	case `'['`:
631	return PDF_TOK_OPEN_ARRAY;
632	case `']'`:
633	return PDF_TOK_CLOSE_ARRAY;
634	case `'{'`:
635	return PDF_TOK_OPEN_BRACE;
636	case `'}'`:
637	return PDF_TOK_CLOSE_BRACE;
638	case IS_NUMBER:
639	return lex_number(ctx, f, buf, c);
640	default: / isregular: !isdelim && !iswhite && c != EOF /
641	fz_unread_byte(ctx, f);
642	lex_name(ctx, f, buf);
643	return pdf_token_from_keyword(buf->scratch);
644	}
645	}
646	}
647
648	/*
649	print a lexed token to a buffer, growing if necessary
650	*/
651	void pdf_append_token(fz_context ctx, fz_buffer fzbuf, int tok, pdf_lexbuf *buf)
652	{
653	switch (tok)
654	{
655	case PDF_TOK_NAME:
656	fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
657	break;
658	case PDF_TOK_STRING:
659	if (buf->len >= buf->size)
660	pdf_lexbuf_grow(ctx, buf);
661	buf->scratch[buf->len] = `0`;
662	fz_append_pdf_string(ctx, fzbuf, buf->scratch);
663	break;
664	case PDF_TOK_OPEN_DICT:
665	fz_append_string(ctx, fzbuf, "<<");
666	break;
667	case PDF_TOK_CLOSE_DICT:
668	fz_append_string(ctx, fzbuf, ">>");
669	break;
670	case PDF_TOK_OPEN_ARRAY:
671	fz_append_byte(ctx, fzbuf, `'['`);
672	break;
673	case PDF_TOK_CLOSE_ARRAY:
674	fz_append_byte(ctx, fzbuf, `']'`);
675	break;
676	case PDF_TOK_OPEN_BRACE:
677	fz_append_byte(ctx, fzbuf, `'{'`);
678	break;
679	case PDF_TOK_CLOSE_BRACE:
680	fz_append_byte(ctx, fzbuf, `'}'`);
681	break;
682	case PDF_TOK_INT:
683	fz_append_printf(ctx, fzbuf, "%ld", buf->i);
684	break;
685	case PDF_TOK_REAL:
686	fz_append_printf(ctx, fzbuf, "%g", buf->f);
687	break;
688	default:
689	fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
690	break;
691	}
692	}
693

Browse the source code of MuPDF/source/pdf/pdf-lex.c