hpdf_encoder_utf.c source code [POCO/PDF/src/hpdf_encoder_utf.c]

1	/*
2	* << Haru Free PDF Library >> -- hpdf_encoder_utf.c
3	*
4	* URL: http://libharu.org
5	*
6	* Copyright (c) 1999-2006 Takeshi Kanno <takeshi_kanno@est.hi-ho.ne.jp>
7	* Copyright (c) 2007-2008 Antony Dovgal <tony@daylessday.org>
8	* Copyright (c) 2010 Sergey Konovalov <webmaster@crynet.ru>
9	* Copyright (c) 2011 Koen Deforche <koen@emweb.be>
10	*
11	* Permission to use, copy, modify, distribute and sell this software
12	* and its documentation for any purpose is hereby granted without fee,
13	* provided that the above copyright notice appear in all copies and
14	* that both that copyright notice and this permission notice appear
15	* in supporting documentation.
16	* It is provided "as is" without express or implied warranty.
17	*
18	*/
19
20	#include "hpdf_conf.h"
21	#include "hpdf_utils.h"
22	#include "hpdf_encoder.h"
23	#include "hpdf.h"
24
25	typedef struct _UTF8_EncoderAttr_Rec *UTF8_EncoderAttr;
26	typedef struct _UTF8_EncoderAttr_Rec {
27	HPDF_BYTE current_byte;
28	HPDF_BYTE end_byte;
29	HPDF_BYTE utf8_bytes[`8`];
30	} UTF8_EncoderAttr_Rec;
31
32	static const HPDF_CidRange_Rec UTF8_NOTDEF_RANGE = {`0x0000`, `0x001F`, `1`};
33	static const HPDF_CidRange_Rec UTF8_SPACE_RANGE = {`0x0000`, `0xFFFF`, `0`};
34	static const HPDF_CidRange_Rec UTF8_CID_RANGE[] = {
35	{ `0x0000`, `0xFFFF`, `0x0` },
36	{ `0xFFFF`, `0xFFFF`, `0x0` }
37	};
38
39	static HPDF_ByteType
40	UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,
41	HPDF_ParseText_Rec *state);
42
43	static HPDF_UNICODE
44	UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,
45	HPDF_UINT16 code);
46
47	static char *
48	UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,
49	const char *text,
50	HPDF_UINT len,
51	HPDF_UINT *length);
52
53	static HPDF_STATUS
54	UTF8_Init (HPDF_Encoder encoder);
55
/*--------------------------------------------------------------------------*/
57
58
59	/*
60	* This function is taken from hpdf_encoder_utf8.c, originally submitted
61	* to libharu by 'Mirco'
62	*/
63	static HPDF_ByteType
64	UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,
65	HPDF_ParseText_Rec *state)
66	{
67	// This function is supposed to increment state->index
68	// Not logical ! (look at function HPDF_String_Write in hpdf_string.c)
69
70	// When HPDF_BYTE_TYPE_SINGLE is returned, the current byte is the
71	// CODE argument in call ToUnicode_Func
72	// When HPDF_BYTE_TYPE_LEAD is returned, the current byte (msb) and the
73	// next byte (lsb) is the CODE arguement in call ToUnicodeFunc
74	// When HPDF_BYTE_TYPE_TRIAL is returned, the current byte is ignored
75
76	HPDF_CMapEncoderAttr encoder_attr;
77	HPDF_BYTE byte;
78	UTF8_EncoderAttr utf8_attr;
79
80	encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;
81	utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[`0`]);
82
83	if (state->index == `0`) {
84	//First byte, initialize.
85	HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Initialize: (%u) %s\n",
86	state->len, state->text));
87
88	utf8_attr->current_byte = `0`;
89	}
90
91	byte = state->text[state->index];
92	state->index++;
93
94	HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Byte: %hx\n", byte));
95
96	if (utf8_attr->current_byte == `0`) {
97	utf8_attr->utf8_bytes[`0`] = byte;
98	utf8_attr->current_byte = `1`;
99
100	if (!(byte & `0x80`)) {
101	utf8_attr->current_byte = `0`;
102	utf8_attr->end_byte = `0`;
103	return HPDF_BYTE_TYPE_SINGLE;
104	}
105
106	if ((byte & `0xf8`) == `0xf0`)
107	utf8_attr->end_byte = `3`;
108	else if ((byte & `0xf0`) == `0xe0`)
109	utf8_attr->end_byte = `2`;
110	else if ((byte & `0xe0`) == `0xc0`)
111	utf8_attr->end_byte = `1`;
112	else
113	utf8_attr->current_byte = `0`; //ERROR, skip this byte
114	} else {
115	utf8_attr->utf8_bytes[utf8_attr->current_byte] = byte;
116	if (utf8_attr->current_byte == utf8_attr->end_byte) {
117	utf8_attr->current_byte = `0`;
118	return HPDF_BYTE_TYPE_SINGLE;
119	}
120
121	utf8_attr->current_byte++;
122	}
123
124	return HPDF_BYTE_TYPE_TRIAL;
125	}
126
127	/*
128	* This function is taken from hpdf_encoder_utf8.c, originally submitted
129	* to libharu by 'Mirco'
130	*/
131	static HPDF_UNICODE
132	UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,
133	HPDF_UINT16 code)
134	{
135	// Supposed to convert CODE to unicode.
136	// This function is allways called after ByteType_Func.
137	// ByteType_Func recognizes the utf-8 bytes belonging to one character.
138
139	HPDF_CMapEncoderAttr encoder_attr;
140	UTF8_EncoderAttr utf8_attr;
141	unsigned int val;
142
143	encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;
144	utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[`0`]);
145
146	switch (utf8_attr->end_byte) {
147	case `3`:
148	val = (unsigned int) ((utf8_attr->utf8_bytes[`0`] & `0x7`) << `18`) +
149	(unsigned int) ((utf8_attr->utf8_bytes[`1`]) << `12`) +
150	(unsigned int) ((utf8_attr->utf8_bytes[`2`] & `0x3f`) << `6`) +
151	(unsigned int) ((utf8_attr->utf8_bytes[`3`] & `0x3f`));
152	break;
153	case `2`:
154	val = (unsigned int) ((utf8_attr->utf8_bytes[`0`] & `0xf`) << `12`) +
155	(unsigned int) ((utf8_attr->utf8_bytes[`1`] & `0x3f`) << `6`) +
156	(unsigned int) ((utf8_attr->utf8_bytes[`2`] & `0x3f`));
157	break;
158	case `1`:
159	val = (unsigned int) ((utf8_attr->utf8_bytes[`0`] & `0x1f`) << `6`) +
160	(unsigned int) ((utf8_attr->utf8_bytes[`1`] & `0x3f`));
161	break;
162	case `0`:
163	val = (unsigned int) utf8_attr->utf8_bytes[`0`];
164	break;
165	default:
166	val = `32`; // Unknown character
167	}
168
169	if (val > `65535`) //Convert everything outside UCS-2 to space
170	val = `32`;
171
172	return val;
173	}
174
175	static char *
176	UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,
177	const char *text,
178	HPDF_UINT len,
179	HPDF_UINT *length)
180	{
181	char result = malloc(len `2`);
182	char *c = result;
183	HPDF_ParseText_Rec parse_state;
184	HPDF_UINT i;
185
186	HPDF_Encoder_SetParseText (encoder, &parse_state,
187	(const HPDF_BYTE *)text, len);
188
189	for (i = `0`; i < len; i++) {
190	HPDF_UNICODE tmp_unicode;
191	HPDF_ByteType btype = HPDF_Encoder_ByteType (encoder, &parse_state);
192
193	if (btype != HPDF_BYTE_TYPE_TRIAL) {
194	tmp_unicode = HPDF_Encoder_ToUnicode (encoder, `0`);
195
196	HPDF_UInt16Swap (&tmp_unicode);
197	HPDF_MemCpy ((HPDF_BYTE )c, (const* HPDF_BYTE*)&tmp_unicode, `2`);
198	c += `2`;
199	}
200	}
201
202	*length = c - result;
203
204	return result;
205	}
206
207	static HPDF_STATUS
208	UTF8_Init (HPDF_Encoder encoder)
209	{
210	HPDF_CMapEncoderAttr attr;
211	HPDF_STATUS ret;
212
213	if ((ret = HPDF_CMapEncoder_InitAttr (encoder)) != HPDF_OK)
214	return ret;
215
216	/*
217	* We override these two
218	*/
219	encoder->byte_type_fn = UTF8_Encoder_ByteType_Func;
220	encoder->to_unicode_fn = UTF8_Encoder_ToUnicode_Func;
221	encoder->encode_text_fn = UTF8_Encoder_EncodeText_Func;
222
223	attr = (HPDF_CMapEncoderAttr)encoder->attr;
224
225	if (HPDF_CMapEncoder_AddCMap (encoder, UTF8_CID_RANGE) != HPDF_OK)
226	return encoder->error->error_no;
227
228	if (HPDF_CMapEncoder_AddCodeSpaceRange (encoder, UTF8_SPACE_RANGE)
229	!= HPDF_OK)
230	return encoder->error->error_no;
231
232	if (HPDF_CMapEncoder_AddNotDefRange (encoder, UTF8_NOTDEF_RANGE)
233	!= HPDF_OK)
234	return encoder->error->error_no;
235
236	attr->is_lead_byte_fn = NULL;
237	attr->is_trial_byte_fn = NULL;
238
239	HPDF_StrCpy (attr->registry, "Adobe", attr->registry +
240	HPDF_LIMIT_MAX_NAME_LEN);
241	HPDF_StrCpy (attr->ordering, "Identity-H", attr->ordering +
242	HPDF_LIMIT_MAX_NAME_LEN);
243	attr->suppliment = `0`;
244	attr->writing_mode = HPDF_WMODE_HORIZONTAL;
245
246	/ Not sure about this*
247	attr->uid_offset = 0;
248	attr->xuid[0] = 0;
249	attr->xuid[1] = 0;
250	attr->xuid[2] = 0;
251	*/
252
253	encoder->type = HPDF_ENCODER_TYPE_DOUBLE_BYTE;
254
255	return HPDF_OK;
256	}
257
/*--------------------------------------------------------------------------*/
259
260	HPDF_EXPORT(HPDF_STATUS)
261	HPDF_UseUTFEncodings (HPDF_Doc pdf)
262	{
263	HPDF_Encoder encoder;
264	HPDF_STATUS ret;
265
266	if (!HPDF_HasDoc (pdf))
267	return HPDF_INVALID_DOCUMENT;
268
269	encoder = HPDF_CMapEncoder_New (pdf->mmgr, "UTF-8",
270	UTF8_Init);
271
272	if ((ret = HPDF_Doc_RegisterEncoder (pdf, encoder)) != HPDF_OK)
273	return ret;
274
275	return HPDF_OK;
276	}
277

Browse the source code of POCO/PDF/src/hpdf_encoder_utf.c