1/*
2 * << Haru Free PDF Library >> -- hpdf_encoder_utf.c
3 *
4 * URL: http://libharu.org
5 *
6 * Copyright (c) 1999-2006 Takeshi Kanno <takeshi_kanno@est.hi-ho.ne.jp>
7 * Copyright (c) 2007-2008 Antony Dovgal <tony@daylessday.org>
8 * Copyright (c) 2010 Sergey Konovalov <webmaster@crynet.ru>
9 * Copyright (c) 2011 Koen Deforche <koen@emweb.be>
10 *
11 * Permission to use, copy, modify, distribute and sell this software
12 * and its documentation for any purpose is hereby granted without fee,
13 * provided that the above copyright notice appear in all copies and
14 * that both that copyright notice and this permission notice appear
15 * in supporting documentation.
16 * It is provided "as is" without express or implied warranty.
17 *
18 */
19
20#include "hpdf_conf.h"
21#include "hpdf_utils.h"
22#include "hpdf_encoder.h"
23#include "hpdf.h"
24
25typedef struct _UTF8_EncoderAttr_Rec *UTF8_EncoderAttr;
26typedef struct _UTF8_EncoderAttr_Rec {
27 HPDF_BYTE current_byte;
28 HPDF_BYTE end_byte;
29 HPDF_BYTE utf8_bytes[8];
30} UTF8_EncoderAttr_Rec;
31
32static const HPDF_CidRange_Rec UTF8_NOTDEF_RANGE = {0x0000, 0x001F, 1};
33static const HPDF_CidRange_Rec UTF8_SPACE_RANGE = {0x0000, 0xFFFF, 0};
34static const HPDF_CidRange_Rec UTF8_CID_RANGE[] = {
35 { 0x0000, 0xFFFF, 0x0 },
36 { 0xFFFF, 0xFFFF, 0x0 }
37};
38
39static HPDF_ByteType
40UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,
41 HPDF_ParseText_Rec *state);
42
43static HPDF_UNICODE
44UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,
45 HPDF_UINT16 code);
46
47static char *
48UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,
49 const char *text,
50 HPDF_UINT len,
51 HPDF_UINT *length);
52
53static HPDF_STATUS
54UTF8_Init (HPDF_Encoder encoder);
55
56/*--------------------------------------------------------------------------*/
57
58
59/*
60 * This function is taken from hpdf_encoder_utf8.c, originally submitted
61 * to libharu by 'Mirco'
62 */
63static HPDF_ByteType
64UTF8_Encoder_ByteType_Func (HPDF_Encoder encoder,
65 HPDF_ParseText_Rec *state)
66{
67 // This function is supposed to increment state->index
68 // Not logical ! (look at function HPDF_String_Write in hpdf_string.c)
69
70 // When HPDF_BYTE_TYPE_SINGLE is returned, the current byte is the
71 // CODE argument in call ToUnicode_Func
72 // When HPDF_BYTE_TYPE_LEAD is returned, the current byte (msb) and the
73 // next byte (lsb) is the CODE arguement in call ToUnicodeFunc
74 // When HPDF_BYTE_TYPE_TRIAL is returned, the current byte is ignored
75
76 HPDF_CMapEncoderAttr encoder_attr;
77 HPDF_BYTE byte;
78 UTF8_EncoderAttr utf8_attr;
79
80 encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;
81 utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]);
82
83 if (state->index == 0) {
84 //First byte, initialize.
85 HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Initialize: (%u) %s\n",
86 state->len, state->text));
87
88 utf8_attr->current_byte = 0;
89 }
90
91 byte = state->text[state->index];
92 state->index++;
93
94 HPDF_PTRACE ((" UTF8_Encoder_ByteType_Func - Byte: %hx\n", byte));
95
96 if (utf8_attr->current_byte == 0) {
97 utf8_attr->utf8_bytes[0] = byte;
98 utf8_attr->current_byte = 1;
99
100 if (!(byte & 0x80)) {
101 utf8_attr->current_byte = 0;
102 utf8_attr->end_byte = 0;
103 return HPDF_BYTE_TYPE_SINGLE;
104 }
105
106 if ((byte & 0xf8) == 0xf0)
107 utf8_attr->end_byte = 3;
108 else if ((byte & 0xf0) == 0xe0)
109 utf8_attr->end_byte = 2;
110 else if ((byte & 0xe0) == 0xc0)
111 utf8_attr->end_byte = 1;
112 else
113 utf8_attr->current_byte = 0; //ERROR, skip this byte
114 } else {
115 utf8_attr->utf8_bytes[utf8_attr->current_byte] = byte;
116 if (utf8_attr->current_byte == utf8_attr->end_byte) {
117 utf8_attr->current_byte = 0;
118 return HPDF_BYTE_TYPE_SINGLE;
119 }
120
121 utf8_attr->current_byte++;
122 }
123
124 return HPDF_BYTE_TYPE_TRIAL;
125}
126
127/*
128 * This function is taken from hpdf_encoder_utf8.c, originally submitted
129 * to libharu by 'Mirco'
130 */
131static HPDF_UNICODE
132UTF8_Encoder_ToUnicode_Func (HPDF_Encoder encoder,
133 HPDF_UINT16 code)
134{
135 // Supposed to convert CODE to unicode.
136 // This function is allways called after ByteType_Func.
137 // ByteType_Func recognizes the utf-8 bytes belonging to one character.
138
139 HPDF_CMapEncoderAttr encoder_attr;
140 UTF8_EncoderAttr utf8_attr;
141 unsigned int val;
142
143 encoder_attr = (HPDF_CMapEncoderAttr) encoder->attr;
144 utf8_attr = (UTF8_EncoderAttr) ((void *)encoder_attr->cid_map[0]);
145
146 switch (utf8_attr->end_byte) {
147 case 3:
148 val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x7) << 18) +
149 (unsigned int) ((utf8_attr->utf8_bytes[1]) << 12) +
150 (unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f) << 6) +
151 (unsigned int) ((utf8_attr->utf8_bytes[3] & 0x3f));
152 break;
153 case 2:
154 val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0xf) << 12) +
155 (unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f) << 6) +
156 (unsigned int) ((utf8_attr->utf8_bytes[2] & 0x3f));
157 break;
158 case 1:
159 val = (unsigned int) ((utf8_attr->utf8_bytes[0] & 0x1f) << 6) +
160 (unsigned int) ((utf8_attr->utf8_bytes[1] & 0x3f));
161 break;
162 case 0:
163 val = (unsigned int) utf8_attr->utf8_bytes[0];
164 break;
165 default:
166 val = 32; // Unknown character
167 }
168
169 if (val > 65535) //Convert everything outside UCS-2 to space
170 val = 32;
171
172 return val;
173}
174
175static char *
176UTF8_Encoder_EncodeText_Func (HPDF_Encoder encoder,
177 const char *text,
178 HPDF_UINT len,
179 HPDF_UINT *length)
180{
181 char *result = malloc(len * 2);
182 char *c = result;
183 HPDF_ParseText_Rec parse_state;
184 HPDF_UINT i;
185
186 HPDF_Encoder_SetParseText (encoder, &parse_state,
187 (const HPDF_BYTE *)text, len);
188
189 for (i = 0; i < len; i++) {
190 HPDF_UNICODE tmp_unicode;
191 HPDF_ByteType btype = HPDF_Encoder_ByteType (encoder, &parse_state);
192
193 if (btype != HPDF_BYTE_TYPE_TRIAL) {
194 tmp_unicode = HPDF_Encoder_ToUnicode (encoder, 0);
195
196 HPDF_UInt16Swap (&tmp_unicode);
197 HPDF_MemCpy ((HPDF_BYTE *)c, (const HPDF_BYTE*)&tmp_unicode, 2);
198 c += 2;
199 }
200 }
201
202 *length = c - result;
203
204 return result;
205}
206
207static HPDF_STATUS
208UTF8_Init (HPDF_Encoder encoder)
209{
210 HPDF_CMapEncoderAttr attr;
211 HPDF_STATUS ret;
212
213 if ((ret = HPDF_CMapEncoder_InitAttr (encoder)) != HPDF_OK)
214 return ret;
215
216 /*
217 * We override these two
218 */
219 encoder->byte_type_fn = UTF8_Encoder_ByteType_Func;
220 encoder->to_unicode_fn = UTF8_Encoder_ToUnicode_Func;
221 encoder->encode_text_fn = UTF8_Encoder_EncodeText_Func;
222
223 attr = (HPDF_CMapEncoderAttr)encoder->attr;
224
225 if (HPDF_CMapEncoder_AddCMap (encoder, UTF8_CID_RANGE) != HPDF_OK)
226 return encoder->error->error_no;
227
228 if (HPDF_CMapEncoder_AddCodeSpaceRange (encoder, UTF8_SPACE_RANGE)
229 != HPDF_OK)
230 return encoder->error->error_no;
231
232 if (HPDF_CMapEncoder_AddNotDefRange (encoder, UTF8_NOTDEF_RANGE)
233 != HPDF_OK)
234 return encoder->error->error_no;
235
236 attr->is_lead_byte_fn = NULL;
237 attr->is_trial_byte_fn = NULL;
238
239 HPDF_StrCpy (attr->registry, "Adobe", attr->registry +
240 HPDF_LIMIT_MAX_NAME_LEN);
241 HPDF_StrCpy (attr->ordering, "Identity-H", attr->ordering +
242 HPDF_LIMIT_MAX_NAME_LEN);
243 attr->suppliment = 0;
244 attr->writing_mode = HPDF_WMODE_HORIZONTAL;
245
246 /* Not sure about this
247 attr->uid_offset = 0;
248 attr->xuid[0] = 0;
249 attr->xuid[1] = 0;
250 attr->xuid[2] = 0;
251 */
252
253 encoder->type = HPDF_ENCODER_TYPE_DOUBLE_BYTE;
254
255 return HPDF_OK;
256}
257
258/*--------------------------------------------------------------------------*/
259
260HPDF_EXPORT(HPDF_STATUS)
261HPDF_UseUTFEncodings (HPDF_Doc pdf)
262{
263 HPDF_Encoder encoder;
264 HPDF_STATUS ret;
265
266 if (!HPDF_HasDoc (pdf))
267 return HPDF_INVALID_DOCUMENT;
268
269 encoder = HPDF_CMapEncoder_New (pdf->mmgr, "UTF-8",
270 UTF8_Init);
271
272 if ((ret = HPDF_Doc_RegisterEncoder (pdf, encoder)) != HPDF_OK)
273 return ret;
274
275 return HPDF_OK;
276}
277