| 1 | /* |
| 2 | * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
| 3 | * |
| 4 | * Permission to use, copy, modify, and/or distribute this software for any |
| 5 | * purpose with or without fee is hereby granted, provided that the above |
| 6 | * copyright notice and this permission notice appear in all copies. |
| 7 | * |
| 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 15 | */ |
| 16 | |
| 17 | #include "mupdf/fitz.h" |
| 18 | #include "mupdf/ucdn.h" |
| 19 | |
| 20 | #include <stdio.h> |
| 21 | #include <stdlib.h> |
| 22 | |
/*
 * Per-code-point property record, as returned by get_ucd_record().
 * Each field is exposed through the ucdn_get_* accessor of the same name.
 * NOTE(review): the field order presumably has to match the positional
 * initializers of ucd_records in ucdn_db.h -- confirm before reordering.
 */
typedef struct {
	unsigned char category;		/* ucdn_get_general_category() */
	unsigned char combining;	/* ucdn_get_combining_class() */
	unsigned char bidi_class;	/* ucdn_get_bidi_class() */
	unsigned char east_asian_width;	/* ucdn_get_east_asian_width() */
	unsigned char script;		/* ucdn_get_script() */
	unsigned char linebreak_class;	/* ucdn_get_linebreak_class() */
} UCDRecord;
| 31 | |
/* One entry of the mirror_pairs table searched by ucdn_mirror(). */
typedef struct {
	unsigned short from;	/* search key, compared by compare_mp() */
	unsigned short to;	/* value ucdn_mirror() returns on a match */
} MirrorPair;
| 35 | |
/* One entry of the bracket_pairs table searched by search_bp(). */
typedef struct {
	unsigned short from;	/* search key, compared by compare_bp() */
	unsigned short to;	/* value ucdn_paired_bracket() returns */
	unsigned char type;	/* value ucdn_paired_bracket_type() returns */
} BracketPair;
| 40 | |
/*
 * A run of consecutive code points sharing one composition index range.
 * get_comp_index() maps a code point inside the run to
 * index + (code - start).
 */
typedef struct {
	unsigned int start;	/* first code point of the run */
	short count;		/* added to start to form the run's upper bound */
	short index;		/* composition index assigned to start */
} Reindex;
| 45 | |
| 46 | #include "ucdn_db.h" |
| 47 | |
/*
 * Constants required for algorithmic Hangul (de)composition.
 *
 * The *BASE values are the starting code points used by
 * hangul_pair_decompose() and hangul_pair_compose(); the *COUNT values are
 * the sizes of the corresponding ranges.
 */

/* base code points */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7

/* range sizes */
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)
#define SCOUNT (LCOUNT * NCOUNT)	/* 19 * 21 * 28 == 11172 */
| 58 | |
| 59 | static const UCDRecord *get_ucd_record(uint32_t code) |
| 60 | { |
| 61 | int index, offset; |
| 62 | |
| 63 | if (code >= 0x110000) |
| 64 | index = 0; |
| 65 | else { |
| 66 | index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
| 67 | offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
| 68 | index = index1[index + offset] << SHIFT2; |
| 69 | offset = code & ((1<<SHIFT2) - 1); |
| 70 | index = index2[index + offset]; |
| 71 | } |
| 72 | |
| 73 | return &ucd_records[index]; |
| 74 | } |
| 75 | |
| 76 | static const unsigned short *get_decomp_record(uint32_t code) |
| 77 | { |
| 78 | int index, offset; |
| 79 | |
| 80 | if (code >= 0x110000) |
| 81 | index = 0; |
| 82 | else { |
| 83 | index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
| 84 | << DECOMP_SHIFT1; |
| 85 | offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
| 86 | index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
| 87 | offset = code & ((1<<DECOMP_SHIFT2) - 1); |
| 88 | index = decomp_index2[index + offset]; |
| 89 | } |
| 90 | |
| 91 | return &decomp_data[index]; |
| 92 | } |
| 93 | |
| 94 | static int compare_reindex(const void *a, const void *b) |
| 95 | { |
| 96 | Reindex *ra = (Reindex *)a; |
| 97 | Reindex *rb = (Reindex *)b; |
| 98 | |
| 99 | if (ra->start < rb->start) |
| 100 | return -1; |
| 101 | else if (ra->start > (rb->start + rb->count)) |
| 102 | return 1; |
| 103 | else |
| 104 | return 0; |
| 105 | } |
| 106 | |
| 107 | static int get_comp_index(uint32_t code, const Reindex *idx, size_t len) |
| 108 | { |
| 109 | Reindex *res; |
| 110 | Reindex r = {0, 0, 0}; |
| 111 | r.start = code; |
| 112 | res = (Reindex *) bsearch(&r, idx, len, sizeof(Reindex), compare_reindex); |
| 113 | |
| 114 | if (res != NULL) |
| 115 | return res->index + (code - res->start); |
| 116 | else |
| 117 | return -1; |
| 118 | } |
| 119 | |
| 120 | static int compare_mp(const void *a, const void *b) |
| 121 | { |
| 122 | MirrorPair *mpa = (MirrorPair *)a; |
| 123 | MirrorPair *mpb = (MirrorPair *)b; |
| 124 | return mpa->from - mpb->from; |
| 125 | } |
| 126 | |
| 127 | static int compare_bp(const void *a, const void *b) |
| 128 | { |
| 129 | BracketPair *bpa = (BracketPair *)a; |
| 130 | BracketPair *bpb = (BracketPair *)b; |
| 131 | return bpa->from - bpb->from; |
| 132 | } |
| 133 | |
| 134 | static BracketPair *search_bp(uint32_t code) |
| 135 | { |
| 136 | BracketPair bp = {0,0,2}; |
| 137 | BracketPair *res; |
| 138 | |
| 139 | bp.from = code; |
| 140 | res = (BracketPair *) bsearch(&bp, bracket_pairs, BIDI_BRACKET_LEN, |
| 141 | sizeof(BracketPair), compare_bp); |
| 142 | return res; |
| 143 | } |
| 144 | |
| 145 | static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
| 146 | { |
| 147 | int si = code - SBASE; |
| 148 | |
| 149 | if (si < 0 || si >= SCOUNT) |
| 150 | return 0; |
| 151 | |
| 152 | if (si % TCOUNT) { |
| 153 | /* LV,T */ |
| 154 | *a = SBASE + (si / TCOUNT) * TCOUNT; |
| 155 | *b = TBASE + (si % TCOUNT); |
| 156 | return 3; |
| 157 | } else { |
| 158 | /* L,V */ |
| 159 | *a = LBASE + (si / NCOUNT); |
| 160 | *b = VBASE + (si % NCOUNT) / TCOUNT; |
| 161 | return 2; |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
| 166 | { |
| 167 | if (a >= SBASE && a < (SBASE + SCOUNT) && b >= TBASE && b < (TBASE + TCOUNT)) { |
| 168 | /* LV,T */ |
| 169 | *code = a + (b - TBASE); |
| 170 | return 3; |
| 171 | } else if (a >= LBASE && a < (LBASE + LCOUNT) && b >= VBASE && b < (VBASE + VCOUNT)) { |
| 172 | /* L,V */ |
| 173 | int li = a - LBASE; |
| 174 | int vi = b - VBASE; |
| 175 | *code = SBASE + li * NCOUNT + vi * TCOUNT; |
| 176 | return 2; |
| 177 | } else { |
| 178 | return 0; |
| 179 | } |
| 180 | } |
| 181 | |
/*
 * Decode one code point from a UTF-16 sequence and advance *code_ptr
 * past the units consumed.
 *
 * A unit in the high-surrogate range 0xD800..0xDBFF is combined with the
 * unit that follows it (which is not validated) and two units are
 * consumed.  Any other unit, including a stray low surrogate
 * 0xDC00..0xDFFF, is returned unchanged and one unit is consumed.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
	const unsigned short *code = *code_ptr;
	uint32_t lead = code[0];
	uint32_t trail;

	/* 0xDBFF is the last high surrogate; 0xDC00 already is a low one */
	if (lead < 0xd800 || lead > 0xdbff) {
		*code_ptr += 1;
		return lead;
	}

	trail = code[1];
	*code_ptr += 2;
	return 0x10000 + (trail - 0xdc00) + ((lead - 0xd800) << 10);
}
| 195 | |
| 196 | const char *ucdn_get_unicode_version(void) |
| 197 | { |
| 198 | return UNIDATA_VERSION; |
| 199 | } |
| 200 | |
| 201 | int ucdn_get_combining_class(uint32_t code) |
| 202 | { |
| 203 | return get_ucd_record(code)->combining; |
| 204 | } |
| 205 | |
| 206 | int ucdn_get_east_asian_width(uint32_t code) |
| 207 | { |
| 208 | return get_ucd_record(code)->east_asian_width; |
| 209 | } |
| 210 | |
| 211 | int ucdn_get_general_category(uint32_t code) |
| 212 | { |
| 213 | return get_ucd_record(code)->category; |
| 214 | } |
| 215 | |
| 216 | int ucdn_get_bidi_class(uint32_t code) |
| 217 | { |
| 218 | return get_ucd_record(code)->bidi_class; |
| 219 | } |
| 220 | |
/*
 * Return 1 when ucdn_mirror() maps code to a different code point,
 * 0 when it returns code unchanged.
 */
int ucdn_get_mirrored(uint32_t code)
{
	uint32_t mirrored = ucdn_mirror(code);

	return mirrored != code;
}
| 225 | |
| 226 | int ucdn_get_script(uint32_t code) |
| 227 | { |
| 228 | return get_ucd_record(code)->script; |
| 229 | } |
| 230 | |
| 231 | int ucdn_get_linebreak_class(uint32_t code) |
| 232 | { |
| 233 | return get_ucd_record(code)->linebreak_class; |
| 234 | } |
| 235 | |
| 236 | int ucdn_get_resolved_linebreak_class(uint32_t code) |
| 237 | { |
| 238 | const UCDRecord *record = get_ucd_record(code); |
| 239 | |
| 240 | switch (record->linebreak_class) |
| 241 | { |
| 242 | case UCDN_LINEBREAK_CLASS_AI: |
| 243 | case UCDN_LINEBREAK_CLASS_SG: |
| 244 | case UCDN_LINEBREAK_CLASS_XX: |
| 245 | return UCDN_LINEBREAK_CLASS_AL; |
| 246 | |
| 247 | case UCDN_LINEBREAK_CLASS_SA: |
| 248 | if (record->category == UCDN_GENERAL_CATEGORY_MC || |
| 249 | record->category == UCDN_GENERAL_CATEGORY_MN) |
| 250 | return UCDN_LINEBREAK_CLASS_CM; |
| 251 | return UCDN_LINEBREAK_CLASS_AL; |
| 252 | |
| 253 | case UCDN_LINEBREAK_CLASS_CJ: |
| 254 | return UCDN_LINEBREAK_CLASS_NS; |
| 255 | |
| 256 | case UCDN_LINEBREAK_CLASS_CB: |
| 257 | return UCDN_LINEBREAK_CLASS_B2; |
| 258 | |
| 259 | case UCDN_LINEBREAK_CLASS_NL: |
| 260 | return UCDN_LINEBREAK_CLASS_BK; |
| 261 | |
| 262 | default: |
| 263 | return record->linebreak_class; |
| 264 | } |
| 265 | } |
| 266 | |
| 267 | uint32_t ucdn_mirror(uint32_t code) |
| 268 | { |
| 269 | MirrorPair mp = {0}; |
| 270 | MirrorPair *res; |
| 271 | |
| 272 | mp.from = code; |
| 273 | res = (MirrorPair *) bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, |
| 274 | sizeof(MirrorPair), compare_mp); |
| 275 | |
| 276 | if (res == NULL) |
| 277 | return code; |
| 278 | else |
| 279 | return res->to; |
| 280 | } |
| 281 | |
| 282 | uint32_t ucdn_paired_bracket(uint32_t code) |
| 283 | { |
| 284 | BracketPair *res = search_bp(code); |
| 285 | if (res == NULL) |
| 286 | return code; |
| 287 | else |
| 288 | return res->to; |
| 289 | } |
| 290 | |
| 291 | int ucdn_paired_bracket_type(uint32_t code) |
| 292 | { |
| 293 | BracketPair *res = search_bp(code); |
| 294 | if (res == NULL) |
| 295 | return UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE; |
| 296 | else |
| 297 | return res->type; |
| 298 | } |
| 299 | |
/*
 * Decompose code into a pair of code points.
 *
 * code: code point to decompose
 * a, b: receive the result; *b is set to 0 when the decomposition has a
 *       single element.  Both are left untouched when 0 is returned.
 *
 * Hangul syllables are handled algorithmically first.  Otherwise the
 * decomposition record is consulted: its first unit holds the length in
 * the high byte, and a record whose low byte is non-zero is not returned
 * by this function.
 * NOTE(review): the low byte presumably flags a compatibility mapping
 * (ucdn_compat_decompose() ignores it) -- confirm against ucdn_db.h.
 *
 * Returns 1 on success, 0 when there is nothing to return.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
	const unsigned short *cursor;
	unsigned short header;
	int count;

	if (hangul_pair_decompose(code, a, b) != 0)
		return 1;

	cursor = get_decomp_record(code);
	header = cursor[0];
	count = header >> 8;

	if ((header & 0xff) != 0 || count == 0)
		return 0;

	/* skip the header; the decomposition follows as UTF-16 */
	cursor++;
	*a = decode_utf16(&cursor);
	*b = (count > 1) ? decode_utf16(&cursor) : 0;

	return 1;
}
| 323 | |
| 324 | int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
| 325 | { |
| 326 | int l, r, index, indexi, offset; |
| 327 | |
| 328 | if (hangul_pair_compose(code, a, b)) |
| 329 | return 1; |
| 330 | |
| 331 | l = get_comp_index(a, nfc_first, sizeof(nfc_first) / sizeof(Reindex)); |
| 332 | r = get_comp_index(b, nfc_last, sizeof(nfc_last) / sizeof(Reindex)); |
| 333 | |
| 334 | if (l < 0 || r < 0) |
| 335 | return 0; |
| 336 | |
| 337 | indexi = l * TOTAL_LAST + r; |
| 338 | index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
| 339 | offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
| 340 | index = comp_index1[index + offset] << COMP_SHIFT2; |
| 341 | offset = indexi & ((1<<COMP_SHIFT2) - 1); |
| 342 | *code = comp_data[index + offset]; |
| 343 | |
| 344 | return *code != 0; |
| 345 | } |
| 346 | |
/*
 * Write the full decomposition stored for code into decomposed.
 *
 * code:       code point to decompose
 * decomposed: output buffer; the caller must provide room for every
 *             element of the decomposition (the length is taken from the
 *             high byte of the record header, so it is at most 255)
 *
 * Unlike ucdn_decompose(), the low byte of the record header is not
 * examined and Hangul syllables get no algorithmic handling here.
 *
 * Returns the number of code points written, or 0 when the record has
 * length 0.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
	const unsigned short *cursor = get_decomp_record(code);
	uint32_t *out = decomposed;
	int count = cursor[0] >> 8;
	int remaining;

	if (count == 0)
		return 0;

	/* skip the header; the decomposition follows as UTF-16 */
	cursor++;
	for (remaining = count; remaining > 0; remaining--)
		*out++ = decode_utf16(&cursor);

	return count;
}
| 362 | |