| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * GB18030 <--> UTF8 |
| 4 | * |
| 5 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
| 6 | * Portions Copyright (c) 1994, Regents of the University of California |
| 7 | * |
| 8 | * IDENTIFICATION |
| 9 | * src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c |
| 10 | * |
| 11 | *------------------------------------------------------------------------- |
| 12 | */ |
| 13 | |
| 14 | #include "postgres.h" |
| 15 | #include "fmgr.h" |
| 16 | #include "mb/pg_wchar.h" |
| 17 | #include "../../Unicode/gb18030_to_utf8.map" |
| 18 | #include "../../Unicode/utf8_to_gb18030.map" |
| 19 | |
| 20 | PG_MODULE_MAGIC; |
| 21 | |
| 22 | PG_FUNCTION_INFO_V1(gb18030_to_utf8); |
| 23 | PG_FUNCTION_INFO_V1(utf8_to_gb18030); |
| 24 | |
| 25 | /* |
| 26 | * Convert 4-byte GB18030 characters to and from a linear code space |
| 27 | * |
| 28 | * The first and third bytes can range from 0x81 to 0xfe (126 values), |
| 29 | * while the second and fourth bytes can range from 0x30 to 0x39 (10 values). |
| 30 | */ |
/*
 * gb_linear: turn a packed 4-byte GB18030 code (first byte in the most
 * significant position) into a zero-based linear index.
 *
 * Each byte is weighted by the number of codes spanned by the bytes that
 * follow it (10, 126 * 10 and 10 * 126 * 10), and the weighted value of
 * the lowest code, 0x81308130, is subtracted so that code maps to 0.
 * The result is only meaningful when every byte is within its valid range.
 */
static inline uint32
gb_linear(uint32 gb)
{
	const uint32 lead = (gb >> 24) & 0xff;
	const uint32 second = (gb >> 16) & 0xff;
	const uint32 third = (gb >> 8) & 0xff;
	const uint32 last = gb & 0xff;

	/* weighted value of the smallest 4-byte code, 0x81 0x30 0x81 0x30 */
	const uint32 origin = 0x81 * 12600 + 0x30 * 1260 + 0x81 * 10 + 0x30;

	uint32		weighted = lead * 12600;

	weighted += second * 1260;
	weighted += third * 10;
	weighted += last;

	return weighted - origin;
}
| 42 | |
/*
 * gb_unlinear: inverse of gb_linear.  Rebuild the packed 4-byte GB18030
 * code for a zero-based linear index by peeling off mixed-radix digits
 * from the least significant byte upwards (radix 10, 126, 10, then the
 * remainder for the lead byte).
 */
static inline uint32
gb_unlinear(uint32 lin)
{
	uint32		rest = lin;
	uint32		code;

	/* fourth byte: 0x30..0x39 */
	code = 0x30 + rest % 10;
	rest /= 10;

	/* third byte: 0x81..0xfe */
	code |= (0x81 + rest % 126) << 8;
	rest /= 126;

	/* second byte: 0x30..0x39 */
	code |= (0x30 + rest % 10) << 16;
	rest /= 10;

	/* whatever is left selects the first byte */
	code |= (0x81 + rest) << 24;

	return code;
}
| 53 | |
| 54 | /* |
| 55 | * Convert word-formatted UTF8 to and from Unicode code points |
| 56 | * |
| 57 | * Probably this should be somewhere else ... |
| 58 | */ |
/*
 * unicode_to_utf8word: encode code point c as UTF-8 and return the bytes
 * packed into one 32-bit word, with the lead byte in the highest occupied
 * position and the final byte in the low 8 bits.
 *
 * One to four bytes are produced depending on the magnitude of c.
 */
static inline uint32
unicode_to_utf8word(uint32 c)
{
	/* continuation bytes carrying bits 0-5, 6-11 and 12-17 of c */
	const uint32 cont0 = 0x80 | (c & 0x3F);
	const uint32 cont1 = 0x80 | ((c >> 6) & 0x3F);
	const uint32 cont2 = 0x80 | ((c >> 12) & 0x3F);

	/* single byte: the code point is its own encoding */
	if (c <= 0x7F)
		return c;

	/* two bytes: 110xxxxx 10xxxxxx */
	if (c <= 0x7FF)
		return ((0xC0 | ((c >> 6) & 0x1F)) << 8) | cont0;

	/* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
	if (c <= 0xFFFF)
		return ((0xE0 | ((c >> 12) & 0x0F)) << 16) | (cont1 << 8) | cont0;

	/* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	return ((0xF0 | ((c >> 18) & 0x07)) << 24) |
		(cont2 << 16) | (cont1 << 8) | cont0;
}
| 89 | |
/*
 * utf8word_to_unicode: decode a UTF-8 sequence that has been packed into
 * a 32-bit word (lead byte in the highest occupied position) back into
 * its code point.
 *
 * The sequence length is inferred from the magnitude of the word; the
 * bytes are not checked for being well-formed UTF-8.
 */
static inline uint32
utf8word_to_unicode(uint32 c)
{
	/* one byte */
	if (c <= 0x7F)
		return c;

	/* two bytes: 5 payload bits in the lead byte, 6 in the trailer */
	if (c <= 0xFFFF)
		return (((c >> 8) & 0x1F) << 6) | (c & 0x3F);

	/* three bytes: 4 + 6 + 6 payload bits */
	if (c <= 0xFFFFFF)
		return (((c >> 16) & 0x0F) << 12) |
			(((c >> 8) & 0x3F) << 6) |
			(c & 0x3F);

	/* four bytes: 3 + 6 + 6 + 6 payload bits */
	return (((c >> 24) & 0x07) << 18) |
		(((c >> 16) & 0x3F) << 12) |
		(((c >> 8) & 0x3F) << 6) |
		(c & 0x3F);
}
| 120 | |
| 121 | /* |
| 122 | * Perform mapping of GB18030 ranges to UTF8 |
| 123 | * |
| 124 | * The ranges we need to convert are specified in gb-18030-2000.xml. |
| 125 | * All are ranges of 4-byte GB18030 codes. |
| 126 | */ |
/*
 * conv_18030_to_utf8: algorithmic mapping for the 4-byte GB18030 ranges
 * that are not listed individually in the lookup tree.
 *
 * code is the GB18030 character packed into a word, first byte in the
 * most significant position.  Returns the UTF-8 encoding packed the same
 * way, or 0 if no mapping exists.
 *
 * gb_linear() only yields a meaningful index when each byte lies in its
 * valid range (first/third byte 0x81..0xfe, second/fourth 0x30..0x39).
 * A numeric range test alone lets a malformed sequence such as
 * 0x8130D33A fall inside a range and silently alias another character,
 * so the byte layout is verified before any range is consulted.
 */
static uint32
conv_18030_to_utf8(uint32 code)
{
	/* Linear ranges, in ascending code order. */
	static const struct
	{
		uint32		minunicode; /* code point that mincode maps to */
		uint32		mincode;	/* first GB18030 code of the range */
		uint32		maxcode;	/* last GB18030 code of the range */
	}			ranges[] =
	{
		{0x0452, 0x8130D330, 0x8136A531},
		{0x2643, 0x8137A839, 0x8138FD38},
		{0x361B, 0x8230A633, 0x8230F237},
		{0x3CE1, 0x8231D438, 0x8232AF32},
		{0x4160, 0x8232C937, 0x8232F837},
		{0x44D7, 0x8233A339, 0x8233C931},
		{0x478E, 0x8233E838, 0x82349638},
		{0x49B8, 0x8234A131, 0x8234E733},
		{0x9FA6, 0x82358F33, 0x8336C738},
		{0xE865, 0x8336D030, 0x84308534},
		{0xFA2A, 0x84309C38, 0x84318537},
		{0xFFE6, 0x8431A234, 0x8431A439},
		{0x10000, 0x90308130, 0xE3329A35}
	};

	const uint32 b0 = (code >> 24) & 0xff;
	const uint32 b1 = (code >> 16) & 0xff;
	const uint32 b2 = (code >> 8) & 0xff;
	const uint32 b3 = code & 0xff;
	unsigned int i;

	/* Reject anything that is not a well-formed 4-byte code. */
	if (b0 < 0x81 || b0 > 0xfe ||
		b1 < 0x30 || b1 > 0x39 ||
		b2 < 0x81 || b2 > 0xfe ||
		b3 < 0x30 || b3 > 0x39)
		return 0;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
	{
		if (code >= ranges[i].mincode && code <= ranges[i].maxcode)
			return unicode_to_utf8word(gb_linear(code) -
									   gb_linear(ranges[i].mincode) +
									   ranges[i].minunicode);
	}

	/* No mapping exists */
	return 0;
}
| 150 | |
| 151 | /* |
| 152 | * Perform mapping of UTF8 ranges to GB18030 |
| 153 | */ |
/*
 * conv_utf8_to_18030: algorithmic mapping from UTF-8 to the 4-byte
 * GB18030 ranges.
 *
 * code is a UTF-8 sequence packed into a word.  It is decoded to a code
 * point, the code point is located in one of the linear ranges below,
 * and its offset within the range is added to the linear index of the
 * range's first GB18030 code.  Returns the packed GB18030 code, or 0 if
 * the code point is in none of the ranges.
 */
static uint32
conv_utf8_to_18030(uint32 code)
{
	/* Linear ranges, in ascending code point order. */
	static const struct
	{
		uint32		minunicode; /* first code point of the range */
		uint32		maxunicode; /* last code point of the range */
		uint32		mincode;	/* GB18030 code that minunicode maps to */
	}			ranges[] =
	{
		{0x0452, 0x200F, 0x8130D330},
		{0x2643, 0x2E80, 0x8137A839},
		{0x361B, 0x3917, 0x8230A633},
		{0x3CE1, 0x4055, 0x8231D438},
		{0x4160, 0x4336, 0x8232C937},
		{0x44D7, 0x464B, 0x8233A339},
		{0x478E, 0x4946, 0x8233E838},
		{0x49B8, 0x4C76, 0x8234A131},
		{0x9FA6, 0xD7FF, 0x82358F33},
		{0xE865, 0xF92B, 0x8336D030},
		{0xFA2A, 0xFE2F, 0x84309C38},
		{0xFFE6, 0xFFFF, 0x8431A234},
		{0x10000, 0x10FFFF, 0x90308130}
	};

	const uint32 ucs = utf8word_to_unicode(code);
	unsigned int i;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
	{
		if (ucs >= ranges[i].minunicode && ucs <= ranges[i].maxunicode)
			return gb_unlinear(ucs - ranges[i].minunicode +
							   gb_linear(ranges[i].mincode));
	}

	/* No mapping exists */
	return 0;
}
| 179 | |
| 180 | /* ---------- |
| 181 | * conv_proc( |
| 182 | * INTEGER, -- source encoding id |
| 183 | * INTEGER, -- destination encoding id |
| 184 | * CSTRING, -- source string (null terminated C string) |
| 185 | * CSTRING, -- destination string (null terminated C string) |
| 186 | * INTEGER -- source string length |
| 187 | * ) returns VOID; |
| 188 | * ---------- |
| 189 | */ |
| 190 | Datum |
| 191 | gb18030_to_utf8(PG_FUNCTION_ARGS) |
| 192 | { |
| 193 | unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); |
| 194 | unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); |
| 195 | int len = PG_GETARG_INT32(4); |
| 196 | |
| 197 | CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8); |
| 198 | |
| 199 | LocalToUtf(src, len, dest, |
| 200 | &gb18030_to_unicode_tree, |
| 201 | NULL, 0, |
| 202 | conv_18030_to_utf8, |
| 203 | PG_GB18030); |
| 204 | |
| 205 | PG_RETURN_VOID(); |
| 206 | } |
| 207 | |
| 208 | Datum |
| 209 | utf8_to_gb18030(PG_FUNCTION_ARGS) |
| 210 | { |
| 211 | unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); |
| 212 | unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); |
| 213 | int len = PG_GETARG_INT32(4); |
| 214 | |
| 215 | CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030); |
| 216 | |
| 217 | UtfToLocal(src, len, dest, |
| 218 | &gb18030_from_unicode_tree, |
| 219 | NULL, 0, |
| 220 | conv_utf8_to_18030, |
| 221 | PG_GB18030); |
| 222 | |
| 223 | PG_RETURN_VOID(); |
| 224 | } |
| 225 | |