| 1 | /*------------------------------------------------------------------------- |
| 2 | * |
| 3 | * EUC_JIS_2004, SHIFT_JIS_2004 |
| 4 | * |
| 5 | * Copyright (c) 2007-2019, PostgreSQL Global Development Group |
| 6 | * |
| 7 | * IDENTIFICATION |
| 8 | * src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c |
| 9 | * |
| 10 | *------------------------------------------------------------------------- |
| 11 | */ |
| 12 | |
| 13 | #include "postgres.h" |
| 14 | #include "fmgr.h" |
| 15 | #include "mb/pg_wchar.h" |
| 16 | |
PG_MODULE_MAGIC;

/* Entry points taking PG_FUNCTION_ARGS; both are defined later in this file. */
PG_FUNCTION_INFO_V1(euc_jis_2004_to_shift_jis_2004);
PG_FUNCTION_INFO_V1(shift_jis_2004_to_euc_jis_2004);

/*
 * Forward declarations of the byte-level converters that the entry points
 * call.  Each takes the source bytes, the destination buffer and the source
 * length in bytes.
 */
static void euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len);
static void shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len);
| 24 | |
| 25 | /* ---------- |
| 26 | * conv_proc( |
| 27 | * INTEGER, -- source encoding id |
| 28 | * INTEGER, -- destination encoding id |
| 29 | * CSTRING, -- source string (null terminated C string) |
| 30 | * CSTRING, -- destination string (null terminated C string) |
| 31 | * INTEGER -- source string length |
| 32 | * ) returns VOID; |
| 33 | * ---------- |
| 34 | */ |
| 35 | |
| 36 | Datum |
| 37 | euc_jis_2004_to_shift_jis_2004(PG_FUNCTION_ARGS) |
| 38 | { |
| 39 | unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); |
| 40 | unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); |
| 41 | int len = PG_GETARG_INT32(4); |
| 42 | |
| 43 | CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JIS_2004, PG_SHIFT_JIS_2004); |
| 44 | |
| 45 | euc_jis_20042shift_jis_2004(src, dest, len); |
| 46 | |
| 47 | PG_RETURN_VOID(); |
| 48 | } |
| 49 | |
| 50 | Datum |
| 51 | shift_jis_2004_to_euc_jis_2004(PG_FUNCTION_ARGS) |
| 52 | { |
| 53 | unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); |
| 54 | unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); |
| 55 | int len = PG_GETARG_INT32(4); |
| 56 | |
| 57 | CHECK_ENCODING_CONVERSION_ARGS(PG_SHIFT_JIS_2004, PG_EUC_JIS_2004); |
| 58 | |
| 59 | shift_jis_20042euc_jis_2004(src, dest, len); |
| 60 | |
| 61 | PG_RETURN_VOID(); |
| 62 | } |
| 63 | |
| 64 | /* |
| 65 | * EUC_JIS_2004 -> SHIFT_JIS_2004 |
| 66 | */ |
| 67 | static void |
| 68 | euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) |
| 69 | { |
| 70 | int c1, |
| 71 | ku, |
| 72 | ten; |
| 73 | int l; |
| 74 | |
| 75 | while (len > 0) |
| 76 | { |
| 77 | c1 = *euc; |
| 78 | if (!IS_HIGHBIT_SET(c1)) |
| 79 | { |
| 80 | /* ASCII */ |
| 81 | if (c1 == 0) |
| 82 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 83 | (const char *) euc, len); |
| 84 | *p++ = c1; |
| 85 | euc++; |
| 86 | len--; |
| 87 | continue; |
| 88 | } |
| 89 | |
| 90 | l = pg_encoding_verifymb(PG_EUC_JIS_2004, (const char *) euc, len); |
| 91 | |
| 92 | if (l < 0) |
| 93 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 94 | (const char *) euc, len); |
| 95 | |
| 96 | if (c1 == SS2 && l == 2) /* JIS X 0201 kana? */ |
| 97 | { |
| 98 | *p++ = euc[1]; |
| 99 | } |
| 100 | else if (c1 == SS3 && l == 3) /* JIS X 0213 plane 2? */ |
| 101 | { |
| 102 | ku = euc[1] - 0xa0; |
| 103 | ten = euc[2] - 0xa0; |
| 104 | |
| 105 | switch (ku) |
| 106 | { |
| 107 | case 1: |
| 108 | case 3: |
| 109 | case 4: |
| 110 | case 5: |
| 111 | case 8: |
| 112 | case 12: |
| 113 | case 13: |
| 114 | case 14: |
| 115 | case 15: |
| 116 | *p++ = ((ku + 0x1df) >> 1) - (ku >> 3) * 3; |
| 117 | break; |
| 118 | default: |
| 119 | if (ku >= 78 && ku <= 94) |
| 120 | { |
| 121 | *p++ = (ku + 0x19b) >> 1; |
| 122 | } |
| 123 | else |
| 124 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 125 | (const char *) euc, len); |
| 126 | } |
| 127 | |
| 128 | if (ku % 2) |
| 129 | { |
| 130 | if (ten >= 1 && ten <= 63) |
| 131 | *p++ = ten + 0x3f; |
| 132 | else if (ten >= 64 && ten <= 94) |
| 133 | *p++ = ten + 0x40; |
| 134 | else |
| 135 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 136 | (const char *) euc, len); |
| 137 | } |
| 138 | else |
| 139 | *p++ = ten + 0x9e; |
| 140 | } |
| 141 | |
| 142 | else if (l == 2) /* JIS X 0213 plane 1? */ |
| 143 | { |
| 144 | ku = c1 - 0xa0; |
| 145 | ten = euc[1] - 0xa0; |
| 146 | |
| 147 | if (ku >= 1 && ku <= 62) |
| 148 | *p++ = (ku + 0x101) >> 1; |
| 149 | else if (ku >= 63 && ku <= 94) |
| 150 | *p++ = (ku + 0x181) >> 1; |
| 151 | else |
| 152 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 153 | (const char *) euc, len); |
| 154 | |
| 155 | if (ku % 2) |
| 156 | { |
| 157 | if (ten >= 1 && ten <= 63) |
| 158 | *p++ = ten + 0x3f; |
| 159 | else if (ten >= 64 && ten <= 94) |
| 160 | *p++ = ten + 0x40; |
| 161 | else |
| 162 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 163 | (const char *) euc, len); |
| 164 | } |
| 165 | else |
| 166 | *p++ = ten + 0x9e; |
| 167 | } |
| 168 | else |
| 169 | report_invalid_encoding(PG_EUC_JIS_2004, |
| 170 | (const char *) euc, len); |
| 171 | |
| 172 | euc += l; |
| 173 | len -= l; |
| 174 | } |
| 175 | *p = '\0'; |
| 176 | } |
| 177 | |
/*
 * Decode the second byte of a two-byte SHIFT_JIS_2004 character.
 *
 * Returns the "ten" number (1..94) encoded by second byte b, or -1 if b is
 * not a valid second byte (below 0x40, equal to 0x7f, or above 0xfc).
 *
 * *ku receives the parity of the "ku" that b implies:
 *		*ku = 1: "ku" is odd	(b in 0x40..0x7e or 0x80..0x9e)
 *		*ku = 0: "ku" is even	(b in 0x9f..0xfc)
 * On error *ku is set to 0.
 */
static int
get_ten(int b, int *ku)
{
	/* reject everything that cannot be a second byte */
	if (b < 0x40 || b == 0x7f || b > 0xfc)
	{
		*ku = 0;
		return -1;
	}

	/* upper range: even "ku", ten 1..94 */
	if (b >= 0x9f)
	{
		*ku = 0;
		return b - 0x9e;
	}

	/*
	 * Lower range: odd "ku".  The range skips 0x7f, so bytes above it are
	 * shifted by one more than bytes below it.
	 */
	*ku = 1;
	return b - (b < 0x7f ? 0x3f : 0x40);
}
| 210 | |
| 211 | /* |
| 212 | * SHIFT_JIS_2004 ---> EUC_JIS_2004 |
| 213 | */ |
| 214 | |
| 215 | static void |
| 216 | shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len) |
| 217 | { |
| 218 | int c1; |
| 219 | int ku, |
| 220 | ten, |
| 221 | kubun; |
| 222 | int plane; |
| 223 | int l; |
| 224 | |
| 225 | while (len > 0) |
| 226 | { |
| 227 | c1 = *sjis; |
| 228 | |
| 229 | if (!IS_HIGHBIT_SET(c1)) |
| 230 | { |
| 231 | /* ASCII */ |
| 232 | if (c1 == 0) |
| 233 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 234 | (const char *) sjis, len); |
| 235 | *p++ = c1; |
| 236 | sjis++; |
| 237 | len--; |
| 238 | continue; |
| 239 | } |
| 240 | |
| 241 | l = pg_encoding_verifymb(PG_SHIFT_JIS_2004, (const char *) sjis, len); |
| 242 | |
| 243 | if (l < 0 || l > len) |
| 244 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 245 | (const char *) sjis, len); |
| 246 | |
| 247 | if (c1 >= 0xa1 && c1 <= 0xdf && l == 1) |
| 248 | { |
| 249 | /* JIS X0201 (1 byte kana) */ |
| 250 | *p++ = SS2; |
| 251 | *p++ = c1; |
| 252 | } |
| 253 | else if (l == 2) |
| 254 | { |
| 255 | int c2 = sjis[1]; |
| 256 | |
| 257 | plane = 1; |
| 258 | ku = 1; |
| 259 | ten = 1; |
| 260 | |
| 261 | /* |
| 262 | * JIS X 0213 |
| 263 | */ |
| 264 | if (c1 >= 0x81 && c1 <= 0x9f) /* plane 1 1ku-62ku */ |
| 265 | { |
| 266 | ku = (c1 << 1) - 0x100; |
| 267 | ten = get_ten(c2, &kubun); |
| 268 | if (ten < 0) |
| 269 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 270 | (const char *) sjis, len); |
| 271 | ku -= kubun; |
| 272 | } |
| 273 | else if (c1 >= 0xe0 && c1 <= 0xef) /* plane 1 62ku-94ku */ |
| 274 | { |
| 275 | ku = (c1 << 1) - 0x180; |
| 276 | ten = get_ten(c2, &kubun); |
| 277 | if (ten < 0) |
| 278 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 279 | |
| 280 | (const char *) sjis, len); |
| 281 | ku -= kubun; |
| 282 | } |
| 283 | else if (c1 >= 0xf0 && c1 <= 0xf3) /* plane 2 |
| 284 | * 1,3,4,5,8,12,13,14,15 ku */ |
| 285 | { |
| 286 | plane = 2; |
| 287 | ten = get_ten(c2, &kubun); |
| 288 | if (ten < 0) |
| 289 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 290 | (const char *) sjis, len); |
| 291 | switch (c1) |
| 292 | { |
| 293 | case 0xf0: |
| 294 | ku = kubun == 0 ? 8 : 1; |
| 295 | break; |
| 296 | case 0xf1: |
| 297 | ku = kubun == 0 ? 4 : 3; |
| 298 | break; |
| 299 | case 0xf2: |
| 300 | ku = kubun == 0 ? 12 : 5; |
| 301 | break; |
| 302 | default: |
| 303 | ku = kubun == 0 ? 14 : 13; |
| 304 | break; |
| 305 | } |
| 306 | } |
| 307 | else if (c1 >= 0xf4 && c1 <= 0xfc) /* plane 2 78-94ku */ |
| 308 | { |
| 309 | plane = 2; |
| 310 | ten = get_ten(c2, &kubun); |
| 311 | if (ten < 0) |
| 312 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 313 | (const char *) sjis, len); |
| 314 | if (c1 == 0xf4 && kubun == 1) |
| 315 | ku = 15; |
| 316 | else |
| 317 | ku = (c1 << 1) - 0x19a - kubun; |
| 318 | } |
| 319 | else |
| 320 | report_invalid_encoding(PG_SHIFT_JIS_2004, |
| 321 | (const char *) sjis, len); |
| 322 | |
| 323 | if (plane == 2) |
| 324 | *p++ = SS3; |
| 325 | |
| 326 | *p++ = ku + 0xa0; |
| 327 | *p++ = ten + 0xa0; |
| 328 | } |
| 329 | sjis += l; |
| 330 | len -= l; |
| 331 | } |
| 332 | *p = '\0'; |
| 333 | } |
| 334 | |