1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ********************************************************************** |
5 | * Copyright (C) 2005-2013, International Business Machines |
6 | * Corporation and others. All Rights Reserved. |
7 | ********************************************************************** |
8 | */ |
9 | |
10 | #include "unicode/utypes.h" |
11 | |
12 | #if !UCONFIG_NO_CONVERSION |
13 | |
14 | #include "csrucode.h" |
15 | #include "csmatch.h" |
16 | |
17 | U_NAMESPACE_BEGIN |
18 | |
19 | CharsetRecog_Unicode::~CharsetRecog_Unicode() |
20 | { |
21 | // nothing to do |
22 | } |
23 | |
24 | CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE() |
25 | { |
26 | // nothing to do |
27 | } |
28 | |
29 | const char *CharsetRecog_UTF_16_BE::getName() const |
30 | { |
31 | return "UTF-16BE" ; |
32 | } |
33 | |
34 | // UTF-16 confidence calculation. Very simple minded, but better than nothing. |
35 | // Any 8 bit non-control characters bump the confidence up. These have a zero high byte, |
36 | // and are very likely to be UTF-16, although they could also be part of a UTF-32 code. |
37 | // NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. |
38 | // NULs should be rare in actual text. |
39 | |
40 | static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { |
41 | if (codeUnit == 0) { |
42 | confidence -= 10; |
43 | } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { |
44 | confidence += 10; |
45 | } |
46 | if (confidence < 0) { |
47 | confidence = 0; |
48 | } else if (confidence > 100) { |
49 | confidence = 100; |
50 | } |
51 | return confidence; |
52 | } |
53 | |
54 | |
55 | UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const |
56 | { |
57 | const uint8_t *input = textIn->fRawInput; |
58 | int32_t confidence = 10; |
59 | int32_t length = textIn->fRawLength; |
60 | |
61 | int32_t bytesToCheck = (length > 30) ? 30 : length; |
62 | for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
63 | UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1]; |
64 | if (charIndex == 0 && codeUnit == 0xFEFF) { |
65 | confidence = 100; |
66 | break; |
67 | } |
68 | confidence = adjustConfidence(codeUnit, confidence); |
69 | if (confidence == 0 || confidence == 100) { |
70 | break; |
71 | } |
72 | } |
73 | if (bytesToCheck < 4 && confidence < 100) { |
74 | confidence = 0; |
75 | } |
76 | results->set(textIn, this, confidence); |
77 | return (confidence > 0); |
78 | } |
79 | |
80 | CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE() |
81 | { |
82 | // nothing to do |
83 | } |
84 | |
85 | const char *CharsetRecog_UTF_16_LE::getName() const |
86 | { |
87 | return "UTF-16LE" ; |
88 | } |
89 | |
90 | UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const |
91 | { |
92 | const uint8_t *input = textIn->fRawInput; |
93 | int32_t confidence = 10; |
94 | int32_t length = textIn->fRawLength; |
95 | |
96 | int32_t bytesToCheck = (length > 30) ? 30 : length; |
97 | for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) { |
98 | UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8); |
99 | if (charIndex == 0 && codeUnit == 0xFEFF) { |
100 | confidence = 100; // UTF-16 BOM |
101 | if (length >= 4 && input[2] == 0 && input[3] == 0) { |
102 | confidence = 0; // UTF-32 BOM |
103 | } |
104 | break; |
105 | } |
106 | confidence = adjustConfidence(codeUnit, confidence); |
107 | if (confidence == 0 || confidence == 100) { |
108 | break; |
109 | } |
110 | } |
111 | if (bytesToCheck < 4 && confidence < 100) { |
112 | confidence = 0; |
113 | } |
114 | results->set(textIn, this, confidence); |
115 | return (confidence > 0); |
116 | } |
117 | |
118 | CharsetRecog_UTF_32::~CharsetRecog_UTF_32() |
119 | { |
120 | // nothing to do |
121 | } |
122 | |
123 | UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const |
124 | { |
125 | const uint8_t *input = textIn->fRawInput; |
126 | int32_t limit = (textIn->fRawLength / 4) * 4; |
127 | int32_t numValid = 0; |
128 | int32_t numInvalid = 0; |
129 | bool hasBOM = FALSE; |
130 | int32_t confidence = 0; |
131 | |
132 | if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) { |
133 | hasBOM = TRUE; |
134 | } |
135 | |
136 | for(int32_t i = 0; i < limit; i += 4) { |
137 | int32_t ch = getChar(input, i); |
138 | |
139 | if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) { |
140 | numInvalid += 1; |
141 | } else { |
142 | numValid += 1; |
143 | } |
144 | } |
145 | |
146 | |
147 | // Cook up some sort of confidence score, based on presense of a BOM |
148 | // and the existence of valid and/or invalid multi-byte sequences. |
149 | if (hasBOM && numInvalid==0) { |
150 | confidence = 100; |
151 | } else if (hasBOM && numValid > numInvalid*10) { |
152 | confidence = 80; |
153 | } else if (numValid > 3 && numInvalid == 0) { |
154 | confidence = 100; |
155 | } else if (numValid > 0 && numInvalid == 0) { |
156 | confidence = 80; |
157 | } else if (numValid > numInvalid*10) { |
158 | // Probably corruput UTF-32BE data. Valid sequences aren't likely by chance. |
159 | confidence = 25; |
160 | } |
161 | |
162 | results->set(textIn, this, confidence); |
163 | return (confidence > 0); |
164 | } |
165 | |
166 | CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE() |
167 | { |
168 | // nothing to do |
169 | } |
170 | |
171 | const char *CharsetRecog_UTF_32_BE::getName() const |
172 | { |
173 | return "UTF-32BE" ; |
174 | } |
175 | |
176 | int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const |
177 | { |
178 | return input[index + 0] << 24 | input[index + 1] << 16 | |
179 | input[index + 2] << 8 | input[index + 3]; |
180 | } |
181 | |
182 | CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE() |
183 | { |
184 | // nothing to do |
185 | } |
186 | |
187 | const char *CharsetRecog_UTF_32_LE::getName() const |
188 | { |
189 | return "UTF-32LE" ; |
190 | } |
191 | |
192 | int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const |
193 | { |
194 | return input[index + 3] << 24 | input[index + 2] << 16 | |
195 | input[index + 1] << 8 | input[index + 0]; |
196 | } |
197 | |
198 | U_NAMESPACE_END |
199 | #endif |
200 | |
201 | |