1/*
2* Copyright 2020 Google Inc.
3*
4* Use of this source code is governed by a BSD-style license that can be
5* found in the LICENSE file.
6*/
7#include "include/private/SkTFitsIn.h"
8#include "include/private/SkTemplates.h"
9#include "modules/skshaper/src/SkUnicode.h"
10#include "src/utils/SkUTF.h"
11#include <unicode/ubidi.h>
12#include <unicode/ubrk.h>
13#include <unicode/utext.h>
14#include <unicode/utypes.h>
15#include <vector>
16#include <functional>
17
18using ICUBiDi = std::unique_ptr<UBiDi, SkFunctionWrapper<decltype(ubidi_close), ubidi_close>>;
19using ICUUText = std::unique_ptr<UText, SkFunctionWrapper<decltype(utext_close), utext_close>>;
20using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionWrapper<decltype(ubrk_close), ubrk_close>>;
21
22/** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
23static inline SkUnichar utf8_next(const char** ptr, const char* end) {
24 SkUnichar val = SkUTF::NextUTF8(ptr, end);
25 return val < 0 ? 0xFFFD : val;
26}
27
28namespace skia {
29
30class SkUnicode_icu : public SkUnicode {
31
32 static UBreakIteratorType convertType(UBreakType type) {
33 switch (type) {
34 case UBreakType::kLines: return UBRK_LINE;
35 case UBreakType::kGraphemes: return UBRK_CHARACTER;
36 case UBreakType::kWords: return UBRK_WORD;
37 default:
38 SkDEBUGF("Convert error: wrong break type");
39 return UBRK_CHARACTER;
40 }
41 }
42
43 static int convertUtf8ToUtf16(const char* utf8, size_t utf8Units, std::unique_ptr<uint16_t[]>* utf16) {
44 int utf16Units = SkUTF::UTF8ToUTF16(nullptr, 0, utf8, utf8Units);
45 if (utf16Units < 0) {
46 SkDEBUGF("Convert error: Invalid utf8 input");
47 return utf16Units;
48 }
49 *utf16 = std::unique_ptr<uint16_t[]>(new uint16_t[utf16Units]);
50 SkDEBUGCODE(int dstLen =) SkUTF::UTF8ToUTF16(utf16->get(), utf16Units, utf8, utf8Units);
51 SkASSERT(dstLen == utf16Units);
52 return utf16Units;
53 }
54
55 static bool extractBidi(const char utf8[], int utf8Units, Direction dir, std::vector<BidiRegion>* bidiRegions) {
56
57 // Convert to UTF16 since for now bidi iterator only operates on utf16
58 std::unique_ptr<uint16_t[]> utf16;
59 auto utf16Units = convertUtf8ToUtf16(utf8, utf8Units, &utf16);
60 if (utf16Units < 0) {
61 return false;
62 }
63
64 // Create bidi iterator
65 UErrorCode status = U_ZERO_ERROR;
66 ICUBiDi bidi(ubidi_openSized(utf16Units, 0, &status));
67 if (U_FAILURE(status)) {
68 SkDEBUGF("Bidi error: %s", u_errorName(status));
69 return false;
70 }
71 SkASSERT(bidi);
72 uint8_t bidiLevel = (dir == Direction::kLTR) ? UBIDI_LTR : UBIDI_RTL;
73 // The required lifetime of utf16 isn't well documented.
74 // It appears it isn't used after ubidi_setPara except through ubidi_getText.
75 ubidi_setPara(bidi.get(), (const UChar*)utf16.get(), utf16Units, bidiLevel, nullptr, &status);
76 if (U_FAILURE(status)) {
77 SkDEBUGF("Bidi error: %s", u_errorName(status));
78 return false;
79 }
80
81 // Iterate through bidi regions and the result positions into utf8
82 const char* start8 = utf8;
83 const char* end8 = utf8 + utf8Units;
84 BidiLevel currentLevel = 0;
85
86 Position pos8 = 0;
87 Position pos16 = 0;
88 Position end16 = ubidi_getLength(bidi.get());
89 while (pos16 < end16) {
90 auto level = ubidi_getLevelAt(bidi.get(), pos16);
91 if (pos16 == 0) {
92 currentLevel = level;
93 } else if (level != currentLevel) {
94 Position end = start8 - utf8;
95 bidiRegions->emplace_back(pos8, end, currentLevel);
96 currentLevel = level;
97 pos8 = end;
98 }
99 SkUnichar u = utf8_next(&start8, end8);
100 pos16 += SkUTF::ToUTF16(u);
101 }
102 Position end = start8 - utf8;
103 if (end != pos8) {
104 bidiRegions->emplace_back(pos8, end, currentLevel);
105 }
106 return true;
107 }
108
109 static bool extractWords(uint16_t utf16[], int utf16Units, std::vector<Position>* words) {
110
111 UErrorCode status = U_ZERO_ERROR;
112
113 UBreakIteratorType breakType = convertType(UBreakType::kWords);
114 ICUBreakIterator iterator(ubrk_open(breakType, uloc_getDefault(), nullptr, 0, &status));
115 if (U_FAILURE(status)) {
116 SkDEBUGF("Break error: %s", u_errorName(status));
117 return false;
118 }
119 SkASSERT(iterator);
120
121 UText sUtf16UText = UTEXT_INITIALIZER;
122 ICUUText utf16UText(utext_openUChars(&sUtf16UText, (UChar*)utf16, utf16Units, &status));
123 if (U_FAILURE(status)) {
124 SkDEBUGF("Break error: %s", u_errorName(status));
125 return false;
126 }
127
128 ubrk_setUText(iterator.get(), utf16UText.get(), &status);
129 if (U_FAILURE(status)) {
130 SkDEBUGF("Break error: %s", u_errorName(status));
131 return false;
132 }
133
134 // Get the words
135 int32_t pos = ubrk_first(iterator.get());
136 while (pos != UBRK_DONE) {
137 words->emplace_back(pos);
138 pos = ubrk_next(iterator.get());
139 }
140
141 return true;
142 }
143
144 static bool extractPositions(const char utf8[], int utf8Units, UBreakType type, std::function<void(int, int)> add) {
145
146 UErrorCode status = U_ZERO_ERROR;
147 UText sUtf8UText = UTEXT_INITIALIZER;
148 ICUUText text(utext_openUTF8(&sUtf8UText, &utf8[0], utf8Units, &status));
149
150 if (U_FAILURE(status)) {
151 SkDEBUGF("Break error: %s", u_errorName(status));
152 return false;
153 }
154 SkASSERT(text);
155
156 ICUBreakIterator iterator(ubrk_open(convertType(type), uloc_getDefault(), nullptr, 0, &status));
157 if (U_FAILURE(status)) {
158 SkDEBUGF("Break error: %s", u_errorName(status));
159 }
160
161 ubrk_setUText(iterator.get(), text.get(), &status);
162 if (U_FAILURE(status)) {
163 SkDEBUGF("Break error: %s", u_errorName(status));
164 return false;
165 }
166
167 auto iter = iterator.get();
168 int32_t pos = ubrk_first(iter);
169 while (pos != UBRK_DONE) {
170 add(pos, ubrk_getRuleStatus(iter));
171 pos = ubrk_next(iter);
172 }
173 return true;
174 }
175
176 static bool extractWhitespaces(const char utf8[], int utf8Units, std::vector<Position>* whitespaces) {
177
178 const char* start = utf8;
179 const char* end = utf8 + utf8Units;
180 const char* ch = start;
181 while (ch < end) {
182 auto index = ch - start;
183 auto unichar = utf8_next(&ch, end);
184 if (u_isWhitespace(unichar)) {
185 auto ending = ch - start;
186 for (auto k = index; k < ending; ++k) {
187 whitespaces->emplace_back(k);
188 }
189 }
190 }
191 return true;
192 }
193
194public:
195 ~SkUnicode_icu() override { }
196
197 bool getBidiRegions(const char utf8[], int utf8Units, Direction dir, std::vector<BidiRegion>* results) override {
198 return extractBidi(utf8, utf8Units, dir, results);
199 }
200
201 bool getLineBreaks(const char utf8[], int utf8Units, std::vector<LineBreakBefore>* results) override {
202
203 return extractPositions(utf8, utf8Units, UBreakType::kLines,
204 [results](int pos, int status) {
205 results->emplace_back(pos,status == UBRK_LINE_HARD
206 ? LineBreakType::kHardLineBreak
207 : LineBreakType::kSoftLineBreak);
208 });
209 }
210
211 bool getWords(const char utf8[], int utf8Units, std::vector<Position>* results) override {
212
213 // Convert to UTF16 since we want the results in utf16
214 std::unique_ptr<uint16_t[]> utf16;
215 auto utf16Units = convertUtf8ToUtf16(utf8, utf8Units, &utf16);
216 if (utf16Units < 0) {
217 return false;
218 }
219
220 return extractWords(utf16.get(), utf16Units, results);
221 }
222
223 bool getGraphemes(const char utf8[], int utf8Units, std::vector<Position>* results) override {
224
225 return extractPositions(utf8, utf8Units, UBreakType::kGraphemes,
226 [results](int pos, int status) { results->emplace_back(pos);
227 });
228 }
229
230 bool getWhitespaces(const char utf8[], int utf8Units, std::vector<Position>* results) override {
231
232 return extractWhitespaces(utf8, utf8Units, results);
233 }
234
235 void reorderVisual(const BidiLevel runLevels[], int levelsCount, int32_t logicalFromVisual[]) override {
236 ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
237 }
238};
239
240std::unique_ptr<SkUnicode> SkUnicode::Make() { return std::make_unique<SkUnicode_icu>(); }
241
242} // namespace skia
243
244