brktrans.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/brktrans.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2008-2015, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 05/11/2008 Andy Heninger Port from Java
10	**********************************************************************
11	*/
12
13	#include <utility>
14
15	#include "unicode/utypes.h"
16
17	#if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
18
19	#include "unicode/brkiter.h"
20	#include "unicode/localpointer.h"
21	#include "unicode/uchar.h"
22	#include "unicode/unifilt.h"
23	#include "unicode/uniset.h"
24
25	#include "brktrans.h"
26	#include "cmemory.h"
27	#include "mutex.h"
28	#include "uprops.h"
29	#include "uinvchar.h"
30	#include "util.h"
31	#include "uvectr32.h"
32
33	U_NAMESPACE_BEGIN
34
35	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
36
37	static const UChar SPACE = `32`; // ' '
38
39
40	/**
41	* Constructs a transliterator with the default delimiters '{' and
42	* '}'.
43	*/
44	BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
45	Transliterator (UNICODE_STRING("Any-BreakInternal", `17`), adoptedFilter),
46	cachedBI (NULL), cachedBoundaries (NULL), fInsertion (SPACE) {
47	}
48
49
50	/**
51	* Destructor.
52	*/
53	BreakTransliterator::~BreakTransliterator() {
54	}
55
56	/**
57	* Copy constructor.
58	*/
59	BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
60	Transliterator (o), cachedBI (NULL), cachedBoundaries (NULL), fInsertion (o.fInsertion) {
61	}
62
63
64	/**
65	* Transliterator API.
66	*/
67	BreakTransliterator* BreakTransliterator::clone() const {
68	return new BreakTransliterator (*this);
69	}
70
71	/**
72	* Implements {@link Transliterator#handleTransliterate}.
73	*/
74	void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
75	UBool isIncremental ) const {
76
77	UErrorCode status = U_ZERO_ERROR;
78	LocalPointer<BreakIterator> bi;
79	LocalPointer<UVector32> boundaries;
80
81	{
82	Mutex m;
83	BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);
84	boundaries = std::move(nonConstThis->cachedBoundaries);
85	bi = std::move(nonConstThis->cachedBI);
86	}
87	if (bi.isNull()) {
88	bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
89	}
90	if (boundaries.isNull()) {
91	boundaries.adoptInstead(new UVector32 (status));
92	}
93
94	if (bi.isNull() \|\| boundaries.isNull() \|\| U_FAILURE(status)) {
95	return;
96	}
97
98	boundaries ->removeAllElements();
99	UnicodeString sText = replaceableAsString(text);
100	bi ->setText(sText);
101	bi ->preceding(offsets.start);
102
103	// To make things much easier, we will stack the boundaries, and then insert at the end.
104	// generally, we won't need too many, since we will be filtered.
105
106	int32_t boundary;
107	for(boundary = bi ->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi ->next()) {
108	if (boundary == `0`) continue;
109	// HACK: Check to see that preceeding item was a letter
110
111	UChar32 cp = sText.char32At(boundary-`1`);
112	int type = u_charType(cp);
113	//System.out.println(Integer.toString(cp,16) + " (before): " + type);
114	if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == `0`) continue;
115
116	cp = sText.char32At(boundary);
117	type = u_charType(cp);
118	//System.out.println(Integer.toString(cp,16) + " (after): " + type);
119	if ((U_MASK(type) & (U_GC_L_MASK \| U_GC_M_MASK)) == `0`) continue;
120
121	boundaries ->addElement(boundary, status);
122	// printf("Boundary at %d\n", boundary);
123	}
124
125	int delta = `0`;
126	int lastBoundary = `0`;
127
128	if (boundaries ->size() != `0`) { // if we found something, adjust
129	delta = boundaries ->size() * fInsertion.length();
130	lastBoundary = boundaries ->lastElementi();
131
132	// we do this from the end backwards, so that we don't have to keep updating.
133
134	while (boundaries ->size() > `0`) {
135	boundary = boundaries ->popi();
136	text.handleReplaceBetween(boundary, boundary, fInsertion);
137	}
138	}
139
140	// Now fix up the return values
141	offsets.contextLimit += delta;
142	offsets.limit += delta;
143	offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
144
145	// Return break iterator & boundaries vector to the cache.
146	{
147	Mutex m;
148	BreakTransliterator nonConstThis = const_cast<BreakTransliterator >(this);
149	if (nonConstThis->cachedBI.isNull()) {
150	nonConstThis->cachedBI = std::move(bi);
151	}
152	if (nonConstThis->cachedBoundaries.isNull()) {
153	nonConstThis->cachedBoundaries = std::move(boundaries);
154	}
155	}
156
157	// TODO: do something with U_FAILURE(status);
158	// (need to look at transliterators overall, not just here.)
159	}
160
161	//
162	// getInsertion()
163	//
164	const UnicodeString &BreakTransliterator::getInsertion() const {
165	return fInsertion;
166	}
167
168	//
169	// setInsertion()
170	//
171	void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
172	this->fInsertion = insertion;
173	}
174
175	//
176	// replaceableAsString Hack to let break iterators work
177	// on the replaceable text from transliterators.
178	// In practice, the only real Replaceable type that we
179	// will be seeing is UnicodeString, so this function
180	// will normally be efficient.
181	//
182	UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
183	UnicodeString s;
184	UnicodeString rs = dynamic_cast<UnicodeString >(&r);
185	if (rs != NULL) {
186	s = *rs;
187	} else {
188	r.extractBetween(`0`, r.length(), s);
189	}
190	return s;
191	}
192
193	U_NAMESPACE_END
194
#endif /* #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION */
196

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/brktrans.cpp