1// Scintilla source code edit control
2/** @file WordList.cxx
3 ** Hold a list of words.
4 **/
5// Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6// The License.txt file describes the conditions under which this software may be distributed.
7
8#include <cstdlib>
9#include <cassert>
10#include <cstring>
11
12#include <algorithm>
13#include <iterator>
14#include <memory>
15
16#include "WordList.h"
17
18using namespace Lexilla;
19
20namespace {
21
22/**
23 * Creates an array that points into each word in the string and puts \0 terminators
24 * after each word.
25 */
26std::unique_ptr<char *[]> ArrayFromWordList(char *wordlist, size_t slen, size_t *len, bool onlyLineEnds = false) {
27 size_t words = 0;
28 // For rapid determination of whether a character is a separator, build
29 // a look up table.
30 bool wordSeparator[256] = {}; // Initialise all to false.
31 wordSeparator[static_cast<unsigned int>('\r')] = true;
32 wordSeparator[static_cast<unsigned int>('\n')] = true;
33 if (!onlyLineEnds) {
34 wordSeparator[static_cast<unsigned int>(' ')] = true;
35 wordSeparator[static_cast<unsigned int>('\t')] = true;
36 }
37 unsigned char prev = '\n';
38 for (int j = 0; wordlist[j]; j++) {
39 const unsigned char curr = wordlist[j];
40 if (!wordSeparator[curr] && wordSeparator[prev])
41 words++;
42 prev = curr;
43 }
44 std::unique_ptr<char *[]> keywords = std::make_unique<char *[]>(words + 1);
45 size_t wordsStore = 0;
46 if (words) {
47 unsigned char previous = '\0';
48 for (size_t k = 0; k < slen; k++) {
49 if (!wordSeparator[static_cast<unsigned char>(wordlist[k])]) {
50 if (!previous) {
51 keywords[wordsStore] = &wordlist[k];
52 wordsStore++;
53 }
54 } else {
55 wordlist[k] = '\0';
56 }
57 previous = wordlist[k];
58 }
59 }
60 assert(wordsStore < (words + 1));
61 keywords[wordsStore] = &wordlist[slen];
62 *len = wordsStore;
63 return keywords;
64}
65
66bool cmpWords(const char *a, const char *b) noexcept {
67 return strcmp(a, b) < 0;
68}
69
70}
71
72WordList::WordList(bool onlyLineEnds_) noexcept :
73 words(nullptr), list(nullptr), len(0), onlyLineEnds(onlyLineEnds_) {
74 // Prevent warnings by static analyzers about uninitialized starts.
75 starts[0] = -1;
76}
77
78WordList::~WordList() {
79 Clear();
80}
81
82WordList::operator bool() const noexcept {
83 return len != 0;
84}
85
86bool WordList::operator!=(const WordList &other) const noexcept {
87 if (len != other.len)
88 return true;
89 for (size_t i=0; i<len; i++) {
90 if (strcmp(words[i], other.words[i]) != 0)
91 return true;
92 }
93 return false;
94}
95
96int WordList::Length() const noexcept {
97 return static_cast<int>(len);
98}
99
100void WordList::Clear() noexcept {
101 delete []list;
102 list = nullptr;
103 delete []words;
104 words = nullptr;
105 len = 0;
106}
107
108bool WordList::Set(const char *s) {
109 const size_t lenS = strlen(s) + 1;
110 std::unique_ptr<char[]> listTemp = std::make_unique<char[]>(lenS);
111 memcpy(listTemp.get(), s, lenS);
112 size_t lenTemp = 0;
113 std::unique_ptr<char *[]> wordsTemp = ArrayFromWordList(listTemp.get(), lenS - 1, &lenTemp, onlyLineEnds);
114 std::sort(wordsTemp.get(), wordsTemp.get() + lenTemp, cmpWords);
115
116 if (lenTemp == len) {
117 bool changed = false;
118 for (size_t i = 0; i < lenTemp; i++) {
119 if (strcmp(words[i], wordsTemp[i]) != 0) {
120 changed = true;
121 break;
122 }
123 }
124 if (!changed) {
125 return false;
126 }
127 }
128
129 Clear();
130 words = wordsTemp.release();
131 list = listTemp.release();
132 len = lenTemp;
133 std::fill(starts, std::end(starts), -1);
134 for (int l = static_cast<int>(len - 1); l >= 0; l--) {
135 unsigned char indexChar = words[l][0];
136 starts[indexChar] = l;
137 }
138 return true;
139}
140
141/** Check whether a string is in the list.
142 * List elements are either exact matches or prefixes.
143 * Prefix elements start with '^' and match all strings that start with the rest of the element
144 * so '^GTK_' matches 'GTK_X', 'GTK_MAJOR_VERSION', and 'GTK_'.
145 */
146bool WordList::InList(const char *s) const noexcept {
147 if (!words)
148 return false;
149 const unsigned char firstChar = s[0];
150 int j = starts[firstChar];
151 if (j >= 0) {
152 while (words[j][0] == firstChar) {
153 if (s[1] == words[j][1]) {
154 const char *a = words[j] + 1;
155 const char *b = s + 1;
156 while (*a && *a == *b) {
157 a++;
158 b++;
159 }
160 if (!*a && !*b)
161 return true;
162 }
163 j++;
164 }
165 }
166 j = starts[static_cast<unsigned int>('^')];
167 if (j >= 0) {
168 while (words[j][0] == '^') {
169 const char *a = words[j] + 1;
170 const char *b = s;
171 while (*a && *a == *b) {
172 a++;
173 b++;
174 }
175 if (!*a)
176 return true;
177 j++;
178 }
179 }
180 return false;
181}
182
183/** similar to InList, but word s can be a substring of keyword.
184 * eg. the keyword define is defined as def~ine. This means the word must start
185 * with def to be a keyword, but also defi, defin and define are valid.
186 * The marker is ~ in this case.
187 */
188bool WordList::InListAbbreviated(const char *s, const char marker) const noexcept {
189 if (!words)
190 return false;
191 const unsigned char firstChar = s[0];
192 int j = starts[firstChar];
193 if (j >= 0) {
194 while (words[j][0] == firstChar) {
195 bool isSubword = false;
196 int start = 1;
197 if (words[j][1] == marker) {
198 isSubword = true;
199 start++;
200 }
201 if (s[1] == words[j][start]) {
202 const char *a = words[j] + start;
203 const char *b = s + 1;
204 while (*a && *a == *b) {
205 a++;
206 if (*a == marker) {
207 isSubword = true;
208 a++;
209 }
210 b++;
211 }
212 if ((!*a || isSubword) && !*b)
213 return true;
214 }
215 j++;
216 }
217 }
218 j = starts[static_cast<unsigned int>('^')];
219 if (j >= 0) {
220 while (words[j][0] == '^') {
221 const char *a = words[j] + 1;
222 const char *b = s;
223 while (*a && *a == *b) {
224 a++;
225 b++;
226 }
227 if (!*a)
228 return true;
229 j++;
230 }
231 }
232 return false;
233}
234
235/** similar to InListAbbreviated, but word s can be a abridged version of a keyword.
236* eg. the keyword is defined as "after.~:". This means the word must have a prefix (begins with) of
237* "after." and suffix (ends with) of ":" to be a keyword, Hence "after.field:" , "after.form.item:" are valid.
238* Similarly "~.is.valid" keyword is suffix only... hence "field.is.valid" , "form.is.valid" are valid.
239* The marker is ~ in this case.
240* No multiple markers check is done and wont work.
241*/
242bool WordList::InListAbridged(const char *s, const char marker) const noexcept {
243 if (!words)
244 return false;
245 const unsigned char firstChar = s[0];
246 int j = starts[firstChar];
247 if (j >= 0) {
248 while (words[j][0] == firstChar) {
249 const char *a = words[j];
250 const char *b = s;
251 while (*a && *a == *b) {
252 a++;
253 if (*a == marker) {
254 a++;
255 const size_t suffixLengthA = strlen(a);
256 const size_t suffixLengthB = strlen(b);
257 if (suffixLengthA >= suffixLengthB)
258 break;
259 b = b + suffixLengthB - suffixLengthA - 1;
260 }
261 b++;
262 }
263 if (!*a && !*b)
264 return true;
265 j++;
266 }
267 }
268
269 j = starts[static_cast<unsigned int>(marker)];
270 if (j >= 0) {
271 while (words[j][0] == marker) {
272 const char *a = words[j] + 1;
273 const char *b = s;
274 const size_t suffixLengthA = strlen(a);
275 const size_t suffixLengthB = strlen(b);
276 if (suffixLengthA > suffixLengthB) {
277 j++;
278 continue;
279 }
280 b = b + suffixLengthB - suffixLengthA;
281
282 while (*a && *a == *b) {
283 a++;
284 b++;
285 }
286 if (!*a && !*b)
287 return true;
288 j++;
289 }
290 }
291
292 return false;
293}
294
295const char *WordList::WordAt(int n) const noexcept {
296 return words[n];
297}
298
299