1 | // |
// RegularExpression.cpp
3 | // |
4 | // Library: Foundation |
5 | // Package: RegExp |
6 | // Module: RegularExpression |
7 | // |
8 | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
9 | // and Contributors. |
10 | // |
11 | // SPDX-License-Identifier: BSL-1.0 |
12 | // |
13 | |
14 | |
15 | #include "Poco/RegularExpression.h" |
16 | #include "Poco/Exception.h" |
17 | #include <sstream> |
18 | #if defined(POCO_UNBUNDLED) |
19 | #include <pcre.h> |
20 | #else |
21 | #include "pcre_config.h" |
22 | #include "pcre.h" |
23 | #endif |
24 | |
25 | |
26 | namespace Poco { |
27 | |
28 | |
// Number of int slots in the offset vector that the match functions in this
// file hand to pcre_exec(). According to the PCRE API documentation the size
// should be a multiple of 3: the first two thirds receive (start, end) offset
// pairs and the last third is used by PCRE as workspace.
const int RegularExpression::OVEC_SIZE = 126; // must be multiple of 3
30 | |
31 | |
32 | RegularExpression::RegularExpression(const std::string& pattern, int options, bool study): _pcre(0), _extra(0) |
33 | { |
34 | const char* error; |
35 | int offs; |
36 | unsigned nmcount; |
37 | unsigned nmentrysz; |
38 | unsigned char* nmtbl; |
39 | |
40 | _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0); |
41 | if (!_pcre) |
42 | { |
43 | std::ostringstream msg; |
44 | msg << error << " (at offset " << offs << ")" ; |
45 | throw RegularExpressionException(msg.str()); |
46 | } |
47 | if (study) |
48 | _extra = pcre_study(reinterpret_cast<pcre*>(_pcre), 0, &error); |
49 | |
50 | pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMECOUNT, &nmcount); |
51 | pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMEENTRYSIZE, &nmentrysz); |
52 | pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMETABLE, &nmtbl); |
53 | |
54 | for (int i = 0; i < nmcount; i++) |
55 | { |
56 | unsigned char* group = nmtbl + 2 + (nmentrysz * i); |
57 | int n = pcre_get_stringnumber(reinterpret_cast<const pcre*>(_pcre), (char*) group); |
58 | _groups[n] = std::string((char*) group); |
59 | } |
60 | } |
61 | |
62 | |
63 | RegularExpression::~RegularExpression() |
64 | { |
65 | if (_pcre) pcre_free(reinterpret_cast<pcre*>(_pcre)); |
66 | if (_extra) pcre_free(reinterpret_cast<struct pcre_extra*>(_extra)); |
67 | } |
68 | |
69 | |
70 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const |
71 | { |
72 | poco_assert (offset <= subject.length()); |
73 | |
74 | int ovec[OVEC_SIZE]; |
75 | int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
76 | if (rc == PCRE_ERROR_NOMATCH) |
77 | { |
78 | mtch.offset = std::string::npos; |
79 | mtch.length = 0; |
80 | return 0; |
81 | } |
82 | else if (rc == PCRE_ERROR_BADOPTION) |
83 | { |
84 | throw RegularExpressionException("bad option" ); |
85 | } |
86 | else if (rc == 0) |
87 | { |
88 | throw RegularExpressionException("too many captured substrings" ); |
89 | } |
90 | else if (rc < 0) |
91 | { |
92 | std::ostringstream msg; |
93 | msg << "PCRE error " << rc; |
94 | throw RegularExpressionException(msg.str()); |
95 | } |
96 | mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0]; |
97 | mtch.length = ovec[1] - mtch.offset; |
98 | return rc; |
99 | } |
100 | |
101 | |
102 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const |
103 | { |
104 | poco_assert (offset <= subject.length()); |
105 | |
106 | matches.clear(); |
107 | |
108 | int ovec[OVEC_SIZE]; |
109 | int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
110 | if (rc == PCRE_ERROR_NOMATCH) |
111 | { |
112 | return 0; |
113 | } |
114 | else if (rc == PCRE_ERROR_BADOPTION) |
115 | { |
116 | throw RegularExpressionException("bad option" ); |
117 | } |
118 | else if (rc == 0) |
119 | { |
120 | throw RegularExpressionException("too many captured substrings" ); |
121 | } |
122 | else if (rc < 0) |
123 | { |
124 | std::ostringstream msg; |
125 | msg << "PCRE error " << rc; |
126 | throw RegularExpressionException(msg.str()); |
127 | } |
128 | matches.reserve(rc); |
129 | for (int i = 0; i < rc; ++i) |
130 | { |
131 | Match m; |
132 | GroupMap::const_iterator it; |
133 | |
134 | m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ; |
135 | m.length = ovec[i*2 + 1] - m.offset; |
136 | |
137 | it = _groups.find(i); |
138 | if (it != _groups.end()) |
139 | { |
140 | m.name = (*it).second; |
141 | } |
142 | |
143 | matches.push_back(m); |
144 | } |
145 | return rc; |
146 | } |
147 | |
148 | |
149 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const |
150 | { |
151 | Match mtch; |
152 | match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY); |
153 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
154 | } |
155 | |
156 | |
157 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const |
158 | { |
159 | Match mtch; |
160 | match(subject, offset, mtch, options); |
161 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
162 | } |
163 | |
164 | |
165 | int RegularExpression::(const std::string& subject, std::string& str, int options) const |
166 | { |
167 | Match mtch; |
168 | int rc = match(subject, 0, mtch, options); |
169 | if (mtch.offset != std::string::npos) |
170 | str.assign(subject, mtch.offset, mtch.length); |
171 | else |
172 | str.clear(); |
173 | return rc; |
174 | } |
175 | |
176 | |
177 | int RegularExpression::(const std::string& subject, std::string::size_type offset, std::string& str, int options) const |
178 | { |
179 | Match mtch; |
180 | int rc = match(subject, offset, mtch, options); |
181 | if (mtch.offset != std::string::npos) |
182 | str.assign(subject, mtch.offset, mtch.length); |
183 | else |
184 | str.clear(); |
185 | return rc; |
186 | } |
187 | |
188 | |
189 | int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const |
190 | { |
191 | MatchVec matches; |
192 | strings.clear(); |
193 | int rc = match(subject, offset, matches, options); |
194 | strings.reserve(matches.size()); |
195 | for (MatchVec::const_iterator it = matches.begin(); it != matches.end(); ++it) |
196 | { |
197 | if (it->offset != std::string::npos) |
198 | strings.push_back(subject.substr(it->offset, it->length)); |
199 | else |
200 | strings.push_back(std::string()); |
201 | } |
202 | return rc; |
203 | } |
204 | |
205 | |
206 | int RegularExpression::(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
207 | { |
208 | if (options & RE_GLOBAL) |
209 | { |
210 | int rc = 0; |
211 | std::string::size_type pos = substOne(subject, offset, replacement, options); |
212 | while (pos != std::string::npos) |
213 | { |
214 | ++rc; |
215 | pos = substOne(subject, pos, replacement, options); |
216 | } |
217 | return rc; |
218 | } |
219 | else |
220 | { |
221 | return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0; |
222 | } |
223 | } |
224 | |
225 | |
226 | std::string::size_type RegularExpression::(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
227 | { |
228 | if (offset >= subject.length()) return std::string::npos; |
229 | |
230 | int ovec[OVEC_SIZE]; |
231 | int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
232 | if (rc == PCRE_ERROR_NOMATCH) |
233 | { |
234 | return std::string::npos; |
235 | } |
236 | else if (rc == PCRE_ERROR_BADOPTION) |
237 | { |
238 | throw RegularExpressionException("bad option" ); |
239 | } |
240 | else if (rc == 0) |
241 | { |
242 | throw RegularExpressionException("too many captured substrings" ); |
243 | } |
244 | else if (rc < 0) |
245 | { |
246 | std::ostringstream msg; |
247 | msg << "PCRE error " << rc; |
248 | throw RegularExpressionException(msg.str()); |
249 | } |
250 | std::string result; |
251 | std::string::size_type len = subject.length(); |
252 | std::string::size_type pos = 0; |
253 | std::string::size_type rp = std::string::npos; |
254 | while (pos < len) |
255 | { |
256 | if (ovec[0] == pos) |
257 | { |
258 | std::string::const_iterator it = replacement.begin(); |
259 | std::string::const_iterator end = replacement.end(); |
260 | while (it != end) |
261 | { |
262 | if (*it == '$' && !(options & RE_NO_VARS)) |
263 | { |
264 | ++it; |
265 | if (it != end) |
266 | { |
267 | char d = *it; |
268 | if (d >= '0' && d <= '9') |
269 | { |
270 | int c = d - '0'; |
271 | if (c < rc) |
272 | { |
273 | int o = ovec[c*2]; |
274 | int l = ovec[c*2 + 1] - o; |
275 | result.append(subject, o, l); |
276 | } |
277 | } |
278 | else |
279 | { |
280 | result += '$'; |
281 | result += d; |
282 | } |
283 | ++it; |
284 | } |
285 | else result += '$'; |
286 | } |
287 | else result += *it++; |
288 | } |
289 | pos = ovec[1]; |
290 | rp = result.length(); |
291 | } |
292 | else result += subject[pos++]; |
293 | } |
294 | subject = result; |
295 | return rp; |
296 | } |
297 | |
298 | |
299 | bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options) |
300 | { |
301 | int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE); |
302 | int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK); |
303 | RegularExpression re(pattern, ctorOptions, false); |
304 | return re.match(subject, 0, mtchOptions); |
305 | } |
306 | |
307 | |
308 | } // namespace Poco |
309 | |