1//
// RegularExpression.cpp
3//
4// Library: Foundation
5// Package: RegExp
6// Module: RegularExpression
7//
8// Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
9// and Contributors.
10//
11// SPDX-License-Identifier: BSL-1.0
12//
13
14
15#include "Poco/RegularExpression.h"
16#include "Poco/Exception.h"
17#include <sstream>
18#if defined(POCO_UNBUNDLED_PCRE)
19#include <pcre.h>
20#else
21#include "pcre_config.h"
22#include "pcre.h"
23#endif
24
25
26namespace Poco {
27
28
29const int RegularExpression::OVEC_SIZE = 128;
30
31
32RegularExpression::RegularExpression(const std::string& pattern, int options, bool study): _pcre(0), _extra(0)
33{
34 const char* error;
35 int offs;
36 unsigned nmcount;
37 unsigned nmentrysz;
38 unsigned char* nmtbl;
39
40 _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0);
41 if (!_pcre)
42 {
43 std::ostringstream msg;
44 msg << error << " (at offset " << offs << ")";
45 throw RegularExpressionException(msg.str());
46 }
47 if (study)
48 _extra = pcre_study(reinterpret_cast<pcre*>(_pcre), 0, &error);
49
50 pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMECOUNT, &nmcount);
51 pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMEENTRYSIZE, &nmentrysz);
52 pcre_fullinfo(reinterpret_cast<const pcre*>(_pcre), reinterpret_cast<const pcre_extra*>(_extra), PCRE_INFO_NAMETABLE, &nmtbl);
53
54 for (int i = 0; i < nmcount; i++)
55 {
56 unsigned char* group = nmtbl + 2 + (nmentrysz * i);
57 int n = pcre_get_stringnumber(reinterpret_cast<const pcre*>(_pcre), (char*) group);
58 _groups[n] = std::string((char*) group);
59 }
60}
61
62
63RegularExpression::~RegularExpression()
64{
65 if (_pcre) pcre_free(reinterpret_cast<pcre*>(_pcre));
66 if (_extra) pcre_free(reinterpret_cast<struct pcre_extra*>(_extra));
67}
68
69
70int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const
71{
72 poco_assert (offset <= subject.length());
73
74 int ovec[OVEC_SIZE];
75 int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
76 if (rc == PCRE_ERROR_NOMATCH)
77 {
78 mtch.offset = std::string::npos;
79 mtch.length = 0;
80 return 0;
81 }
82 else if (rc == PCRE_ERROR_BADOPTION)
83 {
84 throw RegularExpressionException("bad option");
85 }
86 else if (rc == 0)
87 {
88 throw RegularExpressionException("too many captured substrings");
89 }
90 else if (rc < 0)
91 {
92 std::ostringstream msg;
93 msg << "PCRE error " << rc;
94 throw RegularExpressionException(msg.str());
95 }
96 mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0];
97 mtch.length = ovec[1] - mtch.offset;
98 return rc;
99}
100
101
102int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const
103{
104 poco_assert (offset <= subject.length());
105
106 matches.clear();
107
108 int ovec[OVEC_SIZE];
109 int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
110 if (rc == PCRE_ERROR_NOMATCH)
111 {
112 return 0;
113 }
114 else if (rc == PCRE_ERROR_BADOPTION)
115 {
116 throw RegularExpressionException("bad option");
117 }
118 else if (rc == 0)
119 {
120 throw RegularExpressionException("too many captured substrings");
121 }
122 else if (rc < 0)
123 {
124 std::ostringstream msg;
125 msg << "PCRE error " << rc;
126 throw RegularExpressionException(msg.str());
127 }
128 matches.reserve(rc);
129 for (int i = 0; i < rc; ++i)
130 {
131 Match m;
132 GroupMap::const_iterator it;
133
134 m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ;
135 m.length = ovec[i*2 + 1] - m.offset;
136
137 it = _groups.find(i);
138 if (it != _groups.end())
139 {
140 m.name = (*it).second;
141 }
142
143 matches.push_back(m);
144 }
145 return rc;
146}
147
148
149bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const
150{
151 Match mtch;
152 match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY);
153 return mtch.offset == offset && mtch.length == subject.length() - offset;
154}
155
156
157bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const
158{
159 Match mtch;
160 match(subject, offset, mtch, options);
161 return mtch.offset == offset && mtch.length == subject.length() - offset;
162}
163
164
165int RegularExpression::extract(const std::string& subject, std::string& str, int options) const
166{
167 Match mtch;
168 int rc = match(subject, 0, mtch, options);
169 if (mtch.offset != std::string::npos)
170 str.assign(subject, mtch.offset, mtch.length);
171 else
172 str.clear();
173 return rc;
174}
175
176
177int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const
178{
179 Match mtch;
180 int rc = match(subject, offset, mtch, options);
181 if (mtch.offset != std::string::npos)
182 str.assign(subject, mtch.offset, mtch.length);
183 else
184 str.clear();
185 return rc;
186}
187
188
189int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const
190{
191 MatchVec matches;
192 strings.clear();
193 int rc = match(subject, offset, matches, options);
194 strings.reserve(matches.size());
195 for (MatchVec::const_iterator it = matches.begin(); it != matches.end(); ++it)
196 {
197 if (it->offset != std::string::npos)
198 strings.push_back(subject.substr(it->offset, it->length));
199 else
200 strings.push_back(std::string());
201 }
202 return rc;
203}
204
205
206int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
207{
208 if (options & RE_GLOBAL)
209 {
210 int rc = 0;
211 std::string::size_type pos = substOne(subject, offset, replacement, options);
212 while (pos != std::string::npos)
213 {
214 ++rc;
215 pos = substOne(subject, pos, replacement, options);
216 }
217 return rc;
218 }
219 else
220 {
221 return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0;
222 }
223}
224
225
226std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const
227{
228 if (offset >= subject.length()) return std::string::npos;
229
230 int ovec[OVEC_SIZE];
231 int rc = pcre_exec(reinterpret_cast<pcre*>(_pcre), reinterpret_cast<struct pcre_extra*>(_extra), subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE);
232 if (rc == PCRE_ERROR_NOMATCH)
233 {
234 return std::string::npos;
235 }
236 else if (rc == PCRE_ERROR_BADOPTION)
237 {
238 throw RegularExpressionException("bad option");
239 }
240 else if (rc == 0)
241 {
242 throw RegularExpressionException("too many captured substrings");
243 }
244 else if (rc < 0)
245 {
246 std::ostringstream msg;
247 msg << "PCRE error " << rc;
248 throw RegularExpressionException(msg.str());
249 }
250 std::string result;
251 std::string::size_type len = subject.length();
252 std::string::size_type pos = 0;
253 std::string::size_type rp = std::string::npos;
254 while (pos < len)
255 {
256 if (ovec[0] == pos)
257 {
258 std::string::const_iterator it = replacement.begin();
259 std::string::const_iterator end = replacement.end();
260 while (it != end)
261 {
262 if (*it == '$' && !(options & RE_NO_VARS))
263 {
264 ++it;
265 if (it != end)
266 {
267 char d = *it;
268 if (d >= '0' && d <= '9')
269 {
270 int c = d - '0';
271 if (c < rc)
272 {
273 int o = ovec[c*2];
274 int l = ovec[c*2 + 1] - o;
275 result.append(subject, o, l);
276 }
277 }
278 else
279 {
280 result += '$';
281 result += d;
282 }
283 ++it;
284 }
285 else result += '$';
286 }
287 else result += *it++;
288 }
289 pos = ovec[1];
290 rp = result.length();
291 }
292 else result += subject[pos++];
293 }
294 subject = result;
295 return rp;
296}
297
298
299bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options)
300{
301 int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE);
302 int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK);
303 RegularExpression re(pattern, ctorOptions, false);
304 return re.match(subject, 0, mtchOptions);
305}
306
307
308} // namespace Poco
309