1//
2// RegularExpression.h
3//
4// Library: Foundation
5// Package: RegExp
6// Module: RegularExpression
7//
8// Definitions of class RegularExpression.
9//
10// A wrapper class for Philip Hazel's PCRE - Perl Compatible Regular Expressions
11// library (http://www.pcre.org).
12//
13// Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
14// and Contributors.
15//
16// SPDX-License-Identifier: BSL-1.0
17//
18
19
20#ifndef Foundation_RegularExpression_INCLUDED
21#define Foundation_RegularExpression_INCLUDED
22
23
24#include "Poco/Foundation.h"
25#include <vector>
26#include <map>
27
28
29namespace Poco {
30
31
32class Foundation_API RegularExpression
33 /// A class for working with regular expressions.
34 /// Implemented using PCRE, the Perl Compatible
35 /// Regular Expressions library by Philip Hazel
36 /// (see http://www.pcre.org).
37{
38public:
39 enum Options // These must match the corresponding options in pcre.h!
40 /// Some of the following options can only be passed to the constructor;
41 /// some can be passed only to matching functions, and some can be used
42 /// everywhere.
43 ///
44 /// * Options marked [ctor] can be passed to the constructor.
45 /// * Options marked [match] can be passed to match, extract, split and subst.
46 /// * Options marked [subst] can be passed to subst.
47 ///
48 /// See the PCRE documentation for more information.
49 {
50 RE_CASELESS = 0x00000001, /// case insensitive matching (/i) [ctor]
51 RE_MULTILINE = 0x00000002, /// enable multi-line mode; affects ^ and $ (/m) [ctor]
52 RE_DOTALL = 0x00000004, /// dot matches all characters, including newline (/s) [ctor]
53 RE_EXTENDED = 0x00000008, /// totally ignore whitespace (/x) [ctor]
54 RE_ANCHORED = 0x00000010, /// treat pattern as if it starts with a ^ [ctor, match]
55 RE_DOLLAR_ENDONLY = 0x00000020, /// dollar matches end-of-string only, not last newline in string [ctor]
56 RE_EXTRA = 0x00000040, /// enable optional PCRE functionality [ctor]
57 RE_NOTBOL = 0x00000080, /// circumflex does not match beginning of string [match]
58 RE_NOTEOL = 0x00000100, /// $ does not match end of string [match]
59 RE_UNGREEDY = 0x00000200, /// make quantifiers ungreedy [ctor]
60 RE_NOTEMPTY = 0x00000400, /// empty string never matches [match]
61 RE_UTF8 = 0x00000800, /// assume pattern and subject is UTF-8 encoded [ctor]
62 RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match]
63 RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match]
64 RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match
65 /// before or at the first newline in the subject string,
66 /// though the matched text may continue over the newline [ctor]
67 RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor]
68 RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor]
69 RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor]
70 RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor]
71 RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor]
72 RE_NEWLINE_ANYCRLF = 0x00500000, /// assume newline is any of CR, LF, CRLF [ctor]
73 RE_GLOBAL = 0x10000000, /// replace all occurrences (/g) [subst]
74 RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst]
75 };
76
77 struct Match
78 {
79 std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match)
80 std::string::size_type length; /// length of substring
81 std::string name; /// name of group
82 };
83 typedef std::vector<Match> MatchVec;
84 typedef std::map<int, std::string> GroupMap;
85
86 RegularExpression(const std::string& pattern, int options = 0, bool study = true);
87 /// Creates a regular expression and parses the given pattern.
88 /// If study is true, the pattern is analyzed and optimized. This
89 /// is mainly useful if the pattern is used more than once.
90 /// For a description of the options, please see the PCRE documentation.
91 /// Throws a RegularExpressionException if the patter cannot be compiled.
92
93 ~RegularExpression();
94 /// Destroys the regular expression.
95
96 int match(const std::string& subject, Match& mtch, int options = 0) const;
97 /// Matches the given subject string against the pattern. Returns the position
98 /// of the first captured substring in mtch.
99 /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and
100 /// mtch.length is 0.
101 /// Throws a RegularExpressionException in case of an error.
102 /// Returns the number of matches.
103
104 int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const;
105 /// Matches the given subject string, starting at offset, against the pattern.
106 /// Returns the position of the captured substring in mtch.
107 /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and
108 /// mtch.length is 0.
109 /// Throws a RegularExpressionException in case of an error.
110 /// Returns the number of matches.
111
112 int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const;
113 /// Matches the given subject string against the pattern.
114 /// The first entry in matches contains the position of the captured substring.
115 /// The following entries identify matching subpatterns. See the PCRE documentation
116 /// for a more detailed explanation.
117 /// If no part of the subject matches the pattern, matches is empty.
118 /// Throws a RegularExpressionException in case of an error.
119 /// Returns the number of matches.
120
121 bool match(const std::string& subject, std::string::size_type offset = 0) const;
122 /// Returns true if and only if the subject matches the regular expression.
123 ///
124 /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
125 /// matching, which means that the empty string will never match and
126 /// the pattern is treated as if it starts with a ^.
127
128 bool match(const std::string& subject, std::string::size_type offset, int options) const;
129 /// Returns true if and only if the subject matches the regular expression.
130
131 bool operator == (const std::string& subject) const;
132 /// Returns true if and only if the subject matches the regular expression.
133 ///
134 /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
135 /// matching, which means that the empty string will never match and
136 /// the pattern is treated as if it starts with a ^.
137
138 bool operator != (const std::string& subject) const;
139 /// Returns true if and only if the subject does not match the regular expression.
140 ///
141 /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for
142 /// matching, which means that the empty string will never match and
143 /// the pattern is treated as if it starts with a ^.
144
145 int extract(const std::string& subject, std::string& str, int options = 0) const;
146 /// Matches the given subject string against the pattern.
147 /// Returns the captured string.
148 /// Throws a RegularExpressionException in case of an error.
149 /// Returns the number of matches.
150
151 int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const;
152 /// Matches the given subject string, starting at offset, against the pattern.
153 /// Returns the captured string.
154 /// Throws a RegularExpressionException in case of an error.
155 /// Returns the number of matches.
156
157 int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const;
158 /// Matches the given subject string against the pattern.
159 /// The first entry in captured is the captured substring.
160 /// The following entries contain substrings matching subpatterns. See the PCRE documentation
161 /// for a more detailed explanation.
162 /// If no part of the subject matches the pattern, captured is empty.
163 /// Throws a RegularExpressionException in case of an error.
164 /// Returns the number of matches.
165
166 int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const;
167 /// Matches the given subject string against the pattern.
168 /// The first entry in captured is the captured substring.
169 /// The following entries contain substrings matching subpatterns. See the PCRE documentation
170 /// for a more detailed explanation.
171 /// If no part of the subject matches the pattern, captured is empty.
172 /// Throws a RegularExpressionException in case of an error.
173 /// Returns the number of matches.
174
175 int subst(std::string& subject, const std::string& replacement, int options = 0) const;
176 /// Substitute in subject all matches of the pattern with replacement.
177 /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
178 /// only the first match is replaced.
179 /// Occurrences of $<n> (for example, $1, $2, ...) in replacement are replaced
180 /// with the corresponding captured string. $0 is the original subject string.
181 /// Returns the number of replaced occurrences.
182
183 int subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options = 0) const;
184 /// Substitute in subject all matches of the pattern with replacement,
185 /// starting at offset.
186 /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
187 /// only the first match is replaced.
188 /// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $0, $1, $2, ... $9)
189 /// in replacement are replaced with the corresponding captured string.
190 /// $0 is the captured substring. $1 ... $n are the substrings matching the subpatterns.
191 /// Returns the number of replaced occurrences.
192
193 static bool match(const std::string& subject, const std::string& pattern, int options = 0);
194 /// Matches the given subject string against the regular expression given in pattern,
195 /// using the given options.
196
197protected:
198 std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const;
199
200private:
201 // Note: to avoid a dependency on the pcre.h header the following are
202 // declared as void* and casted to the correct type in the implementation file.
203 void* _pcre; // Actual type is pcre*
204 void* _extra; // Actual type is struct pcre_extra*
205
206 GroupMap _groups;
207
208 static const int OVEC_SIZE;
209
210 RegularExpression();
211 RegularExpression(const RegularExpression&);
212 RegularExpression& operator = (const RegularExpression&);
213};
214
215
216//
217// inlines
218//
219inline int RegularExpression::match(const std::string& subject, Match& mtch, int options) const
220{
221 return match(subject, 0, mtch, options);
222}
223
224
225inline int RegularExpression::split(const std::string& subject, std::vector<std::string>& strings, int options) const
226{
227 return split(subject, 0, strings, options);
228}
229
230
231inline int RegularExpression::subst(std::string& subject, const std::string& replacement, int options) const
232{
233 return subst(subject, 0, replacement, options);
234}
235
236
237inline bool RegularExpression::operator == (const std::string& subject) const
238{
239 return match(subject);
240}
241
242
243inline bool RegularExpression::operator != (const std::string& subject) const
244{
245 return !match(subject);
246}
247
248
249} // namespace Poco
250
251
252#endif // Foundation_RegularExpression_INCLUDED
253