//
// RegularExpression.h
//
// Library: Foundation
// Package: RegExp
// Module:  RegularExpression
//
// Definitions of class RegularExpression.
//
// A wrapper class for Philip Hazel's PCRE - Perl Compatible Regular Expressions
// library (http://www.pcre.org).
//
// Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier:	BSL-1.0
//
|---|
| 18 |  | 
|---|
| 19 |  | 
|---|
| 20 | #ifndef Foundation_RegularExpression_INCLUDED | 
|---|
| 21 | #define Foundation_RegularExpression_INCLUDED | 
|---|
| 22 |  | 
|---|
| 23 |  | 
|---|
| 24 | #include "Poco/Foundation.h" | 
|---|
| 25 | #include <vector> | 
|---|
| 26 | #include <map> | 
|---|
| 27 |  | 
|---|
| 28 |  | 
|---|
| 29 | namespace Poco { | 
|---|
| 30 |  | 
|---|
| 31 |  | 
|---|
| 32 | class Foundation_API RegularExpression | 
|---|
| 33 | /// A class for working with regular expressions. | 
|---|
| 34 | /// Implemented using PCRE, the Perl Compatible | 
|---|
| 35 | /// Regular Expressions library by Philip Hazel | 
|---|
| 36 | /// (see http://www.pcre.org). | 
|---|
| 37 | { | 
|---|
| 38 | public: | 
|---|
| 39 | enum Options // These must match the corresponding options in pcre.h! | 
|---|
| 40 | /// Some of the following options can only be passed to the constructor; | 
|---|
| 41 | /// some can be passed only to matching functions, and some can be used | 
|---|
| 42 | /// everywhere. | 
|---|
| 43 | /// | 
|---|
| 44 | ///   * Options marked [ctor] can be passed to the constructor. | 
|---|
| 45 | ///   * Options marked [match] can be passed to match, extract, split and subst. | 
|---|
| 46 | ///   * Options marked [subst] can be passed to subst. | 
|---|
| 47 | /// | 
|---|
| 48 | /// See the PCRE documentation for more information. | 
|---|
| 49 | { | 
|---|
| 50 | RE_CASELESS        = 0x00000001, /// case insensitive matching (/i) [ctor] | 
|---|
| 51 | RE_MULTILINE       = 0x00000002, /// enable multi-line mode; affects ^ and $ (/m) [ctor] | 
|---|
| 52 | RE_DOTALL          = 0x00000004, /// dot matches all characters, including newline (/s) [ctor] | 
|---|
| 53 | RE_EXTENDED        = 0x00000008, /// totally ignore whitespace (/x) [ctor] | 
|---|
| 54 | RE_ANCHORED        = 0x00000010, /// treat pattern as if it starts with a ^ [ctor, match] | 
|---|
| 55 | RE_DOLLAR_ENDONLY  = 0x00000020, /// dollar matches end-of-string only, not last newline in string [ctor] | 
|---|
| 56 | = 0x00000040, /// enable optional PCRE functionality [ctor] | 
|---|
| 57 | RE_NOTBOL          = 0x00000080, /// circumflex does not match beginning of string [match] | 
|---|
| 58 | RE_NOTEOL          = 0x00000100, /// $ does not match end of string [match] | 
|---|
| 59 | RE_UNGREEDY        = 0x00000200, /// make quantifiers ungreedy [ctor] | 
|---|
| 60 | RE_NOTEMPTY        = 0x00000400, /// empty string never matches [match] | 
|---|
| 61 | RE_UTF8            = 0x00000800, /// assume pattern and subject is UTF-8 encoded [ctor] | 
|---|
| 62 | RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match] | 
|---|
| 63 | RE_NO_UTF8_CHECK   = 0x00002000, /// do not check validity of UTF-8 code sequences [match] | 
|---|
| 64 | RE_FIRSTLINE       = 0x00040000, /// an  unanchored  pattern  is  required  to  match | 
|---|
| 65 | /// before  or  at  the  first  newline  in  the subject string, | 
|---|
| 66 | /// though the matched text may continue over the newline [ctor] | 
|---|
| 67 | RE_DUPNAMES        = 0x00080000, /// names used to identify capturing  subpatterns  need not be unique [ctor] | 
|---|
| 68 | RE_NEWLINE_CR      = 0x00100000, /// assume newline is CR ('\r'), the default [ctor] | 
|---|
| 69 | RE_NEWLINE_LF      = 0x00200000, /// assume newline is LF ('\n') [ctor] | 
|---|
| 70 | RE_NEWLINE_CRLF    = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor] | 
|---|
| 71 | RE_NEWLINE_ANY     = 0x00400000, /// assume newline is any valid Unicode newline character [ctor] | 
|---|
| 72 | RE_NEWLINE_ANYCRLF = 0x00500000, /// assume newline is any of CR, LF, CRLF [ctor] | 
|---|
| 73 | RE_GLOBAL          = 0x10000000, /// replace all occurrences (/g) [subst] | 
|---|
| 74 | RE_NO_VARS         = 0x20000000  /// treat dollar in replacement string as ordinary character [subst] | 
|---|
| 75 | }; | 
|---|
| 76 |  | 
|---|
| 77 | struct Match | 
|---|
| 78 | { | 
|---|
| 79 | std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match) | 
|---|
| 80 | std::string::size_type length; /// length of substring | 
|---|
| 81 | std::string name;              /// name of group | 
|---|
| 82 | }; | 
|---|
| 83 | typedef std::vector<Match> MatchVec; | 
|---|
| 84 | typedef std::map<int, std::string> GroupMap; | 
|---|
| 85 |  | 
|---|
| 86 | RegularExpression(const std::string& pattern, int options = 0, bool study = true); | 
|---|
| 87 | /// Creates a regular expression and parses the given pattern. | 
|---|
| 88 | /// If study is true, the pattern is analyzed and optimized. This | 
|---|
| 89 | /// is mainly useful if the pattern is used more than once. | 
|---|
| 90 | /// For a description of the options, please see the PCRE documentation. | 
|---|
| 91 | /// Throws a RegularExpressionException if the patter cannot be compiled. | 
|---|
| 92 |  | 
|---|
| 93 | ~RegularExpression(); | 
|---|
| 94 | /// Destroys the regular expression. | 
|---|
| 95 |  | 
|---|
| 96 | int match(const std::string& subject, Match& mtch, int options = 0) const; | 
|---|
| 97 | /// Matches the given subject string against the pattern. Returns the position | 
|---|
| 98 | /// of the first captured substring in mtch. | 
|---|
| 99 | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and | 
|---|
| 100 | /// mtch.length is 0. | 
|---|
| 101 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 102 | /// Returns the number of matches. | 
|---|
| 103 |  | 
|---|
| 104 | int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const; | 
|---|
| 105 | /// Matches the given subject string, starting at offset, against the pattern. | 
|---|
| 106 | /// Returns the position of the captured substring in mtch. | 
|---|
| 107 | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and | 
|---|
| 108 | /// mtch.length is 0. | 
|---|
| 109 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 110 | /// Returns the number of matches. | 
|---|
| 111 |  | 
|---|
| 112 | int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const; | 
|---|
| 113 | /// Matches the given subject string against the pattern. | 
|---|
| 114 | /// The first entry in matches contains the position of the captured substring. | 
|---|
| 115 | /// The following entries identify matching subpatterns. See the PCRE documentation | 
|---|
| 116 | /// for a more detailed explanation. | 
|---|
| 117 | /// If no part of the subject matches the pattern, matches is empty. | 
|---|
| 118 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 119 | /// Returns the number of matches. | 
|---|
| 120 |  | 
|---|
| 121 | bool match(const std::string& subject, std::string::size_type offset = 0) const; | 
|---|
| 122 | /// Returns true if and only if the subject matches the regular expression. | 
|---|
| 123 | /// | 
|---|
| 124 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for | 
|---|
| 125 | /// matching, which means that the empty string will never match and | 
|---|
| 126 | /// the pattern is treated as if it starts with a ^. | 
|---|
| 127 |  | 
|---|
| 128 | bool match(const std::string& subject, std::string::size_type offset, int options) const; | 
|---|
| 129 | /// Returns true if and only if the subject matches the regular expression. | 
|---|
| 130 |  | 
|---|
| 131 | bool operator == (const std::string& subject) const; | 
|---|
| 132 | /// Returns true if and only if the subject matches the regular expression. | 
|---|
| 133 | /// | 
|---|
| 134 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for | 
|---|
| 135 | /// matching, which means that the empty string will never match and | 
|---|
| 136 | /// the pattern is treated as if it starts with a ^. | 
|---|
| 137 |  | 
|---|
| 138 | bool operator != (const std::string& subject) const; | 
|---|
| 139 | /// Returns true if and only if the subject does not match the regular expression. | 
|---|
| 140 | /// | 
|---|
| 141 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for | 
|---|
| 142 | /// matching, which means that the empty string will never match and | 
|---|
| 143 | /// the pattern is treated as if it starts with a ^. | 
|---|
| 144 |  | 
|---|
| 145 | int (const std::string& subject, std::string& str, int options = 0) const; | 
|---|
| 146 | /// Matches the given subject string against the pattern. | 
|---|
| 147 | /// Returns the captured string. | 
|---|
| 148 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 149 | /// Returns the number of matches. | 
|---|
| 150 |  | 
|---|
| 151 | int (const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const; | 
|---|
| 152 | /// Matches the given subject string, starting at offset, against the pattern. | 
|---|
| 153 | /// Returns the captured string. | 
|---|
| 154 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 155 | /// Returns the number of matches. | 
|---|
| 156 |  | 
|---|
| 157 | int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const; | 
|---|
| 158 | /// Matches the given subject string against the pattern. | 
|---|
| 159 | /// The first entry in captured is the captured substring. | 
|---|
| 160 | /// The following entries contain substrings matching subpatterns. See the PCRE documentation | 
|---|
| 161 | /// for a more detailed explanation. | 
|---|
| 162 | /// If no part of the subject matches the pattern, captured is empty. | 
|---|
| 163 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 164 | /// Returns the number of matches. | 
|---|
| 165 |  | 
|---|
| 166 | int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const; | 
|---|
| 167 | /// Matches the given subject string against the pattern. | 
|---|
| 168 | /// The first entry in captured is the captured substring. | 
|---|
| 169 | /// The following entries contain substrings matching subpatterns. See the PCRE documentation | 
|---|
| 170 | /// for a more detailed explanation. | 
|---|
| 171 | /// If no part of the subject matches the pattern, captured is empty. | 
|---|
| 172 | /// Throws a RegularExpressionException in case of an error. | 
|---|
| 173 | /// Returns the number of matches. | 
|---|
| 174 |  | 
|---|
| 175 | int subst(std::string& subject, const std::string& replacement, int options = 0) const; | 
|---|
| 176 | /// Substitute in subject all matches of the pattern with replacement. | 
|---|
| 177 | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, | 
|---|
| 178 | /// only the first match is replaced. | 
|---|
| 179 | /// Occurrences of $<n> (for example, $1, $2, ...) in replacement are replaced | 
|---|
| 180 | /// with the corresponding captured string. $0 is the original subject string. | 
|---|
| 181 | /// Returns the number of replaced occurrences. | 
|---|
| 182 |  | 
|---|
| 183 | int subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options = 0) const; | 
|---|
| 184 | /// Substitute in subject all matches of the pattern with replacement, | 
|---|
| 185 | /// starting at offset. | 
|---|
| 186 | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, | 
|---|
| 187 | /// only the first match is replaced. | 
|---|
| 188 | /// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $0, $1, $2, ... $9) | 
|---|
| 189 | /// in replacement are replaced with the corresponding captured string. | 
|---|
| 190 | /// $0 is the captured substring. $1 ... $n are the substrings matching the subpatterns. | 
|---|
| 191 | /// Returns the number of replaced occurrences. | 
|---|
| 192 |  | 
|---|
| 193 | static bool match(const std::string& subject, const std::string& pattern, int options = 0); | 
|---|
| 194 | /// Matches the given subject string against the regular expression given in pattern, | 
|---|
| 195 | /// using the given options. | 
|---|
| 196 |  | 
|---|
| 197 | protected: | 
|---|
| 198 | std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const; | 
|---|
| 199 |  | 
|---|
| 200 | private: | 
|---|
| 201 | // Note: to avoid a dependency on the pcre.h header the following are | 
|---|
| 202 | // declared as void* and casted to the correct type in the implementation file. | 
|---|
| 203 | void* _pcre;  // Actual type is pcre* | 
|---|
| 204 | void* ; // Actual type is struct pcre_extra* | 
|---|
| 205 |  | 
|---|
| 206 | GroupMap _groups; | 
|---|
| 207 |  | 
|---|
| 208 | static const int OVEC_SIZE; | 
|---|
| 209 |  | 
|---|
| 210 | RegularExpression(); | 
|---|
| 211 | RegularExpression(const RegularExpression&); | 
|---|
| 212 | RegularExpression& operator = (const RegularExpression&); | 
|---|
| 213 | }; | 
|---|
| 214 |  | 
|---|
| 215 |  | 
|---|
| 216 | // | 
|---|
| 217 | // inlines | 
|---|
| 218 | // | 
|---|
| 219 | inline int RegularExpression::match(const std::string& subject, Match& mtch, int options) const | 
|---|
| 220 | { | 
|---|
| 221 | return match(subject, 0, mtch, options); | 
|---|
| 222 | } | 
|---|
| 223 |  | 
|---|
| 224 |  | 
|---|
| 225 | inline int RegularExpression::split(const std::string& subject, std::vector<std::string>& strings, int options) const | 
|---|
| 226 | { | 
|---|
| 227 | return split(subject, 0, strings, options); | 
|---|
| 228 | } | 
|---|
| 229 |  | 
|---|
| 230 |  | 
|---|
| 231 | inline int RegularExpression::subst(std::string& subject, const std::string& replacement, int options) const | 
|---|
| 232 | { | 
|---|
| 233 | return subst(subject, 0, replacement, options); | 
|---|
| 234 | } | 
|---|
| 235 |  | 
|---|
| 236 |  | 
|---|
| 237 | inline bool RegularExpression::operator == (const std::string& subject) const | 
|---|
| 238 | { | 
|---|
| 239 | return match(subject); | 
|---|
| 240 | } | 
|---|
| 241 |  | 
|---|
| 242 |  | 
|---|
| 243 | inline bool RegularExpression::operator != (const std::string& subject) const | 
|---|
| 244 | { | 
|---|
| 245 | return !match(subject); | 
|---|
| 246 | } | 
|---|
| 247 |  | 
|---|
| 248 |  | 
|---|
| 249 | } // namespace Poco | 
|---|
| 250 |  | 
|---|
| 251 |  | 
|---|
| 252 | #endif // Foundation_RegularExpression_INCLUDED | 
|---|
| 253 |  | 
|---|