1 | // |
---|
2 | // RegularExpression.h |
---|
3 | // |
---|
4 | // $Id: //poco/1.3/Foundation/include/Poco/RegularExpression.h#2 $ |
---|
5 | // |
---|
6 | // Library: Foundation |
---|
7 | // Package: RegExp |
---|
8 | // Module: RegularExpression |
---|
9 | // |
---|
10 | // Definitions of class RegularExpression. |
---|
11 | // |
---|
12 | // A wrapper class for Philip Hazel's PCRE - Perl Compatible Regular Expressions |
---|
13 | // library (http://www.pcre.org). |
---|
14 | // |
---|
15 | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
---|
16 | // and Contributors. |
---|
17 | // |
---|
18 | // Permission is hereby granted, free of charge, to any person or organization |
---|
19 | // obtaining a copy of the software and accompanying documentation covered by |
---|
20 | // this license (the "Software") to use, reproduce, display, distribute, |
---|
21 | // execute, and transmit the Software, and to prepare derivative works of the |
---|
22 | // Software, and to permit third-parties to whom the Software is furnished to |
---|
23 | // do so, all subject to the following: |
---|
24 | // |
---|
25 | // The copyright notices in the Software and this entire statement, including |
---|
26 | // the above license grant, this restriction and the following disclaimer, |
---|
27 | // must be included in all copies of the Software, in whole or in part, and |
---|
28 | // all derivative works of the Software, unless such copies or derivative |
---|
29 | // works are solely in the form of machine-executable object code generated by |
---|
30 | // a source language processor. |
---|
31 | // |
---|
32 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
33 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
34 | // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
35 | // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
36 | // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
37 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
38 | // DEALINGS IN THE SOFTWARE. |
---|
39 | // |
---|
40 | |
---|
41 | |
---|
42 | #ifndef Foundation_RegularExpression_INCLUDED |
---|
43 | #define Foundation_RegularExpression_INCLUDED |
---|
44 | |
---|
45 | |
---|
46 | #include "Poco/Foundation.h" |
---|
47 | #include <vector> |
---|
48 | |
---|
49 | |
---|
50 | // |
---|
51 | // Copy these definitions from pcre.h |
---|
52 | // to avoid pulling in the entire header file |
---|
53 | // |
---|
extern "C"
{
    // Only declared here, never defined: this header merely needs to
    // name pointers to these PCRE types.
    struct pcre_extra;

    struct real_pcre;
    typedef struct real_pcre pcre;
}
---|
60 | |
---|
61 | |
---|
62 | namespace Poco { |
---|
63 | |
---|
64 | |
---|
65 | class Foundation_API RegularExpression |
---|
66 | /// A class for working with regular expressions. |
---|
67 | /// Implemented using PCRE, the Perl Compatible |
---|
68 | /// Regular Expressions library by Philip Hazel |
---|
69 | /// (see http://www.pcre.org). |
---|
70 | { |
---|
71 | public: |
---|
72 | enum Options // These must match the corresponsing options in pcre.h! |
---|
73 | /// Some of the following options can only be passed to the constructor; |
---|
74 | /// some can be passed only to matching functions, and some can be used |
---|
75 | /// everywhere. |
---|
76 | /// |
---|
77 | /// * Options marked [ctor] can be passed to the constructor. |
---|
78 | /// * Options marked [match] can be passed to match, extract, split and subst. |
---|
79 | /// * Options marked [subst] can be passed to subst. |
---|
80 | /// |
---|
81 | /// See the PCRE documentation for more information. |
---|
82 | { |
---|
83 | RE_CASELESS = 0x00000001, /// case insensitive matching (/i) [ctor] |
---|
84 | RE_MULTILINE = 0x00000002, /// enable multi-line mode; affects ^ and $ (/m) [ctor] |
---|
85 | RE_DOTALL = 0x00000004, /// dot matches all characters, including newline (/s) [ctor] |
---|
86 | RE_EXTENDED = 0x00000004, /// totally ignore whitespace (/x) [ctor] |
---|
87 | RE_ANCHORED = 0x00000010, /// treat pattern as if it starts with a ^ [ctor, match] |
---|
88 | RE_DOLLAR_ENDONLY = 0x00000020, /// dollar matches end-of-string only, not last newline in string [ctor] |
---|
89 | RE_EXTRA = 0x00000040, /// enable optional PCRE functionality [ctor] |
---|
90 | RE_NOTBOL = 0x00000080, /// circumflex does not match beginning of string [match] |
---|
91 | RE_NOTEOL = 0x00000100, /// $ does not match end of string [match] |
---|
92 | RE_UNGREEDY = 0x00000200, /// make quantifiers ungreedy [ctor] |
---|
93 | RE_NOTEMPTY = 0x00000400, /// empty string never matches [match] |
---|
94 | RE_UTF8 = 0x00000800, /// assume pattern and subject is UTF-8 encoded [ctor] |
---|
95 | RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match] |
---|
96 | RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match] |
---|
97 | RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match |
---|
98 | /// before or at the first newline in the subject string, |
---|
99 | /// though the matched text may continue over the newline [ctor] |
---|
100 | RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor] |
---|
101 | RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor] |
---|
102 | RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor] |
---|
103 | RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor] |
---|
104 | RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor] |
---|
105 | RE_NEWLINE_ANYCRLF = 0x00500000, /// assume newline is any of CR, LF, CRLF [ctor] |
---|
106 | RE_GLOBAL = 0x10000000, /// replace all occurences (/g) [subst] |
---|
107 | RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst] |
---|
108 | }; |
---|
109 | |
---|
110 | struct Match |
---|
111 | { |
---|
112 | std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match) |
---|
113 | std::string::size_type length; /// length of substring |
---|
114 | }; |
---|
115 | typedef std::vector<Match> MatchVec; |
---|
116 | |
---|
117 | RegularExpression(const std::string& pattern, int options = 0, bool study = true); |
---|
118 | /// Creates a regular expression and parses the given pattern. |
---|
119 | /// If study is true, the pattern is analyzed and optimized. This |
---|
120 | /// is mainly useful if the pattern is used more than once. |
---|
121 | /// For a description of the options, please see the PCRE documentation. |
---|
122 | /// Throws a RegularExpressionException if the patter cannot be compiled. |
---|
123 | |
---|
124 | ~RegularExpression(); |
---|
125 | /// Destroys the regular expression. |
---|
126 | |
---|
127 | int match(const std::string& subject, Match& mtch, int options = 0) const; |
---|
128 | /// Matches the given subject string against the pattern. Returns the position |
---|
129 | /// of the first captured substring in mtch. |
---|
130 | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and |
---|
131 | /// mtch.length is 0. |
---|
132 | /// Throws a RegularExpressionException in case of an error. |
---|
133 | /// Returns the number of matches. |
---|
134 | |
---|
135 | int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const; |
---|
136 | /// Matches the given subject string, starting at offset, against the pattern. |
---|
137 | /// Returns the position of the captured substring in mtch. |
---|
138 | /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and |
---|
139 | /// mtch.length is 0. |
---|
140 | /// Throws a RegularExpressionException in case of an error. |
---|
141 | /// Returns the number of matches. |
---|
142 | |
---|
143 | int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const; |
---|
144 | /// Matches the given subject string against the pattern. |
---|
145 | /// The first entry in matches contains the position of the captured substring. |
---|
146 | /// The following entries identify matching subpatterns. See the PCRE documentation |
---|
147 | /// for a more detailed explanation. |
---|
148 | /// If no part of the subject matches the pattern, matches is empty. |
---|
149 | /// Throws a RegularExpressionException in case of an error. |
---|
150 | /// Returns the number of matches. |
---|
151 | |
---|
152 | bool match(const std::string& subject, std::string::size_type offset = 0) const; |
---|
153 | /// Returns true if and only if the subject matches the regular expression. |
---|
154 | /// |
---|
155 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
---|
156 | /// matching, which means that the empty string will never match and |
---|
157 | /// the pattern is treated as if it starts with a ^. |
---|
158 | |
---|
159 | bool match(const std::string& subject, std::string::size_type offset, int options) const; |
---|
160 | /// Returns true if and only if the subject matches the regular expression. |
---|
161 | |
---|
162 | bool operator == (const std::string& subject) const; |
---|
163 | /// Returns true if and only if the subject matches the regular expression. |
---|
164 | /// |
---|
165 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
---|
166 | /// matching, which means that the empty string will never match and |
---|
167 | /// the pattern is treated as if it starts with a ^. |
---|
168 | |
---|
169 | bool operator != (const std::string& subject) const; |
---|
170 | /// Returns true if and only if the subject does not match the regular expression. |
---|
171 | /// |
---|
172 | /// Internally, this method sets the RE_ANCHORED and RE_NOTEMPTY options for |
---|
173 | /// matching, which means that the empty string will never match and |
---|
174 | /// the pattern is treated as if it starts with a ^. |
---|
175 | |
---|
176 | int extract(const std::string& subject, std::string& str, int options = 0) const; |
---|
177 | /// Matches the given subject string against the pattern. |
---|
178 | /// Returns the captured string. |
---|
179 | /// Throws a RegularExpressionException in case of an error. |
---|
180 | /// Returns the number of matches. |
---|
181 | |
---|
182 | int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const; |
---|
183 | /// Matches the given subject string, starting at offset, against the pattern. |
---|
184 | /// Returns the captured string. |
---|
185 | /// Throws a RegularExpressionException in case of an error. |
---|
186 | /// Returns the number of matches. |
---|
187 | |
---|
188 | int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const; |
---|
189 | /// Matches the given subject string against the pattern. |
---|
190 | /// The first entry in captured is the captured substring. |
---|
191 | /// The following entries contain substrings matching subpatterns. See the PCRE documentation |
---|
192 | /// for a more detailed explanation. |
---|
193 | /// If no part of the subject matches the pattern, captured is empty. |
---|
194 | /// Throws a RegularExpressionException in case of an error. |
---|
195 | /// Returns the number of matches. |
---|
196 | |
---|
197 | int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const; |
---|
198 | /// Matches the given subject string against the pattern. |
---|
199 | /// The first entry in captured is the captured substring. |
---|
200 | /// The following entries contain substrings matching subpatterns. See the PCRE documentation |
---|
201 | /// for a more detailed explanation. |
---|
202 | /// If no part of the subject matches the pattern, captured is empty. |
---|
203 | /// Throws a RegularExpressionException in case of an error. |
---|
204 | /// Returns the number of matches. |
---|
205 | |
---|
206 | int subst(std::string& subject, const std::string& replacement, int options = 0) const; |
---|
207 | /// Substitute in subject all matches of the pattern with replacement. |
---|
208 | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, |
---|
209 | /// only the first match is replaced. |
---|
210 | /// Occurences of $<n> (for example, $1, $2, ...) in replacement are replaced |
---|
211 | /// with the corresponding captured string. $0 is the original subject string. |
---|
212 | /// Returns the number of replaced occurences. |
---|
213 | |
---|
214 | int subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options = 0) const; |
---|
215 | /// Substitute in subject all matches of the pattern with replacement, |
---|
216 | /// starting at offset. |
---|
217 | /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, |
---|
218 | /// only the first match is replaced. |
---|
219 | /// Unless RE_NO_VARS is specified, occurences of $<n> (for example, $0, $1, $2, ... $9) |
---|
220 | /// in replacement are replaced with the corresponding captured string. |
---|
221 | /// $0 is the captured substring. $1 ... $n are the substrings maching the subpatterns. |
---|
222 | /// Returns the number of replaced occurences. |
---|
223 | |
---|
224 | static bool match(const std::string& subject, const std::string& pattern, int options = 0); |
---|
225 | /// Matches the given subject string against the regular expression given in pattern, |
---|
226 | /// using the given options. |
---|
227 | |
---|
228 | protected: |
---|
229 | std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const; |
---|
230 | |
---|
231 | private: |
---|
232 | pcre* _pcre; |
---|
233 | pcre_extra* _extra; |
---|
234 | |
---|
235 | static const int OVEC_SIZE; |
---|
236 | |
---|
237 | RegularExpression(); |
---|
238 | RegularExpression(const RegularExpression&); |
---|
239 | RegularExpression& operator = (const RegularExpression&); |
---|
240 | }; |
---|
241 | |
---|
242 | |
---|
243 | // |
---|
244 | // inlines |
---|
245 | // |
---|
246 | inline int RegularExpression::match(const std::string& subject, Match& mtch, int options) const |
---|
247 | { |
---|
248 | return match(subject, 0, mtch, options); |
---|
249 | } |
---|
250 | |
---|
251 | |
---|
252 | inline int RegularExpression::split(const std::string& subject, std::vector<std::string>& strings, int options) const |
---|
253 | { |
---|
254 | return split(subject, 0, strings, options); |
---|
255 | } |
---|
256 | |
---|
257 | |
---|
258 | inline int RegularExpression::subst(std::string& subject, const std::string& replacement, int options) const |
---|
259 | { |
---|
260 | return subst(subject, 0, replacement, options); |
---|
261 | } |
---|
262 | |
---|
263 | |
---|
264 | inline bool RegularExpression::operator == (const std::string& subject) const |
---|
265 | { |
---|
266 | return match(subject); |
---|
267 | } |
---|
268 | |
---|
269 | |
---|
270 | inline bool RegularExpression::operator != (const std::string& subject) const |
---|
271 | { |
---|
272 | return !match(subject); |
---|
273 | } |
---|
274 | |
---|
275 | |
---|
276 | } // namespace Poco |
---|
277 | |
---|
278 | |
---|
279 | #endif // Foundation_RegularExpression_INCLUDED |
---|