1 | // |
---|
2 | // RegularExpression.cpp |
---|
3 | // |
---|
4 | // $Id: //poco/1.3/Foundation/src/RegularExpression.cpp#3 $ |
---|
5 | // |
---|
6 | // Library: Foundation |
---|
7 | // Package: RegExp |
---|
8 | // Module: RegularExpression |
---|
9 | // |
---|
10 | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
---|
11 | // and Contributors. |
---|
12 | // |
---|
13 | // Permission is hereby granted, free of charge, to any person or organization |
---|
14 | // obtaining a copy of the software and accompanying documentation covered by |
---|
15 | // this license (the "Software") to use, reproduce, display, distribute, |
---|
16 | // execute, and transmit the Software, and to prepare derivative works of the |
---|
17 | // Software, and to permit third-parties to whom the Software is furnished to |
---|
18 | // do so, all subject to the following: |
---|
19 | // |
---|
20 | // The copyright notices in the Software and this entire statement, including |
---|
21 | // the above license grant, this restriction and the following disclaimer, |
---|
22 | // must be included in all copies of the Software, in whole or in part, and |
---|
23 | // all derivative works of the Software, unless such copies or derivative |
---|
24 | // works are solely in the form of machine-executable object code generated by |
---|
25 | // a source language processor. |
---|
26 | // |
---|
27 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
28 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
29 | // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
30 | // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
31 | // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
32 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
33 | // DEALINGS IN THE SOFTWARE. |
---|
34 | // |
---|
35 | |
---|
36 | |
---|
37 | #include <Poco/RegularExpression.h> |
---|
38 | #include <Poco/Exception.h> |
---|
39 | #include <sstream> |
---|
40 | #if defined(POCO_UNBUNDLED) |
---|
41 | #include <pcre.h> |
---|
42 | #else |
---|
43 | #include "pcre.h" |
---|
44 | #endif |
---|
45 | |
---|
46 | |
---|
47 | namespace Poco { |
---|
48 | |
---|
49 | |
---|
50 | const int RegularExpression::OVEC_SIZE = 64; |
---|
51 | |
---|
52 | |
---|
53 | RegularExpression::RegularExpression(const std::string& pattern, int options, bool study): _pcre(0), _extra(0) |
---|
54 | { |
---|
55 | const char* error; |
---|
56 | int offs; |
---|
57 | _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0); |
---|
58 | if (!_pcre) |
---|
59 | { |
---|
60 | std::ostringstream msg; |
---|
61 | msg << error << " (at offset " << offs << ")"; |
---|
62 | throw RegularExpressionException(msg.str()); |
---|
63 | } |
---|
64 | if (study) |
---|
65 | _extra = pcre_study(_pcre, 0, &error); |
---|
66 | } |
---|
67 | |
---|
68 | |
---|
69 | RegularExpression::~RegularExpression() |
---|
70 | { |
---|
71 | if (_pcre) pcre_free(_pcre); |
---|
72 | if (_extra) pcre_free(_extra); |
---|
73 | } |
---|
74 | |
---|
75 | |
---|
76 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const |
---|
77 | { |
---|
78 | poco_assert (offset <= subject.length()); |
---|
79 | |
---|
80 | int ovec[OVEC_SIZE]; |
---|
81 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
82 | if (rc == PCRE_ERROR_NOMATCH) |
---|
83 | { |
---|
84 | mtch.offset = std::string::npos; |
---|
85 | mtch.length = 0; |
---|
86 | return 0; |
---|
87 | } |
---|
88 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
89 | { |
---|
90 | throw RegularExpressionException("bad option"); |
---|
91 | } |
---|
92 | else if (rc == 0) |
---|
93 | { |
---|
94 | throw RegularExpressionException("too many captured substrings"); |
---|
95 | } |
---|
96 | else if (rc < 0) |
---|
97 | { |
---|
98 | std::ostringstream msg; |
---|
99 | msg << "PCRE error " << rc; |
---|
100 | throw RegularExpressionException(msg.str()); |
---|
101 | } |
---|
102 | mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0]; |
---|
103 | mtch.length = ovec[1] - mtch.offset; |
---|
104 | return rc; |
---|
105 | } |
---|
106 | |
---|
107 | |
---|
108 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const |
---|
109 | { |
---|
110 | poco_assert (offset <= subject.length()); |
---|
111 | |
---|
112 | matches.clear(); |
---|
113 | |
---|
114 | int ovec[OVEC_SIZE]; |
---|
115 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
116 | if (rc == PCRE_ERROR_NOMATCH) |
---|
117 | { |
---|
118 | return 0; |
---|
119 | } |
---|
120 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
121 | { |
---|
122 | throw RegularExpressionException("bad option"); |
---|
123 | } |
---|
124 | else if (rc == 0) |
---|
125 | { |
---|
126 | throw RegularExpressionException("too many captured substrings"); |
---|
127 | } |
---|
128 | else if (rc < 0) |
---|
129 | { |
---|
130 | std::ostringstream msg; |
---|
131 | msg << "PCRE error " << rc; |
---|
132 | throw RegularExpressionException(msg.str()); |
---|
133 | } |
---|
134 | matches.reserve(rc); |
---|
135 | for (int i = 0; i < rc; ++i) |
---|
136 | { |
---|
137 | Match m; |
---|
138 | m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ; |
---|
139 | m.length = ovec[i*2 + 1] - m.offset; |
---|
140 | matches.push_back(m); |
---|
141 | } |
---|
142 | return rc; |
---|
143 | } |
---|
144 | |
---|
145 | |
---|
146 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const |
---|
147 | { |
---|
148 | Match mtch; |
---|
149 | match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY); |
---|
150 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
---|
151 | } |
---|
152 | |
---|
153 | |
---|
154 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const |
---|
155 | { |
---|
156 | Match mtch; |
---|
157 | match(subject, offset, mtch, options); |
---|
158 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
---|
159 | } |
---|
160 | |
---|
161 | |
---|
162 | int RegularExpression::extract(const std::string& subject, std::string& str, int options) const |
---|
163 | { |
---|
164 | Match mtch; |
---|
165 | int rc = match(subject, 0, mtch, options); |
---|
166 | if (mtch.offset != std::string::npos) |
---|
167 | str.assign(subject, mtch.offset, mtch.length); |
---|
168 | else |
---|
169 | str.clear(); |
---|
170 | return rc; |
---|
171 | } |
---|
172 | |
---|
173 | |
---|
174 | int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const |
---|
175 | { |
---|
176 | Match mtch; |
---|
177 | int rc = match(subject, offset, mtch, options); |
---|
178 | if (mtch.offset != std::string::npos) |
---|
179 | str.assign(subject, mtch.offset, mtch.length); |
---|
180 | else |
---|
181 | str.clear(); |
---|
182 | return rc; |
---|
183 | } |
---|
184 | |
---|
185 | |
---|
186 | int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const |
---|
187 | { |
---|
188 | MatchVec matches; |
---|
189 | strings.clear(); |
---|
190 | int rc = match(subject, offset, matches, options); |
---|
191 | strings.reserve(matches.size()); |
---|
192 | for (MatchVec::const_iterator it = matches.begin(); it != matches.end(); ++it) |
---|
193 | { |
---|
194 | if (it->offset != std::string::npos) |
---|
195 | strings.push_back(subject.substr(it->offset, it->length)); |
---|
196 | else |
---|
197 | strings.push_back(std::string()); |
---|
198 | } |
---|
199 | return rc; |
---|
200 | } |
---|
201 | |
---|
202 | |
---|
203 | int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
---|
204 | { |
---|
205 | if (options & RE_GLOBAL) |
---|
206 | { |
---|
207 | int rc = 0; |
---|
208 | std::string::size_type pos = substOne(subject, offset, replacement, options); |
---|
209 | while (pos != std::string::npos) |
---|
210 | { |
---|
211 | ++rc; |
---|
212 | pos = substOne(subject, pos, replacement, options); |
---|
213 | } |
---|
214 | return rc; |
---|
215 | } |
---|
216 | else |
---|
217 | { |
---|
218 | return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0; |
---|
219 | } |
---|
220 | } |
---|
221 | |
---|
222 | |
---|
223 | std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
---|
224 | { |
---|
225 | if (offset >= subject.length()) return std::string::npos; |
---|
226 | |
---|
227 | int ovec[OVEC_SIZE]; |
---|
228 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
229 | if (rc == PCRE_ERROR_NOMATCH) |
---|
230 | { |
---|
231 | return std::string::npos; |
---|
232 | } |
---|
233 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
234 | { |
---|
235 | throw RegularExpressionException("bad option"); |
---|
236 | } |
---|
237 | else if (rc == 0) |
---|
238 | { |
---|
239 | throw RegularExpressionException("too many captured substrings"); |
---|
240 | } |
---|
241 | else if (rc < 0) |
---|
242 | { |
---|
243 | std::ostringstream msg; |
---|
244 | msg << "PCRE error " << rc; |
---|
245 | throw RegularExpressionException(msg.str()); |
---|
246 | } |
---|
247 | std::string result; |
---|
248 | std::string::size_type len = subject.length(); |
---|
249 | std::string::size_type pos = 0; |
---|
250 | std::string::size_type rp = std::string::npos; |
---|
251 | while (pos < len) |
---|
252 | { |
---|
253 | if (ovec[0] == pos) |
---|
254 | { |
---|
255 | std::string::const_iterator it = replacement.begin(); |
---|
256 | std::string::const_iterator end = replacement.end(); |
---|
257 | while (it != end) |
---|
258 | { |
---|
259 | if (*it == '$' && !(options & RE_NO_VARS)) |
---|
260 | { |
---|
261 | ++it; |
---|
262 | if (it != end) |
---|
263 | { |
---|
264 | char d = *it; |
---|
265 | if (d >= '0' && d <= '9') |
---|
266 | { |
---|
267 | int c = d - '0'; |
---|
268 | if (c < rc) |
---|
269 | { |
---|
270 | int o = ovec[c*2]; |
---|
271 | int l = ovec[c*2 + 1] - o; |
---|
272 | result.append(subject, o, l); |
---|
273 | } |
---|
274 | } |
---|
275 | else |
---|
276 | { |
---|
277 | result += '$'; |
---|
278 | result += d; |
---|
279 | } |
---|
280 | ++it; |
---|
281 | } |
---|
282 | else result += '$'; |
---|
283 | } |
---|
284 | else result += *it++; |
---|
285 | } |
---|
286 | pos = ovec[1]; |
---|
287 | rp = result.length(); |
---|
288 | } |
---|
289 | else result += subject[pos++]; |
---|
290 | } |
---|
291 | subject = result; |
---|
292 | return rp; |
---|
293 | } |
---|
294 | |
---|
295 | |
---|
296 | bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options) |
---|
297 | { |
---|
298 | int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE); |
---|
299 | int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK); |
---|
300 | RegularExpression re(pattern, ctorOptions, false); |
---|
301 | return re.match(subject, 0, mtchOptions); |
---|
302 | } |
---|
303 | |
---|
304 | |
---|
305 | } // namespace Poco |
---|