[80] | 1 | // |
---|
| 2 | // RegularExpression.cpp |
---|
| 3 | // |
---|
| 4 | // $Id: //poco/1.3/Foundation/src/RegularExpression.cpp#3 $ |
---|
| 5 | // |
---|
| 6 | // Library: Foundation |
---|
| 7 | // Package: RegExp |
---|
| 8 | // Module: RegularExpression |
---|
| 9 | // |
---|
| 10 | // Copyright (c) 2004-2006, Applied Informatics Software Engineering GmbH. |
---|
| 11 | // and Contributors. |
---|
| 12 | // |
---|
| 13 | // Permission is hereby granted, free of charge, to any person or organization |
---|
| 14 | // obtaining a copy of the software and accompanying documentation covered by |
---|
| 15 | // this license (the "Software") to use, reproduce, display, distribute, |
---|
| 16 | // execute, and transmit the Software, and to prepare derivative works of the |
---|
| 17 | // Software, and to permit third-parties to whom the Software is furnished to |
---|
| 18 | // do so, all subject to the following: |
---|
| 19 | // |
---|
| 20 | // The copyright notices in the Software and this entire statement, including |
---|
| 21 | // the above license grant, this restriction and the following disclaimer, |
---|
| 22 | // must be included in all copies of the Software, in whole or in part, and |
---|
| 23 | // all derivative works of the Software, unless such copies or derivative |
---|
| 24 | // works are solely in the form of machine-executable object code generated by |
---|
| 25 | // a source language processor. |
---|
| 26 | // |
---|
| 27 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
---|
| 28 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
---|
| 29 | // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
---|
| 30 | // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
---|
| 31 | // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
---|
| 32 | // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
---|
| 33 | // DEALINGS IN THE SOFTWARE. |
---|
| 34 | // |
---|
| 35 | |
---|
| 36 | |
---|
| 37 | #include "Poco/RegularExpression.h" |
---|
| 38 | #include "Poco/Exception.h" |
---|
| 39 | #include <sstream> |
---|
| 40 | #if defined(POCO_UNBUNDLED) |
---|
| 41 | #include <pcre.h> |
---|
| 42 | #else |
---|
| 43 | #include "pcre.h" |
---|
| 44 | #endif |
---|
| 45 | |
---|
| 46 | |
---|
| 47 | namespace Poco { |
---|
| 48 | |
---|
| 49 | |
---|
// Number of int elements in the local output vectors (ovec) that the
// matching functions in this file hand to pcre_exec().
const int RegularExpression::OVEC_SIZE = 64;
---|
| 51 | |
---|
| 52 | |
---|
| 53 | RegularExpression::RegularExpression(const std::string& pattern, int options, bool study): _pcre(0), _extra(0) |
---|
| 54 | { |
---|
| 55 | const char* error; |
---|
| 56 | int offs; |
---|
| 57 | _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0); |
---|
| 58 | if (!_pcre) |
---|
| 59 | { |
---|
| 60 | std::ostringstream msg; |
---|
| 61 | msg << error << " (at offset " << offs << ")"; |
---|
| 62 | throw RegularExpressionException(msg.str()); |
---|
| 63 | } |
---|
| 64 | if (study) |
---|
| 65 | _extra = pcre_study(_pcre, 0, &error); |
---|
| 66 | } |
---|
| 67 | |
---|
| 68 | |
---|
| 69 | RegularExpression::~RegularExpression() |
---|
| 70 | { |
---|
| 71 | if (_pcre) pcre_free(_pcre); |
---|
| 72 | if (_extra) pcre_free(_extra); |
---|
| 73 | } |
---|
| 74 | |
---|
| 75 | |
---|
| 76 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, Match& mtch, int options) const |
---|
| 77 | { |
---|
| 78 | poco_assert (offset <= subject.length()); |
---|
| 79 | |
---|
| 80 | int ovec[OVEC_SIZE]; |
---|
| 81 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
| 82 | if (rc == PCRE_ERROR_NOMATCH) |
---|
| 83 | { |
---|
| 84 | mtch.offset = std::string::npos; |
---|
| 85 | mtch.length = 0; |
---|
| 86 | return 0; |
---|
| 87 | } |
---|
| 88 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
| 89 | { |
---|
| 90 | throw RegularExpressionException("bad option"); |
---|
| 91 | } |
---|
| 92 | else if (rc == 0) |
---|
| 93 | { |
---|
| 94 | throw RegularExpressionException("too many captured substrings"); |
---|
| 95 | } |
---|
| 96 | else if (rc < 0) |
---|
| 97 | { |
---|
| 98 | std::ostringstream msg; |
---|
| 99 | msg << "PCRE error " << rc; |
---|
| 100 | throw RegularExpressionException(msg.str()); |
---|
| 101 | } |
---|
| 102 | mtch.offset = ovec[0] < 0 ? std::string::npos : ovec[0]; |
---|
| 103 | mtch.length = ovec[1] - mtch.offset; |
---|
| 104 | return rc; |
---|
| 105 | } |
---|
| 106 | |
---|
| 107 | |
---|
| 108 | int RegularExpression::match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options) const |
---|
| 109 | { |
---|
| 110 | poco_assert (offset <= subject.length()); |
---|
| 111 | |
---|
| 112 | matches.clear(); |
---|
| 113 | |
---|
| 114 | int ovec[OVEC_SIZE]; |
---|
| 115 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
| 116 | if (rc == PCRE_ERROR_NOMATCH) |
---|
| 117 | { |
---|
| 118 | return 0; |
---|
| 119 | } |
---|
| 120 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
| 121 | { |
---|
| 122 | throw RegularExpressionException("bad option"); |
---|
| 123 | } |
---|
| 124 | else if (rc == 0) |
---|
| 125 | { |
---|
| 126 | throw RegularExpressionException("too many captured substrings"); |
---|
| 127 | } |
---|
| 128 | else if (rc < 0) |
---|
| 129 | { |
---|
| 130 | std::ostringstream msg; |
---|
| 131 | msg << "PCRE error " << rc; |
---|
| 132 | throw RegularExpressionException(msg.str()); |
---|
| 133 | } |
---|
| 134 | matches.reserve(rc); |
---|
| 135 | for (int i = 0; i < rc; ++i) |
---|
| 136 | { |
---|
| 137 | Match m; |
---|
| 138 | m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ; |
---|
| 139 | m.length = ovec[i*2 + 1] - m.offset; |
---|
| 140 | matches.push_back(m); |
---|
| 141 | } |
---|
| 142 | return rc; |
---|
| 143 | } |
---|
| 144 | |
---|
| 145 | |
---|
| 146 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset) const |
---|
| 147 | { |
---|
| 148 | Match mtch; |
---|
| 149 | match(subject, offset, mtch, RE_ANCHORED | RE_NOTEMPTY); |
---|
| 150 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
---|
| 151 | } |
---|
| 152 | |
---|
| 153 | |
---|
| 154 | bool RegularExpression::match(const std::string& subject, std::string::size_type offset, int options) const |
---|
| 155 | { |
---|
| 156 | Match mtch; |
---|
| 157 | match(subject, offset, mtch, options); |
---|
| 158 | return mtch.offset == offset && mtch.length == subject.length() - offset; |
---|
| 159 | } |
---|
| 160 | |
---|
| 161 | |
---|
| 162 | int RegularExpression::extract(const std::string& subject, std::string& str, int options) const |
---|
| 163 | { |
---|
| 164 | Match mtch; |
---|
| 165 | int rc = match(subject, 0, mtch, options); |
---|
| 166 | if (mtch.offset != std::string::npos) |
---|
| 167 | str.assign(subject, mtch.offset, mtch.length); |
---|
| 168 | else |
---|
| 169 | str.clear(); |
---|
| 170 | return rc; |
---|
| 171 | } |
---|
| 172 | |
---|
| 173 | |
---|
| 174 | int RegularExpression::extract(const std::string& subject, std::string::size_type offset, std::string& str, int options) const |
---|
| 175 | { |
---|
| 176 | Match mtch; |
---|
| 177 | int rc = match(subject, offset, mtch, options); |
---|
| 178 | if (mtch.offset != std::string::npos) |
---|
| 179 | str.assign(subject, mtch.offset, mtch.length); |
---|
| 180 | else |
---|
| 181 | str.clear(); |
---|
| 182 | return rc; |
---|
| 183 | } |
---|
| 184 | |
---|
| 185 | |
---|
| 186 | int RegularExpression::split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options) const |
---|
| 187 | { |
---|
| 188 | MatchVec matches; |
---|
| 189 | strings.clear(); |
---|
| 190 | int rc = match(subject, offset, matches, options); |
---|
| 191 | strings.reserve(matches.size()); |
---|
| 192 | for (MatchVec::const_iterator it = matches.begin(); it != matches.end(); ++it) |
---|
| 193 | { |
---|
| 194 | if (it->offset != std::string::npos) |
---|
| 195 | strings.push_back(subject.substr(it->offset, it->length)); |
---|
| 196 | else |
---|
| 197 | strings.push_back(std::string()); |
---|
| 198 | } |
---|
| 199 | return rc; |
---|
| 200 | } |
---|
| 201 | |
---|
| 202 | |
---|
| 203 | int RegularExpression::subst(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
---|
| 204 | { |
---|
| 205 | if (options & RE_GLOBAL) |
---|
| 206 | { |
---|
| 207 | int rc = 0; |
---|
| 208 | std::string::size_type pos = substOne(subject, offset, replacement, options); |
---|
| 209 | while (pos != std::string::npos) |
---|
| 210 | { |
---|
| 211 | ++rc; |
---|
| 212 | pos = substOne(subject, pos, replacement, options); |
---|
| 213 | } |
---|
| 214 | return rc; |
---|
| 215 | } |
---|
| 216 | else |
---|
| 217 | { |
---|
| 218 | return substOne(subject, offset, replacement, options) != std::string::npos ? 1 : 0; |
---|
| 219 | } |
---|
| 220 | } |
---|
| 221 | |
---|
| 222 | |
---|
| 223 | std::string::size_type RegularExpression::substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const |
---|
| 224 | { |
---|
| 225 | if (offset >= subject.length()) return std::string::npos; |
---|
| 226 | |
---|
| 227 | int ovec[OVEC_SIZE]; |
---|
| 228 | int rc = pcre_exec(_pcre, _extra, subject.c_str(), int(subject.size()), int(offset), options & 0xFFFF, ovec, OVEC_SIZE); |
---|
| 229 | if (rc == PCRE_ERROR_NOMATCH) |
---|
| 230 | { |
---|
| 231 | return std::string::npos; |
---|
| 232 | } |
---|
| 233 | else if (rc == PCRE_ERROR_BADOPTION) |
---|
| 234 | { |
---|
| 235 | throw RegularExpressionException("bad option"); |
---|
| 236 | } |
---|
| 237 | else if (rc == 0) |
---|
| 238 | { |
---|
| 239 | throw RegularExpressionException("too many captured substrings"); |
---|
| 240 | } |
---|
| 241 | else if (rc < 0) |
---|
| 242 | { |
---|
| 243 | std::ostringstream msg; |
---|
| 244 | msg << "PCRE error " << rc; |
---|
| 245 | throw RegularExpressionException(msg.str()); |
---|
| 246 | } |
---|
| 247 | std::string result; |
---|
| 248 | std::string::size_type len = subject.length(); |
---|
| 249 | std::string::size_type pos = 0; |
---|
| 250 | std::string::size_type rp = std::string::npos; |
---|
| 251 | while (pos < len) |
---|
| 252 | { |
---|
| 253 | if (ovec[0] == pos) |
---|
| 254 | { |
---|
| 255 | std::string::const_iterator it = replacement.begin(); |
---|
| 256 | std::string::const_iterator end = replacement.end(); |
---|
| 257 | while (it != end) |
---|
| 258 | { |
---|
| 259 | if (*it == '$' && !(options & RE_NO_VARS)) |
---|
| 260 | { |
---|
| 261 | ++it; |
---|
| 262 | if (it != end) |
---|
| 263 | { |
---|
| 264 | char d = *it; |
---|
| 265 | if (d >= '0' && d <= '9') |
---|
| 266 | { |
---|
| 267 | int c = d - '0'; |
---|
| 268 | if (c < rc) |
---|
| 269 | { |
---|
| 270 | int o = ovec[c*2]; |
---|
| 271 | int l = ovec[c*2 + 1] - o; |
---|
| 272 | result.append(subject, o, l); |
---|
| 273 | } |
---|
| 274 | } |
---|
| 275 | else |
---|
| 276 | { |
---|
| 277 | result += '$'; |
---|
| 278 | result += d; |
---|
| 279 | } |
---|
| 280 | ++it; |
---|
| 281 | } |
---|
| 282 | else result += '$'; |
---|
| 283 | } |
---|
| 284 | else result += *it++; |
---|
| 285 | } |
---|
| 286 | pos = ovec[1]; |
---|
| 287 | rp = result.length(); |
---|
| 288 | } |
---|
| 289 | else result += subject[pos++]; |
---|
| 290 | } |
---|
| 291 | subject = result; |
---|
| 292 | return rp; |
---|
| 293 | } |
---|
| 294 | |
---|
| 295 | |
---|
| 296 | bool RegularExpression::match(const std::string& subject, const std::string& pattern, int options) |
---|
| 297 | { |
---|
| 298 | int ctorOptions = options & (RE_CASELESS | RE_MULTILINE | RE_DOTALL | RE_EXTENDED | RE_ANCHORED | RE_DOLLAR_ENDONLY | RE_EXTRA | RE_UNGREEDY | RE_UTF8 | RE_NO_AUTO_CAPTURE); |
---|
| 299 | int mtchOptions = options & (RE_ANCHORED | RE_NOTBOL | RE_NOTEOL | RE_NOTEMPTY | RE_NO_AUTO_CAPTURE | RE_NO_UTF8_CHECK); |
---|
| 300 | RegularExpression re(pattern, ctorOptions, false); |
---|
| 301 | return re.match(subject, 0, mtchOptions); |
---|
| 302 | } |
---|
| 303 | |
---|
| 304 | |
---|
| 305 | } // namespace Poco |
---|