regexp.cpp

00001 // -*- c-basic-offset: 2 -*-
00002 /*
00003  *  This file is part of the KDE libraries
00004  *  Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
00005  *  Copyright (C) 2003,2004 Apple Computer, Inc.
00006  *  Copyright (C) 2006      Maksim Orlovich (maksim@kde.org)
00007  *
00008  *  This library is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU Lesser General Public
00010  *  License as published by the Free Software Foundation; either
00011  *  version 2 of the License, or (at your option) any later version.
00012  *
00013  *  This library is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  *  Lesser General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU Lesser General Public
00019  *  License along with this library; if not, write to the Free Software
00020  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00021  *
00022  */
00023 
00024 #include "regexp.h"
00025 
00026 #include "lexer.h"
00027 #include <assert.h>
00028 #include <stdio.h>
00029 #include <stdlib.h>
00030 #include <string.h>
00031 
00032 using namespace KJS;
00033 
00034 #ifdef PCRE_CONFIG_UTF8
00035 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
00036 #endif
00037 
00038 RegExp::RegExp(const UString &p, int f)
00039   : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
00040 {
00041   // Determine whether libpcre has unicode support if need be..
00042 #ifdef PCRE_CONFIG_UTF8
00043   if (utf8Support == Unknown) {
00044     int supported;
00045     pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
00046     utf8Support = supported ? Supported : Unsupported;
00047   }
00048 #endif
00049 
00050   nrSubPatterns = 0; // determined in match() with POSIX regex.
00051 
00052   // JS regexps can contain Unicode escape sequences (\uxxxx) which
00053   // are rather uncommon elsewhere. As our regexp libs don't understand
00054   // them we do the unescaping ourselves internally.
00055   // Also make sure to expand out any nulls as pcre_compile 
00056   // expects null termination..
00057   UString intern;
00058   const char* const nil = "\\x00";
00059   if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
00060     bool escape = false;
00061     for (int i = 0; i < p.size(); ++i) {
00062       UChar c = p[i];
00063       if (escape) {
00064         escape = false;
00065         // we only care about \u
00066         if (c == 'u') {
00067       // standard unicode escape sequence looks like \uxxxx but
00068       // other browsers also accept less then 4 hex digits
00069       unsigned short u = 0;
00070       int j = 0;
00071       for (j = 0; j < 4; ++j) {
00072         if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
00073           u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
00074           ++i;
00075         } else {
00076           // sequence incomplete. restore index.
00077           // TODO: cleaner way to propagate warning
00078           fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
00079           i -= j;
00080           break;
00081         }
00082       }
00083       if (j < 4) {
00084         // sequence was incomplete. treat \u as u which IE always
00085         // and FF sometimes does.
00086         intern.append(UString('u'));
00087       } else {
00088             c = UChar(u);
00089             switch (u) {
00090             case 0:
00091           // Make sure to encode 0, to avoid terminating the string
00092           intern += UString(nil);
00093           break;
00094             case '^':
00095             case '$':
00096             case '\\':
00097             case '.':
00098             case '*':
00099             case '+':
00100             case '?':
00101             case '(': case ')':
00102             case '{': case '}':
00103             case '[': case ']':
00104             case '|':
00105           // escape pattern characters have to remain escaped
00106           intern.append(UString('\\'));
00107           // intentional fallthrough
00108             default:
00109           intern += UString(&c, 1);
00110           break;
00111         }
00112           }
00113           continue;
00114         }
00115         intern += UString('\\');
00116         intern += UString(&c, 1);
00117       } else {
00118         if (c == '\\')
00119           escape = true;
00120         else if (c == '\0')
00121           intern += UString(nil);
00122         else
00123           intern += UString(&c, 1);
00124       }
00125     }
00126   } else {
00127     intern = p;
00128   }
00129 
00130 #ifdef HAVE_PCREPOSIX
00131   int pcreflags = 0;
00132   const char *perrormsg;
00133   int errorOffset;
00134 
00135   if (flgs & IgnoreCase)
00136     pcreflags |= PCRE_CASELESS;
00137 
00138   if (flgs & Multiline)
00139     pcreflags |= PCRE_MULTILINE;
00140 
00141 #ifdef PCRE_CONFIG_UTF8
00142   if (utf8Support == Supported)
00143     pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
00144 #endif
00145 
00146   // Fill our buffer with an encoded version, whether utf-8, or, 
00147   // if PCRE is incapable, truncated.
00148   prepareMatch(intern);
00149 
00150   pcregex = pcre_compile(buffer, pcreflags,
00151              &perrormsg, &errorOffset, NULL);
00152   doneMatch(); // Cleanup buffers
00153   if (!pcregex) {
00154 #ifndef NDEBUG
00155     fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
00156 #endif
00157     valid = false;
00158     return;
00159   }
00160 
00161 #ifdef PCRE_INFO_CAPTURECOUNT
00162   // Get number of subpatterns that will be returned
00163   int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00164   if (rc != 0)
00165 #endif
00166     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
00167 
00168 #else /* HAVE_PCREPOSIX */
00169 
00170   int regflags = 0;
00171 #ifdef REG_EXTENDED
00172   regflags |= REG_EXTENDED;
00173 #endif
00174 #ifdef REG_ICASE
00175   if ( f & IgnoreCase )
00176     regflags |= REG_ICASE;
00177 #endif
00178 
00179   //NOTE: Multiline is not feasible with POSIX regex.
00180   //if ( f & Multiline )
00181   //    ;
00182   // Note: the Global flag is already handled by RegExpProtoFunc::execute
00183 
00184   int errorCode = regcomp(&preg, intern.ascii(), regflags);
00185   if (errorCode != 0) {
00186 #ifndef NDEBUG
00187     char errorMessage[80];
00188     regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
00189     fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
00190 #endif
00191     valid = false;
00192   }
00193 #endif
00194 }
00195 
00196 RegExp::~RegExp()
00197 {
00198   doneMatch(); // Be 100% sure buffers are freed
00199 #ifdef HAVE_PCREPOSIX
00200   if (pcregex)
00201     pcre_free(pcregex);
00202 #else
00203   /* TODO: is this really okay after an error ? */
00204   regfree(&preg);
00205 #endif
00206 }
00207 
00208 void RegExp::prepareUtf8(const UString& s)
00209 {
00210   // Allocate a buffer big enough to hold all the characters plus \0
00211   const int length = s.size();
00212   buffer = new char[length * 3 + 1];
00213 
00214   // Also create buffer for positions. We need one extra character in there,
00215   // even past the \0 since the non-empty handling may jump one past the end
00216   originalPos = new int[length * 3 + 2];
00217 
00218   // Convert to runs of 8-bit characters, and generate indeces
00219   // Note that we do NOT combine surrogate pairs here, as 
00220   // regexps operate on them as separate characters
00221   char *p      = buffer;
00222   int  *posOut = originalPos;
00223   const UChar *d = s.data();
00224   for (int i = 0; i != length; ++i) {
00225     unsigned short c = d[i].unicode();
00226 
00227     int sequenceLen;
00228     if (c < 0x80) {
00229       *p++ = (char)c;
00230       sequenceLen = 1;
00231     } else if (c < 0x800) {
00232       *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
00233       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00234       sequenceLen = 2;
00235     } else {
00236       *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
00237       *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
00238       *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
00239       sequenceLen = 3;
00240     }
00241 
00242     while (sequenceLen > 0) {
00243       *posOut = i;
00244       ++posOut;
00245       --sequenceLen;
00246     }
00247   }
00248 
00249   bufferSize = p - buffer;
00250 
00251   *p++ = '\0';
00252 
00253   // Record positions for \0, and the fictional character after that.
00254   *posOut     = length;
00255   *(posOut+1) = length+1;
00256 }
00257 
00258 void RegExp::prepareASCII (const UString& s)
00259 {
00260   originalPos = 0;
00261 
00262   // Best-effort attempt to get something done
00263   // when we don't have utf 8 available -- use 
00264   // truncated version, and pray for the best 
00265   CString truncated = s.cstring();
00266   buffer = new char[truncated.size() + 1];
00267   memcpy(buffer, truncated.c_str(), truncated.size());
00268   buffer[truncated.size()] = '\0'; // For _compile use
00269   bufferSize = truncated.size();
00270 }
00271 
00272 void RegExp::prepareMatch(const UString &s)
00273 {
00274   delete[] originalPos; // Just to be sure..
00275   delete[] buffer;
00276 #ifdef PCRE_CONFIG_UTF8
00277   if (utf8Support == Supported)
00278     prepareUtf8(s);
00279   else
00280 #endif
00281     prepareASCII(s);
00282 
00283 #ifndef NDEBUG
00284   originalS = s;
00285 #endif
00286 }
00287 
00288 void RegExp::doneMatch() 
00289 {
00290   delete[] originalPos; originalPos = 0;
00291   delete[] buffer;      buffer      = 0;
00292 }
00293 
00294 UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
00295 {
00296 #ifndef NDEBUG
00297   assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
00298 #endif
00299   assert(valid);
00300 
00301   if (i < 0)
00302     i = 0;
00303   if (ovector)
00304     *ovector = 0L;
00305   int dummyPos;
00306   if (!pos)
00307     pos = &dummyPos;
00308   *pos = -1;
00309   if (i > s.size() || s.isNull())
00310     return UString::null;
00311 
00312 #ifdef HAVE_PCREPOSIX
00313   int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
00314   if (ovector) *ovector = new int[ovecsize];
00315   if (!pcregex)
00316     return UString::null;
00317 
00318   int startPos;
00319   int nextPos;
00320 
00321 #ifdef PCRE_CONFIG_UTF8
00322   if (utf8Support == Supported) {
00323     startPos = i;
00324     while (originalPos[startPos] < i)
00325       ++startPos;
00326 
00327     nextPos = startPos;
00328     while (originalPos[nextPos] < (i + 1))
00329       ++nextPos;
00330   } else
00331 #endif
00332   {
00333     startPos = i;
00334     nextPos  = i + 1;
00335   }
00336 
00337   int baseFlags =
00338 #ifdef PCRE_CONFIG_UTF8
00339     utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
00340 #endif
00341     0;
00342   if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
00343                 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
00344                 ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00345   {
00346     // Failed to match.
00347     if ((flgs & Global) && m_notEmpty && ovector)
00348     {
00349       // We set m_notEmpty ourselves, to look for a non-empty match
00350       // (see man pcretest or pcretest.c for details).
00351       // So we don't stop here, we want to try again at i+1.
00352 #ifdef KJS_VERBOSE
00353       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
00354 #endif
00355       m_notEmpty = 0;
00356       if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
00357                     ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
00358         return UString::null;
00359     }
00360     else // done
00361       return UString::null;
00362   }
00363 
00364   // Got a match, proceed with it.
00365   // But fix up the ovector if need be..
00366   if (ovector && originalPos) {
00367     for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
00368       if ((*ovector)[c] != -1)
00369         (*ovector)[c] = originalPos[(*ovector)[c]];
00370     }
00371   }
00372 
00373   if (!ovector)
00374     return UString::null; // don't rely on the return value if you pass ovector==0
00375 #else
00376   const uint maxMatch = 10;
00377   regmatch_t rmatch[maxMatch];
00378 
00379   char *str = strdup(s.ascii()); // TODO: why ???
00380   if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
00381     free(str);
00382     return UString::null;
00383   }
00384   free(str);
00385 
00386   if (!ovector) {
00387     *pos = rmatch[0].rm_so + i;
00388     return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
00389   }
00390 
00391   // map rmatch array to ovector used in PCRE case
00392   nrSubPatterns = 0;
00393   for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
00394     nrSubPatterns++;
00395     // if the nonEmpty flag is set, return a failed match if any of the
00396     // subMatches happens to be an empty string.
00397     if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo) 
00398       return UString::null;
00399   }
00400   // Allow an ovector slot to return the (failed) match result.
00401   if (nrSubPatterns == 0) nrSubPatterns = 1;
00402   
00403   int ovecsize = (nrSubPatterns)*3; // see above
00404   *ovector = new int[ovecsize];
00405   for (uint j = 0; j < nrSubPatterns; j++) {
00406       (*ovector)[2*j] = rmatch[j].rm_so + i;
00407       (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
00408   }
00409 #endif
00410 
00411   *pos = (*ovector)[0];
00412   if ( *pos == (*ovector)[1] && (flgs & Global) )
00413   {
00414     // empty match, next try will be with m_notEmpty=true
00415     m_notEmpty=true;
00416   }
00417   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
00418 }
00419 
00420 #if 0 // unused
00421 bool RegExp::test(const UString &s, int)
00422 {
00423 #ifdef HAVE_PCREPOSIX
00424   int ovector[300];
00425   CString buffer(s.cstring());
00426 
00427   if (s.isNull() ||
00428       pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
00429         0, ovector, 300) == PCRE_ERROR_NOMATCH)
00430     return false;
00431   else
00432     return true;
00433 
00434 #else
00435 
00436   char *str = strdup(s.ascii());
00437   int r = regexec(&preg, str, 0, 0, 0);
00438   free(str);
00439 
00440   return r == 0;
00441 #endif
00442 }
00443 #endif
KDE Home | KDE Accessibility Home | Description of Access Keys