// kpimutils
// linklocator.cpp
// Go to the documentation of this file.
00001 /* 00002 Copyright (c) 2002 Dave Corrie <kde@davecorrie.com> 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public License 00015 along with this library; see the file COPYING.LIB. If not, write to 00016 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00017 Boston, MA 02110-1301, USA. 00018 */ 00030 #include "linklocator.h" 00031 00032 #include <kglobal.h> 00033 #include <kstandarddirs.h> 00034 #include <kcodecs.h> 00035 #include <kdebug.h> 00036 #include <kemoticons.h> 00037 00038 #include <QtCore/QCoreApplication> 00039 #include <QtCore/QFile> 00040 #include <QtCore/QRegExp> 00041 #include <QtGui/QTextDocument> 00042 00043 #include <limits.h> 00044 00045 using namespace KPIMUtils; 00046 00051 //@cond PRIVATE 00052 class KPIMUtils::LinkLocator::Private 00053 { 00054 public: 00055 int mMaxUrlLen; 00056 int mMaxAddressLen; 00057 }; 00058 //@endcond 00059 00060 // Use a static for this as calls to the KEmoticons constructor are expensive. 
00061 K_GLOBAL_STATIC( KEmoticons, sEmoticons ) 00062 00063 LinkLocator::LinkLocator( const QString &text, int pos ) 00064 : mText( text ), mPos( pos ), d( new KPIMUtils::LinkLocator::Private ) 00065 { 00066 d->mMaxUrlLen = 4096; 00067 d->mMaxAddressLen = 255; 00068 00069 // If you change either of the above values for maxUrlLen or 00070 // maxAddressLen, then please also update the documentation for 00071 // setMaxUrlLen()/setMaxAddressLen() in the header file AND the 00072 // default values used for the maxUrlLen/maxAddressLen parameters 00073 // of convertToHtml(). 00074 } 00075 00076 LinkLocator::~LinkLocator() 00077 { 00078 delete d; 00079 } 00080 00081 void LinkLocator::setMaxUrlLen( int length ) 00082 { 00083 d->mMaxUrlLen = length; 00084 } 00085 00086 int LinkLocator::maxUrlLen() const 00087 { 00088 return d->mMaxUrlLen; 00089 } 00090 00091 void LinkLocator::setMaxAddressLen( int length ) 00092 { 00093 d->mMaxAddressLen = length; 00094 } 00095 00096 int LinkLocator::maxAddressLen() const 00097 { 00098 return d->mMaxAddressLen; 00099 } 00100 00101 QString LinkLocator::getUrl() 00102 { 00103 QString url; 00104 if ( atUrl() ) { 00105 // NOTE: see http://tools.ietf.org/html/rfc3986#appendix-A and especially appendix-C 00106 // Appendix-C mainly says, that when extracting URLs from plain text, line breaks shall 00107 // be allowed and should be ignored when the URI is extracted. 
00108 00109 // This implementation follows this recommendation and 00110 // allows the URL to be enclosed within different kind of brackets/quotes 00111 // If an URL is enclosed, whitespace characters are allowed and removed, otherwise 00112 // the URL ends with the first whitespace 00113 // Also, if the URL is enclosed in brackets, the URL itself is not allowed 00114 // to contain the closing bracket, as this would be detected as the end of the URL 00115 00116 QChar beforeUrl, afterUrl; 00117 00118 // detect if the url has been surrounded by brackets or quotes 00119 if ( mPos > 0 ) { 00120 beforeUrl = mText[mPos - 1]; 00121 00122 if ( beforeUrl == '(' ) { 00123 afterUrl = ')'; 00124 } else if ( beforeUrl == '[' ) { 00125 afterUrl = ']'; 00126 } else if ( beforeUrl == '<' ) { 00127 afterUrl = '>'; 00128 } else if ( beforeUrl == '>' ) { // for e.g. <link>http://.....</link> 00129 afterUrl = '<'; 00130 } else if ( beforeUrl == '"' ) { 00131 afterUrl = '"'; 00132 } 00133 } 00134 00135 url.reserve( maxUrlLen() ); // avoid allocs 00136 int start = mPos; 00137 while ( ( mPos < (int)mText.length() ) && 00138 ( mText[mPos].isPrint() || mText[mPos].isSpace() ) && 00139 ( ( afterUrl.isNull() && !mText[mPos].isSpace() ) || 00140 ( !afterUrl.isNull() && mText[mPos] != afterUrl ) ) ) { 00141 if ( !mText[mPos].isSpace() ) { // skip whitespace 00142 url.append( mText[mPos] ); 00143 if ( url.length() > maxUrlLen() ) { 00144 break; 00145 } 00146 } 00147 00148 mPos++; 00149 } 00150 00151 if ( isEmptyUrl(url) || ( url.length() > maxUrlLen() ) ) { 00152 mPos = start; 00153 url = ""; 00154 } else { 00155 --mPos; 00156 } 00157 } 00158 00159 00160 // HACK: This is actually against the RFC. However, most people don't properly escape the URL in 00161 // their text with "" or <>. That leads to people writing an url, followed immediatley by 00162 // a dot to finish the sentence. That would lead the parser to include the dot in the url, 00163 // even though that is not wanted. 
So work around that here. 00164 // Most real-life URLs hopefully don't end with dots or commas. 00165 if ( url.length() > 1 ) { 00166 QList<QChar> wordBoundaries; 00167 wordBoundaries << '.' << ',' << ':' << '!' << '?'; 00168 if ( wordBoundaries.contains( url.at( url.length() - 1 ) ) ) { 00169 url.chop( 1 ); 00170 --mPos; 00171 } 00172 } 00173 00174 return url; 00175 } 00176 00177 // keep this in sync with KMMainWin::slotUrlClicked() 00178 bool LinkLocator::atUrl() const 00179 { 00180 // the following characters are allowed in a dot-atom (RFC 2822): 00181 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 00182 const QString allowedSpecialChars = QString( ".!#$%&'*+-/=?^_`{|}~" ); 00183 00184 // the character directly before the URL must not be a letter, a number or 00185 // any other character allowed in a dot-atom (RFC 2822). 00186 if ( ( mPos > 0 ) && 00187 ( mText[mPos-1].isLetterOrNumber() || 00188 ( allowedSpecialChars.indexOf( mText[mPos-1] ) != -1 ) ) ) { 00189 return false; 00190 } 00191 00192 QChar ch = mText[mPos]; 00193 return 00194 ( ch == 'h' && ( mText.mid( mPos, 7 ) == "http://" || 00195 mText.mid( mPos, 8 ) == "https://" ) ) || 00196 ( ch == 'v' && mText.mid( mPos, 6 ) == "vnc://" ) || 00197 ( ch == 'f' && ( mText.mid( mPos, 7 ) == "fish://" || 00198 mText.mid( mPos, 6 ) == "ftp://" || 00199 mText.mid( mPos, 7 ) == "ftps://" ) ) || 00200 ( ch == 's' && ( mText.mid( mPos, 7 ) == "sftp://" || 00201 mText.mid( mPos, 6 ) == "smb://" ) ) || 00202 ( ch == 'm' && mText.mid( mPos, 7 ) == "mailto:" ) || 00203 ( ch == 'w' && mText.mid( mPos, 4 ) == "www." ) || 00204 ( ch == 'f' && ( mText.mid( mPos, 4 ) == "ftp." 
|| 00205 mText.mid( mPos, 7 ) == "file://" ) ) || 00206 ( ch == 'n' && mText.mid( mPos, 5 ) == "news:" ); 00207 } 00208 00209 bool LinkLocator::isEmptyUrl( const QString &url ) const 00210 { 00211 return url.isEmpty() || 00212 url == "http://" || 00213 url == "https://" || 00214 url == "fish://" || 00215 url == "ftp://" || 00216 url == "ftps://" || 00217 url == "sftp://" || 00218 url == "smb://" || 00219 url == "vnc://" || 00220 url == "mailto" || 00221 url == "www" || 00222 url == "ftp" || 00223 url == "news" || 00224 url == "news://"; 00225 } 00226 00227 QString LinkLocator::getEmailAddress() 00228 { 00229 QString address; 00230 00231 if ( mText[mPos] == '@' ) { 00232 // the following characters are allowed in a dot-atom (RFC 2822): 00233 // a-z A-Z 0-9 . ! # $ % & ' * + - / = ? ^ _ ` { | } ~ 00234 const QString allowedSpecialChars = QString( ".!#$%&'*+-/=?^_`{|}~" ); 00235 00236 // determine the local part of the email address 00237 int start = mPos - 1; 00238 while ( start >= 0 && mText[start].unicode() < 128 && 00239 ( mText[start].isLetterOrNumber() || 00240 mText[start] == '@' || // allow @ to find invalid email addresses 00241 allowedSpecialChars.indexOf( mText[start] ) != -1 ) ) { 00242 if ( mText[start] == '@' ) { 00243 return QString(); // local part contains '@' -> no email address 00244 } 00245 --start; 00246 } 00247 ++start; 00248 // we assume that an email address starts with a letter or a digit 00249 while ( ( start < mPos ) && !mText[start].isLetterOrNumber() ) { 00250 ++start; 00251 } 00252 if ( start == mPos ) { 00253 return QString(); // local part is empty -> no email address 00254 } 00255 00256 // determine the domain part of the email address 00257 int dotPos = INT_MAX; 00258 int end = mPos + 1; 00259 while ( end < (int)mText.length() && 00260 ( mText[end].isLetterOrNumber() || 00261 mText[end] == '@' || // allow @ to find invalid email addresses 00262 mText[end] == '.' 
|| 00263 mText[end] == '-' ) ) { 00264 if ( mText[end] == '@' ) { 00265 return QString(); // domain part contains '@' -> no email address 00266 } 00267 if ( mText[end] == '.' ) { 00268 dotPos = qMin( dotPos, end ); // remember index of first dot in domain 00269 } 00270 ++end; 00271 } 00272 // we assume that an email address ends with a letter or a digit 00273 while ( ( end > mPos ) && !mText[end - 1].isLetterOrNumber() ) { 00274 --end; 00275 } 00276 if ( end == mPos ) { 00277 return QString(); // domain part is empty -> no email address 00278 } 00279 if ( dotPos >= end ) { 00280 return QString(); // domain part doesn't contain a dot 00281 } 00282 00283 if ( end - start > maxAddressLen() ) { 00284 return QString(); // too long -> most likely no email address 00285 } 00286 address = mText.mid( start, end - start ); 00287 00288 mPos = end - 1; 00289 } 00290 return address; 00291 } 00292 00293 QString LinkLocator::convertToHtml( const QString &plainText, int flags, 00294 int maxUrlLen, int maxAddressLen ) 00295 { 00296 LinkLocator locator( plainText ); 00297 locator.setMaxUrlLen( maxUrlLen ); 00298 locator.setMaxAddressLen( maxAddressLen ); 00299 00300 QString str; 00301 QString result( (QChar*)0, (int)locator.mText.length() * 2 ); 00302 QChar ch; 00303 int x; 00304 bool startOfLine = true; 00305 QString emoticon; 00306 00307 for ( locator.mPos = 0, x = 0; locator.mPos < (int)locator.mText.length(); 00308 locator.mPos++, x++ ) { 00309 ch = locator.mText[locator.mPos]; 00310 if ( flags & PreserveSpaces ) { 00311 if ( ch == ' ' ) { 00312 if ( locator.mPos + 1 < locator.mText.length() ) { 00313 if ( locator.mText[locator.mPos + 1] != ' ' ) { 00314 00315 // A single space, make it breaking if not at the start or end of the line 00316 const bool endOfLine = locator.mText[locator.mPos + 1] == '\n'; 00317 if ( !startOfLine && !endOfLine ) { 00318 result += ' '; 00319 } else { 00320 result += " "; 00321 } 00322 } else { 00323 00324 // Whitespace of more than one space, make it 
all non-breaking 00325 while ( locator.mPos < locator.mText.length() && locator.mText[locator.mPos] == ' ' ) { 00326 result += " "; 00327 locator.mPos++; 00328 x++; 00329 } 00330 00331 // We incremented once to often, undo that 00332 locator.mPos--; 00333 x--; 00334 } 00335 } else { 00336 // Last space in the text, it is non-breaking 00337 result += " "; 00338 } 00339 00340 if ( startOfLine ) { 00341 startOfLine = false; 00342 } 00343 continue; 00344 } else if ( ch == '\t' ) { 00345 do 00346 { 00347 result += " "; 00348 x++; 00349 } 00350 while ( ( x & 7 ) != 0 ); 00351 x--; 00352 startOfLine = false; 00353 continue; 00354 } 00355 } 00356 if ( ch == '\n' ) { 00357 result += "<br />\n"; // Keep the \n, so apps can figure out the quoting levels correctly. 00358 startOfLine = true; 00359 x = -1; 00360 continue; 00361 } 00362 00363 startOfLine = false; 00364 if ( ch == '&' ) { 00365 result += "&"; 00366 } else if ( ch == '"' ) { 00367 result += """; 00368 } else if ( ch == '<' ) { 00369 result += "<"; 00370 } else if ( ch == '>' ) { 00371 result += ">"; 00372 } else { 00373 const int start = locator.mPos; 00374 if ( !( flags & IgnoreUrls ) ) { 00375 str = locator.getUrl(); 00376 if ( !str.isEmpty() ) { 00377 QString hyperlink; 00378 if ( str.left( 4 ) == "www." ) { 00379 hyperlink = "http://" + str; 00380 } else if ( str.left( 4 ) == "ftp." 
) { 00381 hyperlink = "ftp://" + str; 00382 } else { 00383 hyperlink = str; 00384 } 00385 00386 result += "<a href=\"" + hyperlink + "\">" + Qt::escape( str ) + "</a>"; 00387 x += locator.mPos - start; 00388 continue; 00389 } 00390 str = locator.getEmailAddress(); 00391 if ( !str.isEmpty() ) { 00392 // len is the length of the local part 00393 int len = str.indexOf( '@' ); 00394 QString localPart = str.left( len ); 00395 00396 // remove the local part from the result (as '&'s have been expanded to 00397 // & we have to take care of the 4 additional characters per '&') 00398 result.truncate( result.length() - 00399 len - ( localPart.count( '&' ) * 4 ) ); 00400 x -= len; 00401 00402 result += "<a href=\"mailto:" + str + "\">" + str + "</a>"; 00403 x += str.length() - 1; 00404 continue; 00405 } 00406 } 00407 if ( flags & HighlightText ) { 00408 str = locator.highlightedText(); 00409 if ( !str.isEmpty() ) { 00410 result += str; 00411 x += locator.mPos - start; 00412 continue; 00413 } 00414 } 00415 result += ch; 00416 } 00417 } 00418 00419 if ( flags & ReplaceSmileys ) { 00420 QStringList exclude; 00421 exclude << "(c)" << "(C)" << ">:-(" << ">:(" << "(B)" << "(b)" << "(P)" << "(p)"; 00422 exclude << "(O)" << "(o)" << "(D)" << "(d)" << "(E)" << "(e)" << "(K)" << "(k)"; 00423 exclude << "(I)" << "(i)" << "(L)" << "(l)" << "(8)" << "(T)" << "(t)" << "(G)"; 00424 exclude << "(g)" << "(F)" << "(f)" << "(H)"; 00425 exclude << "8)" << "(N)" << "(n)" << "(Y)" << "(y)" << "(U)" << "(u)" << "(W)" << "(w)"; 00426 static QString cachedEmoticonsThemeName; 00427 if ( cachedEmoticonsThemeName.isEmpty() ) { 00428 cachedEmoticonsThemeName = KEmoticons::currentThemeName(); 00429 } 00430 result = 00431 sEmoticons->theme( cachedEmoticonsThemeName ).parseEmoticons( 00432 result, KEmoticonsTheme::StrictParse | KEmoticonsTheme::SkipHTML, exclude ); 00433 } 00434 00435 return result; 00436 } 00437 00438 QString LinkLocator::pngToDataUrl( const QString &iconPath ) 00439 { 00440 if ( 
iconPath.isEmpty() ) { 00441 return QString(); 00442 } 00443 00444 QFile pngFile( iconPath ); 00445 if ( !pngFile.open( QIODevice::ReadOnly | QIODevice::Unbuffered ) ) { 00446 return QString(); 00447 } 00448 00449 QByteArray ba = pngFile.readAll(); 00450 pngFile.close(); 00451 return QString::fromLatin1( "data:image/png;base64,%1" ).arg( ba.toBase64().constData() ); 00452 } 00453 00454 QString LinkLocator::highlightedText() 00455 { 00456 // formating symbols must be prepended with a whitespace 00457 if ( ( mPos > 0 ) && !mText[mPos-1].isSpace() ) { 00458 return QString(); 00459 } 00460 00461 const QChar ch = mText[mPos]; 00462 if ( ch != '/' && ch != '*' && ch != '_' ) { 00463 return QString(); 00464 } 00465 00466 QRegExp re = 00467 QRegExp( QString( "\\%1((\\w+)([\\s-']\\w+)*( ?[,.:\\?!;])?)\\%2" ).arg( ch ).arg( ch ) ); 00468 re.setMinimal(true); 00469 if ( re.indexIn( mText, mPos ) == mPos ) { 00470 int length = re.matchedLength(); 00471 // there must be a whitespace after the closing formating symbol 00472 if ( mPos + length < mText.length() && !mText[mPos + length].isSpace() ) { 00473 return QString(); 00474 } 00475 mPos += length - 1; 00476 switch ( ch.toLatin1() ) { 00477 case '*': 00478 return "<b>" + re.cap( 1 ) + "</b>"; 00479 case '_': 00480 return "<u>" + re.cap( 1 ) + "</u>"; 00481 case '/': 00482 return "<i>" + re.cap( 1 ) + "</i>"; 00483 } 00484 } 00485 return QString(); 00486 }