00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038
00039
00040 #include "html/htmltokenizer.h"
00041 #include "html/html_documentimpl.h"
00042 #include "html/htmlparser.h"
00043 #include "html/dtd.h"
00044
00045 #include "misc/loader.h"
00046 #include "misc/htmlhashes.h"
00047
00048 #include "khtmlview.h"
00049 #include "khtml_part.h"
00050 #include "xml/dom_docimpl.h"
00051 #include "css/csshelper.h"
00052 #include "ecma/kjs_proxy.h"
00053 #include <kcharsets.h>
00054 #include <kglobal.h>
00055 #include <ctype.h>
00056 #include <assert.h>
00057 #include <qvariant.h>
00058 #include <kdebug.h>
00059 #include <stdlib.h>
00060
00061 #include "kentities.c"
00062
00063 using namespace khtml;
00064
00065 static const QChar commentStart [] = { '<','!','-','-', QChar::null };
00066
00067 static const char scriptEnd [] = "</script";
00068 static const char xmpEnd [] = "</xmp";
00069 static const char styleEnd [] = "</style";
00070 static const char textareaEnd [] = "</textarea";
00071 static const char titleEnd [] = "</title";
00072
// Raw QChar vector helpers built directly on the C allocator.
//
// KHTML_ALLOC_QCHAR_VEC(N)      yields an uninitialised vector of N QChars
//                               (or 0 when malloc fails).
// KHTML_REALLOC_QCHAR_VEC(P, N) resizes the vector held in the lvalue P to N
//                               QChars, stores the resulting address back in P
//                               and also yields it as the expression's value,
//                               so both "MACRO(p, n);" and "p = MACRO(p, n);"
//                               work. P must be a modifiable lvalue. As with
//                               plain realloc, a failed resize yields 0.
// KHTML_DELETE_QCHAR_VEC(P)     releases a vector obtained from the above.
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
#define KHTML_REALLOC_QCHAR_VEC(P, N ) ( (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) )
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
// fixUpChar(x): in-place replacement of characters in the 0x80..0x9F range.
//
// x must be an lvalue offering row() and cell() and accepting assignment from
// a character code. When row() is zero and cell() is one of the values listed
// below, x is overwritten with the given Unicode code point or ASCII
// stand-in; every other character is left untouched. The listed targets look
// like the Windows-1252 meanings of these bytes (e.g. 0x80 -> U+20AC,
// 0x99 -> U+2122), several of them approximated by plain ASCII.
//
// The macro expands to a bare "if" block, so always use it as a complete
// statement and never directly in front of an "else".
// Change "#if 0" to "#if 1" to turn the fix-up into a no-op.
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
    if (!(x).row() ) { \
        switch ((x).cell()) \
        { \
        case 0x80: (x) = 0x20ac; break; \
        case 0x82: (x) = ',';    break; \
        case 0x83: (x) = 0x0192; break; \
        case 0x84: (x) = '"';    break; \
        case 0x85: (x) = 0x2026; break; \
        case 0x86: (x) = 0x2020; break; \
        case 0x87: (x) = 0x2021; break; \
        case 0x88: (x) = 0x02C6; break; \
        case 0x89: (x) = 0x2030; break; \
        case 0x8A: (x) = 0x0160; break; \
        case 0x8B: (x) = '<';    break; \
        case 0x8C: (x) = 0x0152; break; \
        case 0x8E: (x) = 0x017D; break; \
        case 0x91: (x) = '\'';   break; \
        case 0x92: (x) = '\'';   break; \
        case 0x93: (x) = '"';    break; \
        case 0x94: (x) = '"';    break; \
        case 0x95: (x) = '*';    break; \
        case 0x96: (x) = '-';    break; \
        case 0x97: (x) = '-';    break; \
        case 0x98: (x) = '~';    break; \
        case 0x99: (x) = 0x2122; break; \
        case 0x9A: (x) = 0x0161; break; \
        case 0x9B: (x) = '>';    break; \
        case 0x9C: (x) = 0x0153; break; \
        case 0x9E: (x) = 0x017E; break; \
        case 0x9F: (x) = 0x0178; break; \
        default: break; \
        } \
    }
#endif
00132
00133
00134
00135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view)
00136 {
00137 view = _view;
00138 buffer = 0;
00139 scriptCode = 0;
00140 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00141 charsets = KGlobal::charsets();
00142 parser = new KHTMLParser(_view, _doc);
00143 m_executingScript = 0;
00144 onHold = false;
00145
00146 reset();
00147 }
00148
00149 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i)
00150 {
00151 view = 0;
00152 buffer = 0;
00153 scriptCode = 0;
00154 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00155 charsets = KGlobal::charsets();
00156 parser = new KHTMLParser( i, _doc );
00157 m_executingScript = 0;
00158 onHold = false;
00159
00160 reset();
00161 }
00162
00163 void HTMLTokenizer::reset()
00164 {
00165 assert(m_executingScript == 0);
00166 assert(onHold == false);
00167
00168 while (!cachedScript.isEmpty())
00169 cachedScript.dequeue()->deref(this);
00170
00171 if ( buffer )
00172 KHTML_DELETE_QCHAR_VEC(buffer);
00173 buffer = dest = 0;
00174 size = 0;
00175
00176 if ( scriptCode )
00177 KHTML_DELETE_QCHAR_VEC(scriptCode);
00178 scriptCode = 0;
00179 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
00180
00181 currToken.reset();
00182 }
00183
00184 void HTMLTokenizer::begin()
00185 {
00186 m_executingScript = 0;
00187 onHold = false;
00188 reset();
00189 size = 254;
00190 buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
00191 dest = buffer;
00192 tag = NoTag;
00193 pending = NonePending;
00194 discard = NoneDiscard;
00195 pre = false;
00196 prePos = 0;
00197 plaintext = false;
00198 xmp = false;
00199 processingInstruction = false;
00200 script = false;
00201 escaped = false;
00202 style = false;
00203 skipLF = false;
00204 select = false;
00205 comment = false;
00206 server = false;
00207 textarea = false;
00208 title = false;
00209 startTag = false;
00210 tquote = NoQuote;
00211 searchCount = 0;
00212 Entity = NoEntity;
00213 noMoreData = false;
00214 brokenComments = false;
00215 brokenServer = false;
00216 lineno = 0;
00217 scriptStartLineno = 0;
00218 tagStartLineno = 0;
00219 }
00220
00221 void HTMLTokenizer::processListing(DOMStringIt list)
00222 {
00223 bool old_pre = pre;
00224
00225
00226
00227
00228 if(!style) pre = true;
00229 prePos = 0;
00230
00231 while ( list.length() )
00232 {
00233 checkBuffer(3*TAB_SIZE);
00234
00235 if (skipLF && ( *list != '\n' ))
00236 {
00237 skipLF = false;
00238 }
00239
00240 if (skipLF)
00241 {
00242 skipLF = false;
00243 ++list;
00244 }
00245 else if (( *list == '\n' ) || ( *list == '\r' ))
00246 {
00247 if (discard == LFDiscard)
00248 {
00249
00250 discard = NoneDiscard;
00251 }
00252 else
00253 {
00254
00255 if (pending)
00256 addPending();
00257 pending = LFPending;
00258 }
00259
00260 if (*list == '\r')
00261 {
00262 skipLF = true;
00263 }
00264 ++list;
00265 }
00266 else if (( *list == ' ' ) || ( *list == '\t'))
00267 {
00268 if (pending)
00269 addPending();
00270 if (*list == ' ')
00271 pending = SpacePending;
00272 else
00273 pending = TabPending;
00274
00275 ++list;
00276 }
00277 else
00278 {
00279 discard = NoneDiscard;
00280 if (pending)
00281 addPending();
00282
00283 prePos++;
00284 *dest++ = *list;
00285 ++list;
00286 }
00287
00288 }
00289
00290 if ((pending == SpacePending) || (pending == TabPending))
00291 addPending();
00292 else
00293 pending = NonePending;
00294
00295 prePos = 0;
00296 pre = old_pre;
00297 }
00298
00299 void HTMLTokenizer::parseSpecial(DOMStringIt &src)
00300 {
00301 assert( textarea || title || !Entity );
00302 assert( !tag );
00303 assert( xmp+textarea+title+style+script == 1 );
00304 if (script)
00305 scriptStartLineno = lineno+src.lineCount();
00306
00307 if ( comment ) parseComment( src );
00308
00309 while ( src.length() ) {
00310 checkScriptBuffer();
00311 unsigned char ch = src->latin1();
00312 if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
00313 comment = true;
00314 parseComment( src );
00315 continue;
00316 }
00317 if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
00318 ++src;
00319 scriptCodeSize = scriptCodeResync-1;
00320 scriptCodeResync = 0;
00321 scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
00322 if ( script )
00323 scriptHandler();
00324 else {
00325 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00326 processToken();
00327 if ( style ) { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
00328 else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
00329 else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
00330 else if ( xmp ) { currToken.id = ID_XMP + ID_CLOSE_TAG; }
00331 processToken();
00332 style = script = style = textarea = title = xmp = false;
00333 tquote = NoQuote;
00334 scriptCodeSize = scriptCodeResync = 0;
00335 }
00336 return;
00337 }
00338
00339 if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
00340 scriptCodeSize >= searchStopperLen &&
00341 !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
00342 scriptCodeResync = scriptCodeSize-searchStopperLen+1;
00343 tquote = NoQuote;
00344 continue;
00345 }
00346 if ( scriptCodeResync && !escaped ) {
00347 if(ch == '\"')
00348 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
00349 else if(ch == '\'')
00350 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
00351 else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
00352 tquote = NoQuote;
00353 }
00354 escaped = ( !escaped && ch == '\\' );
00355 if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
00356 QChar *scriptCodeDest = scriptCode+scriptCodeSize;
00357 ++src;
00358 parseEntity(src,scriptCodeDest,true);
00359 scriptCodeSize = scriptCodeDest-scriptCode;
00360 }
00361 else {
00362 scriptCode[ scriptCodeSize++ ] = *src;
00363 ++src;
00364 }
00365 }
00366 }
00367
00368 void HTMLTokenizer::scriptHandler()
00369 {
00370 QString currentScriptSrc = scriptSrc;
00371 scriptSrc = QString::null;
00372
00373 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00374 QString exScript( buffer, dest-buffer );
00375
00376 processToken();
00377 currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
00378 processToken();
00379
00380 QString prependingSrc;
00381
00382 if ( !parser->skipMode() ) {
00383 CachedScript* cs = 0;
00384
00385
00386 if ( !currentScriptSrc.isEmpty() &&
00387 (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) ))
00388 cachedScript.enqueue(cs);
00389
00390 if (cs) {
00391 pendingSrc.prepend( QString(src.current(), src.length() ) );
00392 setSrc(QString::null);
00393 scriptCodeSize = scriptCodeResync = 0;
00394 cs->ref(this);
00395
00396 }
00397 else if (currentScriptSrc.isEmpty() && view && javascript ) {
00398 if ( !m_executingScript )
00399 pendingSrc.prepend( QString( src.current(), src.length() ) );
00400 else
00401 prependingSrc = QString( src.current(), src.length() );
00402
00403 setSrc(QString::null);
00404 scriptCodeSize = scriptCodeResync = 0;
00405 scriptExecution( exScript, QString::null, tagStartLineno );
00406 }
00407 }
00408
00409 script = false;
00410 scriptCodeSize = scriptCodeResync = 0;
00411
00412 if ( !m_executingScript && cachedScript.isEmpty() ) {
00413
00414 QString newStr = QString(src.current(), src.length());
00415 newStr += pendingSrc;
00416 setSrc(newStr);
00417 pendingSrc = QString::null;
00418 }
00419 else if ( !prependingSrc.isEmpty() )
00420 write( prependingSrc, false );
00421 }
00422
00423 void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
00424 int baseLine)
00425 {
00426 bool oldscript = script;
00427 m_executingScript++;
00428 script = false;
00429 QString url;
00430 if (scriptURL.isNull())
00431 url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL();
00432 else
00433 url = scriptURL;
00434
00435 view->part()->executeScript(url,baseLine,Node(),str);
00436 m_executingScript--;
00437 script = oldscript;
00438 }
00439
00440 void HTMLTokenizer::parseComment(DOMStringIt &src)
00441 {
00442 checkScriptBuffer(src.length());
00443 while ( src.length() ) {
00444 scriptCode[ scriptCodeSize++ ] = *src;
00445 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00446 qDebug("comment is now: *%s*",
00447 QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
00448 #endif
00449 if (src->unicode() == '>' &&
00450 ( ( brokenComments && !( script || style ) ) ||
00451 ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
00452 scriptCode[scriptCodeSize-2] == '-' ) ) ) {
00453 ++src;
00454 if ( !( script || xmp || textarea || style) ) {
00455 #ifdef COMMENTS_IN_DOM
00456 checkScriptBuffer();
00457 scriptCode[ scriptCodeSize ] = 0;
00458 scriptCode[ scriptCodeSize + 1 ] = 0;
00459 currToken.id = ID_COMMENT;
00460 processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
00461 processToken();
00462 currToken.id = ID_COMMENT + ID_CLOSE_TAG;
00463 processToken();
00464 #endif
00465 scriptCodeSize = 0;
00466 }
00467 comment = false;
00468 return;
00469 }
00470 ++src;
00471 }
00472 }
00473
00474 void HTMLTokenizer::parseServer(DOMStringIt &src)
00475 {
00476 checkScriptBuffer(src.length());
00477 while ( src.length() ) {
00478 scriptCode[ scriptCodeSize++ ] = *src;
00479 if (src->unicode() == '>' &&
00480 scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
00481 ++src;
00482 server = false;
00483 scriptCodeSize = 0;
00484 return;
00485 }
00486 ++src;
00487 }
00488 }
00489
00490 void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src)
00491 {
00492 char oldchar = 0;
00493 while ( src.length() )
00494 {
00495 unsigned char chbegin = src->latin1();
00496 if(chbegin == '\'') {
00497 tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
00498 }
00499 else if(chbegin == '\"') {
00500 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
00501 }
00502
00503
00504
00505 else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
00506 {
00507
00508 processingInstruction = false;
00509 ++src;
00510 discard=LFDiscard;
00511 return;
00512 }
00513 ++src;
00514 oldchar = chbegin;
00515 }
00516 }
00517
00518 void HTMLTokenizer::parseText(DOMStringIt &src)
00519 {
00520 while ( src.length() )
00521 {
00522
00523 checkBuffer();
00524
00525
00526 unsigned char chbegin = src->latin1();
00527
00528 if (skipLF && ( chbegin != '\n' ))
00529 {
00530 skipLF = false;
00531 }
00532
00533 if (skipLF)
00534 {
00535 skipLF = false;
00536 ++src;
00537 }
00538 else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
00539 {
00540 if (chbegin == '\r')
00541 skipLF = true;
00542
00543 *dest++ = '\n';
00544 ++src;
00545 }
00546 else {
00547 *dest++ = *src;
00548 ++src;
00549 }
00550 }
00551 }
00552
00553
00554 void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start)
00555 {
00556 if( start )
00557 {
00558 cBufferPos = 0;
00559 Entity = SearchEntity;
00560 }
00561
00562 while( src.length() )
00563 {
00564 ushort cc = src->unicode();
00565 switch(Entity) {
00566 case NoEntity:
00567 return;
00568
00569 break;
00570 case SearchEntity:
00571 if(cc == '#') {
00572 cBuffer[cBufferPos++] = cc;
00573 ++src;
00574 Entity = NumericSearch;
00575 }
00576 else
00577 Entity = EntityName;
00578
00579 break;
00580
00581 case NumericSearch:
00582 if(cc == 'x' || cc == 'X') {
00583 cBuffer[cBufferPos++] = cc;
00584 ++src;
00585 Entity = Hexadecimal;
00586 }
00587 else if(cc >= '0' && cc <= '9')
00588 Entity = Decimal;
00589 else
00590 Entity = SearchSemicolon;
00591
00592 break;
00593
00594 case Hexadecimal:
00595 {
00596 int uc = EntityChar.unicode();
00597 int ll = kMin(src.length(), 9-cBufferPos);
00598 while(ll--) {
00599 QChar csrc(src->lower());
00600 cc = csrc.cell();
00601
00602 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
00603 Entity = SearchSemicolon;
00604 break;
00605 }
00606 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
00607 cBuffer[cBufferPos++] = cc;
00608 ++src;
00609 }
00610 EntityChar = QChar(uc);
00611 if(cBufferPos == 9) Entity = SearchSemicolon;
00612 break;
00613 }
00614 case Decimal:
00615 {
00616 int uc = EntityChar.unicode();
00617 int ll = kMin(src.length(), 9-cBufferPos);
00618 while(ll--) {
00619 cc = src->cell();
00620
00621 if(src->row() || !(cc >= '0' && cc <= '9')) {
00622 Entity = SearchSemicolon;
00623 break;
00624 }
00625
00626 uc = uc * 10 + (cc - '0');
00627 cBuffer[cBufferPos++] = cc;
00628 ++src;
00629 }
00630 EntityChar = QChar(uc);
00631 if(cBufferPos == 9) Entity = SearchSemicolon;
00632 break;
00633 }
00634 case EntityName:
00635 {
00636 int ll = kMin(src.length(), 9-cBufferPos);
00637 while(ll--) {
00638 QChar csrc = *src;
00639 cc = csrc.cell();
00640
00641 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
00642 (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
00643 Entity = SearchSemicolon;
00644 break;
00645 }
00646
00647 cBuffer[cBufferPos++] = cc;
00648 ++src;
00649 }
00650 if(cBufferPos == 9) Entity = SearchSemicolon;
00651 if(Entity == SearchSemicolon) {
00652 if(cBufferPos > 1) {
00653 const entity *e = findEntity(cBuffer, cBufferPos);
00654 if(e)
00655 EntityChar = e->code;
00656
00657
00658 if(tag && EntityChar.unicode() > 255 && *src != ';')
00659 EntityChar = QChar::null;
00660 }
00661 }
00662 else
00663 break;
00664 }
00665 case SearchSemicolon:
00666
00667
00668
00669 fixUpChar(EntityChar);
00670
00671 if ( EntityChar != QChar::null ) {
00672 checkBuffer();
00673
00674 if (*src == ';')
00675 ++src;
00676
00677 src.push( EntityChar );
00678 } else {
00679 #ifdef TOKEN_DEBUG
00680 kdDebug( 6036 ) << "unknown entity!" << endl;
00681 #endif
00682 checkBuffer(10);
00683
00684 *dest++ = '&';
00685 for(unsigned int i = 0; i < cBufferPos; i++)
00686 dest[i] = cBuffer[i];
00687 dest += cBufferPos;
00688 Entity = NoEntity;
00689 if (pre)
00690 prePos += cBufferPos+1;
00691 }
00692
00693 Entity = NoEntity;
00694 EntityChar = QChar::null;
00695 return;
00696 };
00697 }
00698 }
00699
00700 void HTMLTokenizer::parseTag(DOMStringIt &src)
00701 {
00702 assert(!Entity );
00703
00704 while ( src.length() )
00705 {
00706 checkBuffer();
00707 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00708 uint l = 0;
00709 while(l < src.length() && (*(src.current()+l)).latin1() != '>')
00710 l++;
00711 qDebug("src is now: *%s*, tquote: %d",
00712 QConstString((QChar*)src.current(), l).string().latin1(), tquote);
00713 #endif
00714 switch(tag) {
00715 case NoTag:
00716 {
00717 return;
00718 }
00719 case TagName:
00720 {
00721 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00722 qDebug("TagName");
00723 #endif
00724 if (searchCount > 0)
00725 {
00726 if (*src == commentStart[searchCount])
00727 {
00728 searchCount++;
00729 if (searchCount == 4)
00730 {
00731 #ifdef TOKEN_DEBUG
00732 kdDebug( 6036 ) << "Found comment" << endl;
00733 #endif
00734
00735 ++src;
00736 dest = buffer;
00737 tag = NoTag;
00738
00739 comment = true;
00740
00741 checkScriptBuffer();
00742 scriptCode[0] = scriptCode[1] = '-';
00743 scriptCodeSize = 2;
00744 parseComment(src);
00745 return;
00746 }
00747
00748 cBuffer[cBufferPos++] = src->cell();
00749 ++src;
00750 break;
00751 }
00752 else
00753 searchCount = 0;
00754 }
00755
00756 bool finish = false;
00757 unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00758 while(ll--) {
00759 ushort curchar = *src;
00760 if(curchar <= ' ' || curchar == '>' ) {
00761 finish = true;
00762 break;
00763 }
00764
00765
00766
00767 char cc = curchar;
00768 cBuffer[cBufferPos++] = cc | 0x20;
00769 ++src;
00770 }
00771
00772
00773
00774 if(finish || CBUFLEN == cBufferPos) {
00775 bool beginTag;
00776 char* ptr = cBuffer;
00777 unsigned int len = cBufferPos;
00778 cBuffer[cBufferPos] = '\0';
00779 if ((cBufferPos > 0) && (*ptr == '/'))
00780 {
00781
00782 beginTag = false;
00783 ptr++;
00784 len--;
00785 }
00786 else
00787
00788 beginTag = true;
00789
00790 if(len > 1 && ptr[len-1] == '/' ) {
00791 ptr[--len] = '\0';
00792
00793 if (*src == '>')
00794 currToken.flat = true;
00795 }
00796
00797 uint tagID = khtml::getTagID(ptr, len);
00798 if (!tagID) {
00799 #ifdef TOKEN_DEBUG
00800 QCString tmp(ptr, len+1);
00801 kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
00802 #endif
00803 dest = buffer;
00804 }
00805 else
00806 {
00807 #ifdef TOKEN_DEBUG
00808 QCString tmp(ptr, len+1);
00809 kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
00810 #endif
00811 currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
00812 dest = buffer;
00813 }
00814 tag = SearchAttribute;
00815 cBufferPos = 0;
00816 }
00817 break;
00818 }
00819 case SearchAttribute:
00820 {
00821 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00822 qDebug("SearchAttribute");
00823 #endif
00824 bool atespace = false;
00825 ushort curchar;
00826 while(src.length()) {
00827 curchar = *src;
00828 if(curchar > ' ') {
00829 if(curchar == '>')
00830 tag = SearchEnd;
00831 else if(atespace && (curchar == '\'' || curchar == '"'))
00832 {
00833 tag = SearchValue;
00834 *dest++ = 0;
00835 attrName = QString::null;
00836 }
00837 else
00838 tag = AttributeName;
00839
00840 cBufferPos = 0;
00841 break;
00842 }
00843 atespace = true;
00844 ++src;
00845 }
00846 break;
00847 }
00848 case AttributeName:
00849 {
00850 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00851 qDebug("AttributeName");
00852 #endif
00853 ushort curchar;
00854 int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00855
00856 while(ll--) {
00857 curchar = *src;
00858 if(curchar <= '>') {
00859 if(curchar <= ' ' || curchar == '=' || curchar == '>') {
00860 unsigned int a;
00861 cBuffer[cBufferPos] = '\0';
00862 a = khtml::getAttrID(cBuffer, cBufferPos);
00863 if ( !a )
00864 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00865
00866 dest = buffer;
00867 *dest++ = a;
00868 #ifdef TOKEN_DEBUG
00869 if (!a || (cBufferPos && *cBuffer == '!'))
00870 kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
00871 else
00872 kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
00873 #endif
00874
00875 if (!a && cBufferPos == 1 && *cBuffer == '/' && curchar == '>')
00876 currToken.flat = true;
00877
00878 tag = SearchEqual;
00879 break;
00880 }
00881 }
00882 cBuffer[cBufferPos++] = (char) curchar | 0x20;
00883 ++src;
00884 }
00885 if ( cBufferPos == CBUFLEN ) {
00886 cBuffer[cBufferPos] = '\0';
00887 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00888 dest = buffer;
00889 *dest++ = 0;
00890 tag = SearchEqual;
00891 }
00892 break;
00893 }
00894 case SearchEqual:
00895 {
00896 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00897 qDebug("SearchEqual");
00898 #endif
00899 ushort curchar;
00900 bool atespace = false;
00901 while(src.length()) {
00902 curchar = src->unicode();
00903 if(curchar > ' ') {
00904 if(curchar == '=') {
00905 #ifdef TOKEN_DEBUG
00906 kdDebug(6036) << "found equal" << endl;
00907 #endif
00908 tag = SearchValue;
00909 ++src;
00910 }
00911 else if(atespace && (curchar == '\'' || curchar == '"'))
00912 {
00913 tag = SearchValue;
00914 *dest++ = 0;
00915 attrName = QString::null;
00916 }
00917 else {
00918 DOMString v("");
00919 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00920 dest = buffer;
00921 tag = SearchAttribute;
00922 }
00923 break;
00924 }
00925 atespace = true;
00926 ++src;
00927 }
00928 break;
00929 }
00930 case SearchValue:
00931 {
00932 ushort curchar;
00933 while(src.length()) {
00934 curchar = src->unicode();
00935 if(curchar > ' ') {
00936 if(( curchar == '\'' || curchar == '\"' )) {
00937 tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
00938 tag = QuotedValue;
00939 ++src;
00940 } else
00941 tag = Value;
00942
00943 break;
00944 }
00945 ++src;
00946 }
00947 break;
00948 }
00949 case QuotedValue:
00950 {
00951 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00952 qDebug("QuotedValue");
00953 #endif
00954 ushort curchar;
00955 while(src.length()) {
00956 checkBuffer();
00957
00958 curchar = src->unicode();
00959 if(curchar <= '\'' && !src.escaped()) {
00960
00961 if ( curchar == '&' )
00962 {
00963 ++src;
00964 parseEntity(src, dest, true);
00965 break;
00966 }
00967 else if ( (tquote == SingleQuote && curchar == '\'') ||
00968 (tquote == DoubleQuote && curchar == '\"') )
00969 {
00970
00971 while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
00972 dest--;
00973 DOMString v(buffer+1, dest-buffer-1);
00974 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00975
00976 dest = buffer;
00977 tag = SearchAttribute;
00978 tquote = NoQuote;
00979 ++src;
00980 break;
00981 }
00982 }
00983 *dest++ = *src;
00984 ++src;
00985 }
00986 break;
00987 }
00988 case Value:
00989 {
00990 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00991 qDebug("Value");
00992 #endif
00993 ushort curchar;
00994 while(src.length()) {
00995 checkBuffer();
00996 curchar = src->unicode();
00997 if(curchar <= '>' && !src.escaped()) {
00998
00999 if ( curchar == '&' )
01000 {
01001 ++src;
01002 parseEntity(src, dest, true);
01003 break;
01004 }
01005
01006
01007 if ( curchar <= ' ' || curchar == '>' )
01008 {
01009 DOMString v(buffer+1, dest-buffer-1);
01010 currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
01011 dest = buffer;
01012 tag = SearchAttribute;
01013 break;
01014 }
01015 }
01016
01017 *dest++ = *src;
01018 ++src;
01019 }
01020 break;
01021 }
01022 case SearchEnd:
01023 {
01024 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01025 qDebug("SearchEnd");
01026 #endif
01027 while(src.length()) {
01028 if(*src == '>')
01029 break;
01030
01031 if (*src == '/')
01032 currToken.flat = true;
01033
01034 ++src;
01035 }
01036 if(!src.length() && *src != '>') break;
01037
01038 searchCount = 0;
01039 tag = NoTag;
01040 tquote = NoQuote;
01041 ++src;
01042
01043 if ( !currToken.id )
01044 return;
01045
01046 uint tagID = currToken.id;
01047 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
01048 kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
01049 #endif
01050 bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
01051
01052 if(tagID >= ID_CLOSE_TAG)
01053 tagID -= ID_CLOSE_TAG;
01054 else if ( beginTag && tagID == ID_SCRIPT ) {
01055 AttributeImpl* a = 0;
01056 scriptSrc = scriptSrcCharset = QString::null;
01057 if ( currToken.attrs &&
01058 parser->doc()->view()->part()->jScriptEnabled() &&
01059 view
01060 ) {
01061 if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
01062 scriptSrc = parser->doc()->completeURL(khtml::parseURL( a->value() ).string() );
01063 if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
01064 scriptSrcCharset = a->value().string().stripWhiteSpace();
01065 if ( scriptSrcCharset.isEmpty() )
01066 scriptSrcCharset = parser->doc()->view()->part()->encoding();
01067 if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE )))
01068 a = currToken.attrs->getAttributeItem(ATTR_TYPE);
01069 }
01070 javascript = true;
01071 if( a ) {
01072 QString lang = a->value().string();
01073 lang = lang.lower();
01074 if( !lang.contains("javascript") &&
01075 !lang.contains("ecmascript") &&
01076 !lang.contains("livescript") &&
01077 !lang.contains("jscript") )
01078 javascript = false;
01079 }
01080 }
01081
01082 processToken();
01083
01084
01085 pre = parser->preMode();
01086
01087 switch( tagID ) {
01088 case ID_PRE:
01089 prePos = 0;
01090 break;
01091 case ID_SCRIPT:
01092 if (beginTag) {
01093 searchStopper = scriptEnd;
01094 searchStopperLen = 8;
01095 script = true;
01096 parseSpecial(src);
01097 }
01098 break;
01099 case ID_STYLE:
01100 if (beginTag) {
01101 searchStopper = styleEnd;
01102 searchStopperLen = 7;
01103 style = true;
01104 parseSpecial(src);
01105 }
01106 break;
01107 case ID_TEXTAREA:
01108 if(beginTag) {
01109 searchStopper = textareaEnd;
01110 searchStopperLen = 10;
01111 textarea = true;
01112 discard = AllDiscard;
01113 parseSpecial(src);
01114 }
01115 break;
01116 case ID_TITLE:
01117 if (beginTag) {
01118 searchStopper = titleEnd;
01119 searchStopperLen = 7;
01120 title = true;
01121 parseSpecial(src);
01122 }
01123 break;
01124 case ID_XMP:
01125 if (beginTag) {
01126 searchStopper = xmpEnd;
01127 searchStopperLen = 5;
01128 xmp = true;
01129 parseSpecial(src);
01130 }
01131 break;
01132 case ID_SELECT:
01133 select = beginTag;
01134 break;
01135 case ID_PLAINTEXT:
01136 plaintext = beginTag;
01137 break;
01138 }
01139 return;
01140 }
01141 }
01142 }
01143 return;
01144 }
01145
01146 void HTMLTokenizer::addPending()
01147 {
01148 if ( select && !(comment || script))
01149 {
01150 *dest++ = ' ';
01151 }
01152 else if ( textarea )
01153 {
01154 switch(pending) {
01155 case LFPending: *dest++ = '\n'; prePos = 0; break;
01156 case SpacePending: *dest++ = ' '; ++prePos; break;
01157 case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
01158 case NonePending:
01159 assert(0);
01160 }
01161 }
01162 else if ( pre )
01163 {
01164 int p;
01165
01166 switch (pending)
01167 {
01168 case SpacePending:
01169
01170 *dest++ = QChar(' ');
01171 prePos++;
01172 break;
01173
01174 case LFPending:
01175 *dest = '\n';
01176 dest++;
01177 prePos = 0;
01178 break;
01179
01180 case TabPending:
01181 p = TAB_SIZE - ( prePos % TAB_SIZE );
01182 for ( int x = 0; x < p; x++ )
01183 *dest++ = QChar(' ');
01184 prePos += p;
01185 break;
01186
01187 case NonePending:
01188 assert(0);
01189 break;
01190 }
01191 }
01192 else
01193 {
01194 *dest++ = ' ';
01195 }
01196
01197 pending = NonePending;
01198 }
01199
01200 void HTMLTokenizer::write( const QString &str, bool appendData )
01201 {
01202 #ifdef TOKEN_DEBUG
01203 kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
01204 #endif
01205
01206 if ( !buffer )
01207 return;
01208
01209 if ( ( m_executingScript && appendData ) ||
01210 ( !m_executingScript && cachedScript.count() ) ) {
01211
01212 pendingSrc += str;
01213 return;
01214 }
01215
01216 if ( onHold ) {
01217 QString rest = QString( src.current(), src.length() );
01218 rest += str;
01219 setSrc(rest);
01220 return;
01221 }
01222 else
01223 setSrc(str);
01224
01225
01226
01227
01228 while ( src.length() )
01229 {
01230
01231 checkBuffer();
01232
01233 ushort cc = src->unicode();
01234
01235 if (skipLF && (cc != '\n'))
01236 skipLF = false;
01237
01238 if (skipLF) {
01239 skipLF = false;
01240 ++src;
01241 }
01242 else if ( Entity )
01243 parseEntity( src, dest );
01244 else if ( plaintext )
01245 parseText( src );
01246 else if (script)
01247 parseSpecial(src);
01248 else if (style)
01249 parseSpecial(src);
01250 else if (xmp)
01251 parseSpecial(src);
01252 else if (textarea)
01253 parseSpecial(src);
01254 else if (title)
01255 parseSpecial(src);
01256 else if (comment)
01257 parseComment(src);
01258 else if (server)
01259 parseServer(src);
01260 else if (processingInstruction)
01261 parseProcessingInstruction(src);
01262 else if (tag)
01263 parseTag(src);
01264 else if ( startTag )
01265 {
01266 startTag = false;
01267
01268 switch(cc) {
01269 case '/':
01270 break;
01271 case '!':
01272 {
01273
01274 searchCount = 1;
01275
01276 break;
01277 }
01278 case '?':
01279 {
01280
01281 processingInstruction = true;
01282 tquote = NoQuote;
01283 parseProcessingInstruction(src);
01284 continue;
01285
01286 break;
01287 }
01288 case '%':
01289 if (!brokenServer) {
01290
01291 server = true;
01292 tquote = NoQuote;
01293 parseServer(src);
01294 continue;
01295 }
01296
01297 default:
01298 {
01299 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
01300 {
01301
01302 }
01303 else
01304 {
01305
01306
01307 if (pending)
01308 addPending();
01309 *dest = '<';
01310 dest++;
01311 continue;
01312 }
01313 }
01314 };
01315
01316 if ( pending ) {
01317
01318 if ( pre )
01319 addPending();
01320
01321
01322 else if ( !parser->selectMode() &&
01323 ( !parser->noSpaces() || dest > buffer )) {
01324 addPending();
01325 discard = AllDiscard;
01326 }
01327
01328 else
01329 pending = NonePending;
01330 }
01331
01332 processToken();
01333
01334 cBufferPos = 0;
01335 tag = TagName;
01336 parseTag(src);
01337 }
01338 else if ( cc == '&' && !src.escaped())
01339 {
01340 ++src;
01341 if ( pending )
01342 addPending();
01343 parseEntity(src, dest, true);
01344 }
01345 else if ( cc == '<' && !src.escaped())
01346 {
01347 tagStartLineno = lineno+src.lineCount();
01348 ++src;
01349 startTag = true;
01350 }
01351 else if (( cc == '\n' ) || ( cc == '\r' ))
01352 {
01353 if ( pre || textarea)
01354 {
01355 if (discard == LFDiscard || discard == AllDiscard)
01356 {
01357
01358 discard = NoneDiscard;
01359 }
01360 else
01361 {
01362
01363 if (pending)
01364 addPending();
01365 pending = LFPending;
01366 }
01367 }
01368 else
01369 {
01370 if (discard == LFDiscard)
01371 {
01372
01373 discard = NoneDiscard;
01374 }
01375 else if(discard == AllDiscard)
01376 {
01377 }
01378 else
01379 {
01380
01381 if (pending == NonePending)
01382 pending = LFPending;
01383 }
01384 }
01385
01386 if (cc == '\r')
01387 {
01388 skipLF = true;
01389 }
01390 ++src;
01391 }
01392 else if (( cc == ' ' ) || ( cc == '\t' ))
01393 {
01394 if ( pre || textarea)
01395 {
01396 if (discard == SpaceDiscard || discard == AllDiscard)
01397 {
01398
01399 discard = NoneDiscard;
01400 }
01401 else {
01402 if (pending)
01403 addPending();
01404 if (cc == ' ')
01405 pending = SpacePending;
01406 else
01407 pending = TabPending;
01408 }
01409 }
01410 else
01411 {
01412 if(discard == SpaceDiscard)
01413 discard = NoneDiscard;
01414 else if(discard == AllDiscard)
01415 { }
01416 else
01417 pending = SpacePending;
01418 }
01419 ++src;
01420 }
01421 else
01422 {
01423 if (pending)
01424 addPending();
01425
01426 discard = NoneDiscard;
01427 if ( pre )
01428 {
01429 prePos++;
01430 }
01431 *dest = *src;
01432 fixUpChar( *dest );
01433 ++dest;
01434 ++src;
01435 }
01436 }
01437 _src = QString::null;
01438
01439 if (noMoreData && cachedScript.isEmpty() && !m_executingScript )
01440 end();
01441 }
01442
01443 void HTMLTokenizer::end()
01444 {
01445 if ( buffer == 0 ) {
01446 emit finishedParsing();
01447 return;
01448 }
01449
01450
01451 if ( !tag )
01452 processToken();
01453
01454 if(buffer)
01455 KHTML_DELETE_QCHAR_VEC(buffer);
01456
01457 if(scriptCode)
01458 KHTML_DELETE_QCHAR_VEC(scriptCode);
01459
01460 scriptCode = 0;
01461 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01462 buffer = 0;
01463 emit finishedParsing();
01464 }
01465
01466 void HTMLTokenizer::finish()
01467 {
01468
01469 while((comment || server) && scriptCode && scriptCodeSize)
01470 {
01471
01472 if (comment)
01473 brokenComments = true;
01474 else
01475 brokenServer = true;
01476 checkScriptBuffer();
01477 scriptCode[ scriptCodeSize ] = 0;
01478 scriptCode[ scriptCodeSize + 1 ] = 0;
01479 int pos;
01480 QString food;
01481 if (script || style) {
01482 food.setUnicode(scriptCode, scriptCodeSize);
01483 }
01484 else if (server) {
01485 food = "<";
01486 food += QString(scriptCode, scriptCodeSize);
01487 }
01488 else {
01489 pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
01490 food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1);
01491 }
01492 KHTML_DELETE_QCHAR_VEC(scriptCode);
01493 scriptCode = 0;
01494 scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01495 comment = server = false;
01496 if ( !food.isEmpty() )
01497 write(food, true);
01498 }
01499
01500
01501 noMoreData = true;
01502 if (cachedScript.isEmpty() && !m_executingScript && !onHold)
01503 end();
01504 }
01505
01506 void HTMLTokenizer::processToken()
01507 {
01508 KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
01509 if (jsProxy)
01510 jsProxy->setEventHandlerLineno(tagStartLineno);
01511 if ( dest > buffer )
01512 {
01513 #ifdef TOKEN_DEBUG
01514 if(currToken.id) {
01515 qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
01516 assert(0);
01517 }
01518
01519 #endif
01520 currToken.text = new DOMStringImpl( buffer, dest - buffer );
01521 currToken.text->ref();
01522 currToken.id = ID_TEXT;
01523 }
01524 else if(!currToken.id) {
01525 currToken.reset();
01526 if (jsProxy)
01527 jsProxy->setEventHandlerLineno(lineno+src.lineCount());
01528 return;
01529 }
01530
01531 dest = buffer;
01532
01533 #ifdef TOKEN_DEBUG
01534 QString name = getTagName(currToken.id).string();
01535 QString text;
01536 if(currToken.text)
01537 text = QConstString(currToken.text->s, currToken.text->l).string();
01538
01539 kdDebug( 6036 ) << "Token --> " << name << " id = " << currToken.id << endl;
01540 if (currToken.flat)
01541 kdDebug( 6036 ) << "Token is FLAT!" << endl;
01542 if(!text.isNull())
01543 kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
01544 unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
01545 if(l) {
01546 kdDebug( 6036 ) << "Attributes: " << l << endl;
01547 for (unsigned long i = 0; i < l; ++i) {
01548 AttributeImpl* c = currToken.attrs->attributeItem(i);
01549 kdDebug( 6036 ) << " " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string()
01550 << "=\"" << c->value().string() << "\"" << endl;
01551 }
01552 }
01553 kdDebug( 6036 ) << endl;
01554 #endif
01555
01556 parser->parseToken(&currToken);
01557
01558 if ( currToken.flat && currToken.id != ID_TEXT && !parser->noSpaces() )
01559 discard = NoneDiscard;
01560 else if ( parser->selectMode() )
01561 discard = AllDiscard;
01562
01563 currToken.reset();
01564 if (jsProxy)
01565 jsProxy->setEventHandlerLineno(0);
01566 }
01567
01568
01569 HTMLTokenizer::~HTMLTokenizer()
01570 {
01571 reset();
01572 delete parser;
01573 }
01574
01575
01576 void HTMLTokenizer::enlargeBuffer(int len)
01577 {
01578 int newsize = kMax(size*2, size+len);
01579 int oldoffs = (dest - buffer);
01580
01581 buffer = (QChar*)realloc(buffer, newsize*sizeof(QChar));
01582 dest = buffer + oldoffs;
01583 size = newsize;
01584 }
01585
01586 void HTMLTokenizer::enlargeScriptBuffer(int len)
01587 {
01588 int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
01589 scriptCode = (QChar*)realloc(scriptCode, newsize*sizeof(QChar));
01590 scriptCodeMaxSize = newsize;
01591 }
01592
01593 void HTMLTokenizer::notifyFinished(CachedObject* )
01594 {
01595 assert(!cachedScript.isEmpty());
01596 bool done = false;
01597 while (!done && cachedScript.head()->isLoaded()) {
01598 #ifdef TOKEN_DEBUG
01599 kdDebug( 6036 ) << "Finished loading an external script" << endl;
01600 #endif
01601 CachedScript* cs = cachedScript.dequeue();
01602 done = cachedScript.isEmpty();
01603 DOMString scriptSource = cs->script();
01604 #ifdef TOKEN_DEBUG
01605 kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
01606 #endif
01607 setSrc(QString::null);
01608
01609
01610
01611 QString cachedScriptUrl( cs->url().string() );
01612 cs->deref(this);
01613
01614 scriptExecution( scriptSource.string(), cachedScriptUrl );
01615
01616
01617
01618
01619 if ( !script ) {
01620 QString rest = pendingSrc;
01621 pendingSrc = QString::null;
01622 write(rest, false);
01623
01624
01625 }
01626 }
01627 }
01628
01629 void HTMLTokenizer::setSrc(const QString& source)
01630 {
01631 lineno += src.lineCount();
01632 _src = source;
01633 src = DOMStringIt(_src);
01634 }
01635
01636 void HTMLTokenizer::setOnHold(bool _onHold)
01637 {
01638 if (onHold == _onHold) return;
01639 onHold = _onHold;
01640 if (onHold)
01641 setSrc(QString(src.current(), src.length()));
01642 }
01643