00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include "kcharsets.h"
00020
00021 #include "kqiodevicegzip_p.h"
00022 #include "kentities.c"
00023
00024 #include <kapplication.h>
00025 #include <kglobal.h>
00026 #include <klocale.h>
00027 #include <kconfig.h>
00028
00029 #include <qfontinfo.h>
00030 #include <qstrlist.h>
00031 #include <qfontdatabase.h>
00032 #include <kdebug.h>
00033
00034 #include <qtextcodec.h>
00035 #include <qmap.h>
00036 #include <qcstring.h>
00037 #include <qdir.h>
00038 #include <qregexp.h>
00039
00040 #include <assert.h>
00041
00042 static const char * const language_names[] = {
00043 I18N_NOOP( "Other" ),
00044 I18N_NOOP( "Arabic" ),
00045 I18N_NOOP( "Baltic" ),
00046 I18N_NOOP( "Central European" ),
00047 I18N_NOOP( "Chinese Simplified" ),
00048 I18N_NOOP( "Chinese Traditional" ),
00049 I18N_NOOP( "Cyrillic" ),
00050 I18N_NOOP( "Greek" ),
00051 I18N_NOOP( "Hebrew" ),
00052 I18N_NOOP( "Japanese" ),
00053 I18N_NOOP( "Korean" ),
00054 I18N_NOOP( "Thai" ),
00055 I18N_NOOP( "Turkish" ),
00056 I18N_NOOP( "Western European" ),
00057 I18N_NOOP( "Tamil" ),
00058 I18N_NOOP( "Unicode" ),
00059 I18N_NOOP( "Northern Saami" ),
00060 I18N_NOOP( "Vietnamese" ),
00061 I18N_NOOP( "South-Eastern Europe" )
00062 };
00063
00064
00065
/*
 * Encoding names reported by KCharsets::availableEncodingNames(), in the
 * order they are returned.  The list is terminated by a null pointer.
 */
static const char* const charsets_for_encoding[] = {
    /* KOI8 */
    "koi8-r", "koi8-u",
    /* ISO 8859 family */
    "iso 8859-1", "iso 8859-2", "iso 8859-3", "iso 8859-4",
    "iso 8859-5", "iso 8859-6", "iso 8859-7", "iso 8859-8",
    "iso 8859-8-i", "iso 8859-9", "iso 8859-11", "iso 8859-13",
    "iso 8859-14", "iso 8859-15", "iso 8859-16",
    /* Unicode */
    "utf8", "utf16", "iso-10646-ucs-2",
    /* Windows code pages */
    "cp 1250", "cp 1251", "cp 1252", "cp 1253", "cp 1254",
    "cp 1255", "cp 1256", "cp 1257", "cp 1258",
    /* IBM code pages */
    "ibm850", "ibm852", "ibm866",
    /* Thai */
    "tis620",
    /* Japanese */
    "eucjp", "sjis", "jis7",
    /* Chinese */
    "big5", "big5-hkscs", "gbk", "gb18030", "gb2312",
    /* Korean */
    "euckr",
    /* Tamil */
    "tscii",
    /* Miscellaneous */
    "winsami2", "cp 874", "maccyrillic",
    /* terminator */
    0
};
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
/*
 * Maps an encoding name (`index`) to a position in language_names (`data`).
 * The table is terminated by an entry whose `index` is null.  Encodings
 * that are not listed resolve to 0 ("Other") when looked up through
 * kcharsets_array_search().
 */
static struct LanguageForEncoding
{
    const char* index;   // encoding name
    int data;            // position in language_names
} const language_for_encoding[] = {
    { "iso 8859-1",      13 },   // Western European
    { "iso 8859-15",     13 },
    { "iso 8859-14",     13 },
    { "cp 1252",         13 },
    { "ibm850",          13 },
    { "iso 8859-2",       3 },   // Central European
    { "iso 8859-3",       3 },
    { "iso 8859-4",       2 },   // Baltic
    { "iso 8859-13",      2 },
    { "iso 8859-16",     18 },   // South-Eastern Europe
    { "cp 1250",          3 },   // Central European
    { "cp 1254",         12 },   // Turkish
    { "cp 1257",          2 },   // Baltic
    { "ibm852",           3 },   // Central European
    { "koi8-r",           6 },   // Cyrillic
    { "iso 8859-5",       6 },
    { "cp 1251",          6 },
    { "koi8-u",           6 },
    { "ibm866",           6 },
    { "big5",             5 },   // Chinese Traditional
    { "big5-hkscs",       5 },
    { "gb18030",          4 },   // Chinese Simplified
    { "gbk",              4 },
    { "gb2312",           4 },
    { "euckr",           10 },   // Korean
    { "sjis",             9 },   // Japanese
    { "jis7",             9 },
    { "eucjp",            9 },
    { "iso 8859-7",       7 },   // Greek
    { "cp 1253",          7 },
    { "iso 8859-6",       1 },   // Arabic
    { "cp 1256",          1 },
    { "iso 8859-8",       8 },   // Hebrew
    { "iso 8859-8-i",     8 },
    { "cp 1255",          8 },
    { "iso 8859-9",      12 },   // Turkish
    { "tis620",          11 },   // Thai
    { "iso 8859-11",     11 },
    { "cp 874",          11 },
    { "cp 1258",         17 },   // Vietnamese
    { "tscii",           14 },   // Tamil
    { "utf8",            15 },   // Unicode
    { "utf16",           15 },
    { "utf7",            15 },
    { "ucs2",            15 },
    { "iso-10646-ucs-2", 15 },
    { "winsami2",        16 },   // Northern Saami
    { "maccyrillic",      6 },   // Cyrillic
    { 0, 0 }                     // terminator
};
00191
00192
/*
 * Alternative spellings (`index`) of encodings and the name (`data`) that is
 * handed to QTextCodec::codecForName() when the spelling itself is not
 * recognised.  Keys are matched against the lower-cased request.  The table
 * is terminated by an entry whose `index` is null.
 */
static struct Builtin
{
    const char* index;   // spelling as requested (lower case)
    const char* data;    // name to try with QTextCodec
} const builtin[] = {
    /* KOI8 */
    { "iso-ir-111",        "koi8-r" },
    { "koi8-ru",           "koi8-u" },
    { "koi unified",       "koi8-r" },

    /* ASCII */
    { "us-ascii",          "iso 8859-1" },
    { "usascii",           "iso 8859-1" },
    { "ascii",             "iso 8859-1" },

    /* Unicode */
    { "x-utf-8",           "utf-8" },
    { "x-utf-7",           "utf-7" },
    { "unicode-1-1-utf-7", "utf-7" },
    { "utf-16",            "iso-10646-ucs-2" },
    { "utf16",             "iso-10646-ucs-2" },
    { "ucs2",              "iso-10646-ucs-2" },
    { "iso10646-1",        "iso-10646-ucs-2" },

    /* Chinese (GB) */
    { "gb18030.2000-1",    "gb18030" },
    { "gb18030.2000-0",    "gb18030" },
    { "gbk-0",             "gbk" },
    { "gb2312.1980-0",     "gbk" },
    { "gb_2312-80",        "gbk" },

    /* Korean */
    { "x-euc-kr",          "euckr" },

    /* Japanese */
    { "jisx0201.1976-0",   "eucjp" },
    { "jisx0208.1983-0",   "eucjp" },
    { "jisx0208.1990-0",   "eucjp" },
    { "jisx0208.1997-0",   "eucjp" },
    { "jisx0212.1990-0",   "eucjp" },
    { "jisx0213.2000-1",   "eucjp" },
    { "jisx0213.2000-2",   "eucjp" },

    /* "windowsNNN" */
    { "windows850",        "ibm850" },
    { "windows866",        "ibm866" },
    { "windows1251",       "cp 1251" },
    { "windows1252",       "cp 1252" },
    { "windows1253",       "cp 1253" },
    { "windows1254",       "cp 1254" },
    { "windows1255",       "cp 1255" },
    { "windows1256",       "cp 1256" },
    { "windows1257",       "cp 1257" },
    { "windows1258",       "cp 1258" },

    /* "windows-NNN" */
    { "windows-850",       "ibm850" },
    { "windows-866",       "ibm866" },

    /* "x-windows-NNN" */
    { "x-windows-850",     "ibm850" },
    { "x-windows-866",     "ibm866" },
    { "x-windows-1250",    "cp 1250" },
    { "x-windows-1251",    "cp 1251" },
    { "x-windows-1252",    "cp 1252" },
    { "x-windows-1253",    "cp 1253" },
    { "x-windows-1254",    "cp 1254" },
    { "x-windows-1255",    "cp 1255" },
    { "x-windows-1256",    "cp 1256" },
    { "x-windows-1257",    "cp 1257" },
    { "x-windows-1258",    "cp 1258" },

    /* "cpNNN" */
    { "cp819",             "iso 8859-1" },
    { "cp850",             "ibm850" },
    { "cp866",             "ibm866" },

    /* "cp-NNN" */
    { "cp-819",            "iso 8859-1" },
    { "cp-850",            "ibm850" },
    { "cp-866",            "ibm866" },
    { "cp-1250",           "cp 1250" },
    { "cp-1251",           "cp 1251" },
    { "cp-1252",           "cp 1252" },
    { "cp-1253",           "cp 1253" },
    { "cp-1254",           "cp 1254" },
    { "cp-1255",           "cp 1255" },
    { "cp-1256",           "cp 1256" },
    { "cp-1257",           "cp 1257" },
    { "cp-1258",           "cp 1258" },
    { "cp-10000",          "apple roman" },

    /* "x-cp-NNN" */
    { "x-cp-850",          "ibm850" },
    { "x-cp-866",          "ibm866" },
    { "x-cp-1250",         "cp 1250" },
    { "x-cp-1251",         "cp 1251" },
    { "x-cp-1252",         "cp 1252" },
    { "x-cp-1253",         "cp 1253" },
    { "x-cp-1254",         "cp 1254" },
    { "x-cp-1255",         "cp 1255" },
    { "x-cp-1256",         "cp 1256" },
    { "x-cp-1257",         "cp 1257" },
    { "x-cp-1258",         "cp 1258" },
    { "x-cp-10000",        "apple roman" },

    { "ibm819",            "iso 8859-1" },

    /* Thai */
    { "thai-tis620",       "iso 8859-11" },
    { "windows-874",       "cp 874" },
    { "windows874",        "cp 874" },
    { "x-windows-874",     "cp 874" },
    { "x-cp-874",          "cp 874" },
    { "ibm 874",           "cp 874" },
    { "ibm874",            "cp 874" },
    { "x-ibm874",          "cp 874" },

    /* Miscellaneous */
    { "ksc5601.1987-0",    "euckr" },
    { "x-winsami2",        "winsami2" },
    { "x-mac-roman",       "apple roman" },
    { "macintosh",         "apple roman" },
    { "mac",               "apple roman" },
    { "csiso2022jp",       "jis7" },
    { "big5-eten",         "big5-hkscs" },
    { "cp950",             "big5-hkscs" },
    { "x-mac-cyrillic",    "maccyrillic" },

    { 0, 0 }               // terminator
};
00295
00296
00297
/*
 * Spellings (`index`) that are replaced by another name (`data`) before
 * KCharsets::codecForName() searches the charmap directory for a file of
 * that name.  The table is terminated by an entry whose `index` is null.
 */
static struct Aliases
{
    const char* index;   // spelling as requested (lower case)
    const char* data;    // name used for the charmap file search
} const aliases[] = {
    { "cp852",         "ibm852" },
    { "cp-852",        "ibm852" },
    { "x-cp-852",      "ibm852" },
    { "windows852",    "ibm852" },
    { "windows-852",   "ibm852" },
    { "x-windows-852", "ibm852" },
    { 0, 0 }           // terminator
};
00310
00311
00312
00313
/*
 * Last-resort substitutions: when no codec can be found for `index`,
 * KCharsets::codecForName() tries the codec named by `data` instead.
 * The table is terminated by an entry whose `index` is null.
 */
static struct ConversionHints
{
    const char* index;   // encoding that could not be resolved
    const char* data;    // substitute encoding to try
} const conversion_hints[] = {
    { "cp1250",       "iso-8859-2" },
    { "koi8-r",       "iso-8859-5" },
    { "koi8-u",       "koi8-r" },

    { "pt 154",       "cp 1251" },
    { "paratype-154", "cp 1251" },
    { "pt-154",       "cp 1251" },
    { 0, 0 }          // terminator
};
00327
00328
00329
00330
/*
 * Linear lookup in one of the null-terminated tables above.
 *
 * T must provide a `const char* index` key and a `data` member convertible
 * to Data.  Walks the table from `start` until the entry whose `index` is
 * null and returns the `data` of the first entry whose key compares equal
 * to `entry` (via qstrcmp).  Returns 0 when nothing matches.
 */
template< typename T, typename Data >
static Data kcharsets_array_search( const T* start, const char* entry )
{
    const T* cur = start;
    while( cur->index != 0 ) {
        if( qstrcmp( cur->index, entry ) == 0 )
            return cur->data;
        ++cur;
    }
    return 0;
}
00341
00342
00343 class KCharsetsPrivate
00344 {
00345 public:
00346 KCharsetsPrivate(KCharsets* _kc)
00347 : codecForNameDict(43, false)
00348 {
00349 db = 0;
00350 kc = _kc;
00351 }
00352 ~KCharsetsPrivate()
00353 {
00354 delete db;
00355 }
00356 QFontDatabase *db;
00357 QAsciiDict<QTextCodec> codecForNameDict;
00358 KCharsets* kc;
00359 };
00360
00361
00362
00363 KCharsets::KCharsets()
00364 {
00365 d = new KCharsetsPrivate(this);
00366 }
00367
00368 KCharsets::~KCharsets()
00369 {
00370 delete d;
00371 }
00372
00373 QChar KCharsets::fromEntity(const QString &str)
00374 {
00375 QChar res = QChar::null;
00376
00377 int pos = 0;
00378 if(str[pos] == '&') pos++;
00379
00380
00381 if (str[pos] == '#' && str.length()-pos > 1) {
00382 bool ok;
00383 pos++;
00384 if (str[pos] == 'x' || str[pos] == 'X') {
00385 pos++;
00386
00387 QString tmp(str.unicode()+pos, str.length()-pos);
00388 res = tmp.toInt(&ok, 16);
00389 } else {
00390
00391 QString tmp(str.unicode()+pos, str.length()-pos);
00392 res = tmp.toInt(&ok, 10);
00393 }
00394 return res;
00395 }
00396
00397 const entity *e = kde_findEntity(str.ascii(), str.length());
00398
00399 if(!e)
00400 {
00401
00402 return QChar::null;
00403 }
00404
00405
00406 return QChar(e->code);
00407 }
00408
00409 QChar KCharsets::fromEntity(const QString &str, int &len)
00410 {
00411
00412
00413 len = 8;
00414 while(len > 0)
00415 {
00416 QString tmp = str.left(len);
00417 QChar res = fromEntity(tmp);
00418 if( res != QChar::null ) return res;
00419 len--;
00420 }
00421 return QChar::null;
00422 }
00423
00424
00425 QString KCharsets::toEntity(const QChar &ch)
00426 {
00427 QString ent;
00428 ent.sprintf("�x%x;", ch.unicode());
00429 return ent;
00430 }
00431
00432 QString KCharsets::resolveEntities( const QString &input )
00433 {
00434 QString text = input;
00435 const QChar *p = text.unicode();
00436 const QChar *end = p + text.length();
00437 const QChar *ampersand = 0;
00438 bool scanForSemicolon = false;
00439
00440 for ( ; p < end; ++p ) {
00441 const QChar ch = *p;
00442
00443 if ( ch == '&' ) {
00444 ampersand = p;
00445 scanForSemicolon = true;
00446 continue;
00447 }
00448
00449 if ( ch != ';' || scanForSemicolon == false )
00450 continue;
00451
00452 assert( ampersand );
00453
00454 scanForSemicolon = false;
00455
00456 const QChar *entityBegin = ampersand + 1;
00457
00458 const uint entityLength = p - entityBegin;
00459 if ( entityLength == 0 )
00460 continue;
00461
00462 const QChar entityValue = KCharsets::fromEntity( QConstString( entityBegin, entityLength ).string() );
00463 if ( entityValue.isNull() )
00464 continue;
00465
00466 const uint ampersandPos = ampersand - text.unicode();
00467
00468 text[ (int)ampersandPos ] = entityValue;
00469 text.remove( ampersandPos + 1, entityLength + 1 );
00470 p = text.unicode() + ampersandPos;
00471 end = text.unicode() + text.length();
00472 ampersand = 0;
00473 }
00474
00475 return text;
00476 }
00477
00478 QStringList KCharsets::availableEncodingNames()
00479 {
00480 QStringList available;
00481 for ( const char* const* pos = charsets_for_encoding; *pos; ++pos ) {
00482
00483 available.append( QString::fromLatin1( *pos ));
00484 }
00485 return available;
00486 }
00487
00488 QString KCharsets::languageForEncoding( const QString &encoding )
00489 {
00490 int lang = kcharsets_array_search< LanguageForEncoding, int >
00491 ( language_for_encoding, encoding.latin1());
00492 return i18n( language_names[lang] );
00493 }
00494
00495 QString KCharsets::encodingForName( const QString &descriptiveName )
00496 {
00497 const int left = descriptiveName.findRev( '(' );
00498
00499 if (left<0)
00500 return descriptiveName.stripWhiteSpace();
00501
00502 QString name(descriptiveName.mid(left+1));
00503
00504 const int right = name.findRev( ')' );
00505
00506 if (right<0)
00507 return name;
00508
00509 return name.left(right).stripWhiteSpace();
00510 }
00511
00512 QStringList KCharsets::descriptiveEncodingNames()
00513 {
00514
00515 QStringList encodings;
00516 for ( const LanguageForEncoding* pos = language_for_encoding; pos->index; ++pos ) {
00517 const QString name = QString::fromLatin1( pos->index );
00518 const QString description = i18n( language_names[ pos->data ] );
00519 encodings.append( i18n("Descriptive Encoding Name", "%1 ( %2 )"). arg ( description ). arg( name ) );
00520 }
00521 encodings.sort();
00522 return encodings;
00523 }
00524
00525 QTextCodec *KCharsets::codecForName(const QString &n) const
00526 {
00527 bool b;
00528 return codecForName( n, b );
00529 }
00530
00531 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const
00532 {
00533 ok = true;
00534
00535 QTextCodec* codec = 0;
00536
00537 if((codec = d->codecForNameDict[n.isEmpty() ? "->locale<-" : n.latin1()]))
00538 return codec;
00539
00540 if (n.isEmpty()) {
00541 codec = KGlobal::locale()->codecForEncoding();
00542 d->codecForNameDict.replace("->locale<-", codec);
00543 return codec;
00544 }
00545
00546 QCString name = n.lower().latin1();
00547 QCString key = name;
00548 if (name.right(8) == "_charset")
00549 name.truncate(name.length()-8);
00550
00551 if (name.isEmpty()) {
00552 ok = false;
00553 return QTextCodec::codecForName("iso8859-1");
00554 }
00555
00556 codec = QTextCodec::codecForName(name);
00557
00558 if(codec) {
00559 d->codecForNameDict.replace(key, codec);
00560 return codec;
00561 }
00562
00563
00564
00565 QCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data());
00566
00567 if(!cname.isEmpty())
00568 codec = QTextCodec::codecForName(cname);
00569
00570 if(codec)
00571 {
00572 d->codecForNameDict.replace(key, codec);
00573 return codec;
00574 }
00575
00576 QString dir;
00577 {
00578 KConfigGroupSaver cfgsav( KGlobal::config(), "i18n" );
00579 dir = KGlobal::config()->readPathEntry("i18ndir", QString::fromLatin1("/usr/share/i18n/charmaps"));
00580 }
00581
00582
00583
00584 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00585
00586 if(cname.isEmpty())
00587 cname = name;
00588 cname = cname.upper();
00589
00590 const QString basicName = QString::fromLatin1(cname);
00591 kdDebug() << k_funcinfo << endl << " Trying to find " << cname << " in " << dir << endl;
00592
00593 QString charMapFileName;
00594 bool gzipped = false;
00595 QDir qdir(dir);
00596 if (!qdir.exists()) {
00597
00598 }
00599 else if (qdir.exists(basicName, false)) {
00600 charMapFileName = basicName;
00601 }
00602 else if (qdir.exists(basicName+".gz", false)) {
00603 charMapFileName = basicName + ".gz";
00604 gzipped = true;
00605 }
00606 else {
00607
00608
00609
00610 QRegExp regexp("^(X-)?(CP|IBM)(-| )?(0-9)+");
00611 if ( regexp.search(basicName) != -1) {
00612 const QString num = regexp.cap(4);
00613 if (num.isEmpty()) {
00614
00615 }
00616 else if (qdir.exists("IBM"+num)) {
00617 charMapFileName = "IBM"+num;
00618 }
00619 else if (qdir.exists("IBM"+num+".gz")) {
00620 charMapFileName = "IBM"+num+".gz";
00621 gzipped = true;
00622 }
00623 else if (qdir.exists("CP"+num)) {
00624 charMapFileName = "CP"+num;
00625 }
00626 else if (qdir.exists("CP"+num+".gz")) {
00627 charMapFileName = "CP"+num+".gz";
00628 gzipped = true;
00629 }
00630 }
00631 }
00632
00633 if (gzipped && !charMapFileName.isEmpty()) {
00634 KQIODeviceGZip gzip(dir + "/" + charMapFileName);
00635 if (gzip.open(IO_ReadOnly)) {
00636 kdDebug() << "Loading gzipped charset..." << endl;
00637 codec = QTextCodec::loadCharmap(&gzip);
00638 gzip.close();
00639 }
00640 else
00641 kdWarning() << "Could not open gzipped charset!" << endl;
00642 }
00643 else if (!charMapFileName.isEmpty()) {
00644 codec = QTextCodec::loadCharmapFile(dir + "/" + charMapFileName);
00645 }
00646
00647 if(codec) {
00648 d->codecForNameDict.replace(key, codec);
00649 return codec;
00650 }
00651
00652
00653
00654 cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (const char*)name.data() );
00655
00656 if(!cname.isEmpty())
00657 codec = QTextCodec::codecForName(cname);
00658
00659 if(codec) {
00660 d->codecForNameDict.replace(key, codec);
00661 return codec;
00662 }
00663
00664
00665 ok = false;
00666 return QTextCodec::codecForName("iso8859-1");
00667 }