ispell_checker.cpp

00001 /* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
00002 /* kspell2 - adopted from Enchant
00003  * Copyright (C) 2003 Dom Lachowicz
00004  * Copyright (C) 2004 Zack Rusin <zack@kde.org>
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the
00018  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019  * Boston, MA 02110-1301, USA.
00020  *
00021  * In addition, as a special exception, Dom Lachowicz
00022  * gives permission to link the code of this program with
00023  * non-LGPL Spelling Provider libraries (eg: a MSFT Office
00024  * spell checker backend) and distribute linked combinations including
00025  * the two.  You must obey the GNU Lesser General Public License in all
00026  * respects for all of the code used other than said providers.  If you modify
00027  * this file, you may extend this exception to your version of the
00028  * file, but you are not obligated to do so.  If you do not wish to
00029  * do so, delete this exception statement from your version.
00030  */
00031 
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <string.h>
00035 
00036 #include <string>
00037 #include <vector>
00038 
00039 #include "sp_spell.h"
00040 #include "ispell_checker.h"
00041 
00042 #include <qmap.h>
00043 #include <qdir.h>
00044 #include <qfileinfo.h>
00045 
00046 /***************************************************************************/
00047 
/**
 * One row of the language lookup table: ties a language tag to the
 * ispell hash file that serves it and to the character encoding used
 * when talking to that dictionary.
 */
struct str_ispell_map
{
    const char * lang;  /* language tag, compared with strcmp() against the requested language */
    const char * dict;  /* file name of the hash file, looked up in the ispell directories */
    const char * enc;   /* encoding name handed to QTextCodec::codecForName() */
};

typedef struct str_ispell_map IspellMap;
00054 
/*
 * Directories that are searched, in this order, for ispell hash files.
 * The list ends with a null pointer so it can be walked without a count.
 */
static const char *ispell_dirs [] =
{
    "/usr/lib/ispell",
    "/usr/local/lib/ispell",
    "/usr/local/share/ispell",
    "/usr/share/ispell",
    0   /* terminator */
};
00062 static const IspellMap ispell_map [] = {
00063     {"ca"    ,"catala.hash"         ,"iso-8859-1" },
00064     {"ca_ES" ,"catala.hash"         ,"iso-8859-1" },
00065     {"cs"    ,"czech.hash"          ,"iso-8859-2" },
00066     {"cs_CZ" ,"czech.hash"          ,"iso-8859-2" },
00067     {"da"    ,"dansk.hash"          ,"iso-8859-1" },
00068     {"da_DK" ,"dansk.hash"          ,"iso-8859-1" },
00069     {"de"    ,"deutsch.hash"        ,"iso-8859-1" },
00070     {"de_CH" ,"swiss.hash"          ,"iso-8859-1" },
00071     {"de_AT" ,"deutsch.hash"        ,"iso-8859-1" },
00072     {"de_DE" ,"deutsch.hash"        ,"iso-8859-1" },
00073     {"el"    ,"ellhnika.hash"       ,"iso-8859-7" },
00074     {"el_GR" ,"ellhnika.hash"       ,"iso-8859-7" },
00075     {"en"    ,"british.hash"        ,"iso-8859-1" },
00076     {"en_AU" ,"british.hash"        ,"iso-8859-1" },
00077     {"en_BZ" ,"british.hash"        ,"iso-8859-1" },
00078     {"en_CA" ,"british.hash"        ,"iso-8859-1" },
00079     {"en_GB" ,"british.hash"        ,"iso-8859-1" },
00080     {"en_IE" ,"british.hash"        ,"iso-8859-1" },
00081     {"en_JM" ,"british.hash"        ,"iso-8859-1" },
00082     {"en_NZ" ,"british.hash"        ,"iso-8859-1" },
00083     {"en_TT" ,"british.hash"        ,"iso-8859-1" },
00084     {"en_ZA" ,"british.hash"        ,"iso-8859-1" },
00085     {"en_ZW" ,"british.hash"        ,"iso-8859-1" },
00086     {"en_PH" ,"american.hash"       ,"iso-8859-1" },
00087     {"en_US" ,"american.hash"       ,"iso-8859-1" },
00088     {"eo"    ,"esperanto.hash"      ,"iso-8859-3" },
00089     {"es"    ,"espanol.hash"        ,"iso-8859-1" },
00090     {"es_AR" ,"espanol.hash"        ,"iso-8859-1" },
00091     {"es_BO" ,"espanol.hash"        ,"iso-8859-1" },
00092     {"es_CL" ,"espanol.hash"        ,"iso-8859-1" },
00093     {"es_CO" ,"espanol.hash"        ,"iso-8859-1" },
00094     {"es_CR" ,"espanol.hash"        ,"iso-8859-1" },
00095     {"es_DO" ,"espanol.hash"        ,"iso-8859-1" },
00096     {"es_EC" ,"espanol.hash"        ,"iso-8859-1" },
00097     {"es_ES" ,"espanol.hash"        ,"iso-8859-1" },
00098     {"es_GT" ,"espanol.hash"        ,"iso-8859-1" },
00099     {"es_HN" ,"espanol.hash"        ,"iso-8859-1" },
00100     {"es_MX" ,"espanol.hash"        ,"iso-8859-1" },
00101     {"es_NI" ,"espanol.hash"        ,"iso-8859-1" },
00102     {"es_PA" ,"espanol.hash"        ,"iso-8859-1" },
00103     {"es_PE" ,"espanol.hash"        ,"iso-8859-1" },
00104     {"es_PR" ,"espanol.hash"        ,"iso-8859-1" },
00105     {"es_PY" ,"espanol.hash"        ,"iso-8859-1" },
00106     {"es_SV" ,"espanol.hash"        ,"iso-8859-1" },
00107     {"es_UY" ,"espanol.hash"        ,"iso-8859-1" },
00108     {"es_VE" ,"espanol.hash"        ,"iso-8859-1" },
00109     {"fi"    ,"finnish.hash"        ,"iso-8859-1" },
00110     {"fi_FI" ,"finnish.hash"        ,"iso-8859-1" },
00111     {"fr"    ,"francais.hash"       ,"iso-8859-1" },
00112     {"fr_BE" ,"francais.hash"       ,"iso-8859-1" },
00113     {"fr_CA" ,"francais.hash"       ,"iso-8859-1" },
00114     {"fr_CH" ,"francais.hash"       ,"iso-8859-1" },
00115     {"fr_FR" ,"francais.hash"       ,"iso-8859-1" },
00116     {"fr_LU" ,"francais.hash"       ,"iso-8859-1" },
00117     {"fr_MC" ,"francais.hash"       ,"iso-8859-1" },
00118     {"hu"    ,"hungarian.hash"      ,"iso-8859-2" },
00119     {"hu_HU" ,"hungarian.hash"      ,"iso-8859-2" },
00120     {"ga"    ,"irish.hash"          ,"iso-8859-1" },
00121     {"ga_IE" ,"irish.hash"          ,"iso-8859-1" },
00122     {"gl"    ,"galician.hash"       ,"iso-8859-1" },
00123     {"gl_ES" ,"galician.hash"       ,"iso-8859-1" },
00124     {"ia"    ,"interlingua.hash"    ,"iso-8859-1" },
00125     {"it"    ,"italian.hash"        ,"iso-8859-1" },
00126     {"it_IT" ,"italian.hash"        ,"iso-8859-1" },
00127     {"it_CH" ,"italian.hash"        ,"iso-8859-1" },
00128     {"la"    ,"mlatin.hash"         ,"iso-8859-1" },
00129     {"la_IT" ,"mlatin.hash"         ,"iso-8859-1" },
00130     {"lt"    ,"lietuviu.hash"       ,"iso-8859-13" },
00131     {"lt_LT" ,"lietuviu.hash"       ,"iso-8859-13" },
00132     {"nl"    ,"nederlands.hash"     ,"iso-8859-1" },
00133     {"nl_NL" ,"nederlands.hash"     ,"iso-8859-1" },
00134     {"nl_BE" ,"nederlands.hash"     ,"iso-8859-1" },
00135     {"nb"    ,"norsk.hash"          ,"iso-8859-1" },
00136     {"nb_NO" ,"norsk.hash"          ,"iso-8859-1" },
00137     {"nn"    ,"nynorsk.hash"        ,"iso-8859-1" },
00138     {"nn_NO" ,"nynorsk.hash"        ,"iso-8859-1" },
00139     {"no"    ,"norsk.hash"          ,"iso-8859-1" },
00140     {"no_NO" ,"norsk.hash"          ,"iso-8859-1" },
00141     {"pl"    ,"polish.hash"         ,"iso-8859-2" },
00142     {"pl_PL" ,"polish.hash"         ,"iso-8859-2" },
00143     {"pt"    ,"brazilian.hash"      ,"iso-8859-1" },
00144     {"pt_BR" ,"brazilian.hash"      ,"iso-8859-1" },
00145     {"pt_PT" ,"portugues.hash"      ,"iso-8859-1" },
00146     {"ru"    ,"russian.hash"        ,"koi8-r" },
00147     {"ru_MD" ,"russian.hash"        ,"koi8-r" },
00148     {"ru_RU" ,"russian.hash"        ,"koi8-r" },
00149     {"sc"    ,"sardinian.hash"      ,"iso-8859-1" },
00150     {"sc_IT" ,"sardinian.hash"      ,"iso-8859-1" },
00151     {"sk"    ,"slovak.hash"         ,"iso-8859-2" },
00152     {"sk_SK" ,"slovak.hash"         ,"iso-8859-2" },
00153     {"sl"    ,"slovensko.hash"      ,"iso-8859-2" },
00154     {"sl_SI" ,"slovensko.hash"      ,"iso-8859-2" },
00155     {"sv"    ,"svenska.hash"        ,"iso-8859-1" },
00156     {"sv_SE" ,"svenska.hash"        ,"iso-8859-1" },
00157     {"uk"    ,"ukrainian.hash"      ,"koi8-u" },
00158     {"uk_UA" ,"ukrainian.hash"      ,"koi8-u" },
00159     {"yi"    ,"yiddish-yivo.hash"   ,"utf-8" }
00160 };
00161 
00162 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) );
00163 static QMap<QString, QString> ispell_dict_map;
00164 
00165 
00166 void
00167 ISpellChecker::try_autodetect_charset(const char * const inEncoding)
00168 {
00169     if (inEncoding && strlen(inEncoding))
00170         {
00171             m_translate_in = QTextCodec::codecForName(inEncoding);
00172         }
00173 }
00174 
00175 /***************************************************************************/
00176 /***************************************************************************/
00177 
00178 ISpellChecker::ISpellChecker()
00179     : deftflag(-1),
00180      prefstringchar(-1),
00181      m_bSuccessfulInit(false),
00182      m_BC(NULL),
00183      m_cd(NULL),
00184      m_cl(NULL),
00185      m_cm(NULL),
00186      m_ho(NULL),
00187      m_nd(NULL),
00188      m_so(NULL),
00189      m_se(NULL),
00190      m_ti(NULL),
00191      m_te(NULL),
00192      m_hashstrings(NULL),
00193      m_hashtbl(NULL),
00194      m_pflaglist(NULL),
00195      m_sflaglist(NULL),
00196      m_chartypes(NULL),
00197      m_infile(NULL),
00198      m_outfile(NULL),
00199      m_askfilename(NULL),
00200      m_Trynum(0),
00201      m_translate_in(0)
00202 {
00203     memset(m_sflagindex,0,sizeof(m_sflagindex));
00204     memset(m_pflagindex,0,sizeof(m_pflagindex));
00205 }
00206 
00207 #ifndef FREEP
00208 #define FREEP(p)        do { if (p) free(p); } while (0)
00209 #endif
00210 
00211 ISpellChecker::~ISpellChecker()
00212 {
00213     if (m_bSuccessfulInit) {
00214         // only cleanup our mess if we were successfully initialized
00215 
00216         clearindex (m_pflagindex);
00217         clearindex (m_sflagindex);
00218     }
00219 
00220     FREEP(m_hashtbl);
00221     FREEP(m_hashstrings);
00222     FREEP(m_sflaglist);
00223     FREEP(m_chartypes);
00224 
00225     delete m_translate_in;
00226     m_translate_in = 0;
00227 }
00228 
00229 bool
00230 ISpellChecker::checkWord( const QString& utf8Word )
00231 {
00232     ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
00233     if (!m_bSuccessfulInit)
00234         return false;
00235 
00236     if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
00237         return false;
00238 
00239     bool retVal = false;
00240     QCString out;
00241     if (!m_translate_in)
00242         return false;
00243     else {
00244         /* convert to 8bit string and null terminate */
00245         int len_out = utf8Word.length();
00246 
00247         out = m_translate_in->fromUnicode( utf8Word, len_out );
00248     }
00249 
00250     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00251         {
00252             if (good(iWord, 0, 0, 1, 0) == 1 ||
00253                 compoundgood(iWord, 1) == 1)
00254                 {
00255                     retVal = true;
00256                 }
00257         }
00258 
00259     return retVal;
00260 }
00261 
00262 QStringList
00263 ISpellChecker::suggestWord(const QString& utf8Word)
00264 {
00265     ichar_t  iWord[INPUTWORDLEN + MAXAFFIXLEN];
00266     int  c;
00267 
00268     if (!m_bSuccessfulInit)
00269         return QStringList();
00270 
00271     if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) ||
00272             utf8Word.length() == 0)
00273         return QStringList();
00274 
00275     QCString out;
00276     if (!m_translate_in)
00277         return QStringList();
00278     else
00279         {
00280             /* convert to 8bit string and null terminate */
00281 
00282             int len_out = utf8Word.length();
00283             out = m_translate_in->fromUnicode( utf8Word, len_out );
00284         }
00285 
00286     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00287         makepossibilities(iWord);
00288     else
00289         return QStringList();
00290 
00291     QStringList sugg_arr;
00292     for (c = 0; c < m_pcount; c++)
00293     {
00294         QString utf8Word;
00295 
00296         if (!m_translate_in)
00297         {
00298             /* copy to 8bit string and null terminate */
00299             utf8Word = QString::fromUtf8( m_possibilities[c] );
00300         }
00301         else
00302         {
00303             /* convert to 32bit string and null terminate */
00304             utf8Word = m_translate_in->toUnicode( m_possibilities[c] );
00305         }
00306 
00307         sugg_arr.append( utf8Word );
00308     }
00309 
00310     return sugg_arr;
00311 }
00312 
00313 static void
00314 s_buildHashNames (std::vector<std::string> & names, const char * dict)
00315 {
00316     const char * tmp = 0;
00317     int i = 0;
00318 
00319     names.clear ();
00320 
00321     while ( (tmp = ispell_dirs[i++]) ) {
00322         QCString maybeFile = QCString( tmp ) + '/';
00323         maybeFile += dict;
00324         names.push_back( maybeFile.data() );
00325     }
00326 }
00327 
00328 static void
00329 s_allDics()
00330 {
00331     const char * tmp = 0;
00332     int i = 0;
00333 
00334     while ( (tmp = ispell_dirs[i++]) ) {
00335         QDir dir( tmp );
00336         QStringList lst = dir.entryList( "*.hash" );
00337         for ( QStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
00338             QFileInfo info( *it );
00339             for (size_t i = 0; i < size_ispell_map; i++)
00340             {
00341                 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00342                 if (!strcmp (info.fileName().latin1(), mapping->dict))
00343                 {
00344                     ispell_dict_map.insert( mapping->lang, *it );
00345                 }
00346             }
00347         }
00348     }
00349 }
00350 
00351 QValueList<QString>
00352 ISpellChecker::allDics()
00353 {
00354     if ( ispell_dict_map.empty() )
00355         s_allDics();
00356 
00357     return ispell_dict_map.keys();
00358 }
00359 
00360 QString
00361 ISpellChecker::loadDictionary (const char * szdict)
00362 {
00363     std::vector<std::string> dict_names;
00364 
00365     s_buildHashNames (dict_names, szdict);
00366 
00367     for (size_t i = 0; i < dict_names.size(); i++)
00368         {
00369             if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0)
00370                 return dict_names[i].c_str();
00371         }
00372 
00373     return QString::null;
00374 }
00375 
00382 bool
00383 ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
00384 {
00385     QString hashname;
00386 
00387     const char * encoding = NULL;
00388     const char * szFile = NULL;
00389 
00390     for (size_t i = 0; i < size_ispell_map; i++)
00391         {
00392             const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00393             if (!strcmp (szLang, mapping->lang))
00394                 {
00395                     szFile = mapping->dict;
00396                     encoding = mapping->enc;
00397                     break;
00398                 }
00399         }
00400 
00401     if (!szFile || !strlen(szFile))
00402         return false;
00403 
00404     alloc_ispell_struct();
00405 
00406     hashname = loadDictionary(szFile);
00407     if (hashname.isEmpty())
00408         return false;
00409 
00410     // one of the two above calls succeeded
00411     setDictionaryEncoding (hashname, encoding);
00412 
00413     return true;
00414 }
00415 
00416 void
00417 ISpellChecker::setDictionaryEncoding( const QString& hashname, const char * encoding )
00418 {
00419     /* Get Hash encoding from XML file. This should always work! */
00420     try_autodetect_charset(encoding);
00421 
00422     if (m_translate_in)
00423         {
00424             /* We still have to setup prefstringchar*/
00425             prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
00426                               : static_cast<int *>(NULL));
00427 
00428             if (prefstringchar < 0)
00429                 {
00430                     std::string teststring;
00431                     for(int n1 = 1; n1 <= 15; n1++)
00432                         {
00433                             teststring = "latin" + n1;
00434                             prefstringchar = findfiletype(teststring.c_str(), 1,
00435                                               deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00436                             if (prefstringchar >= 0)
00437                                 break;
00438                         }
00439                 }
00440 
00441             return; /* success */
00442         }
00443 
00444     /* Test for UTF-8 first */
00445     prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00446     if (prefstringchar >= 0)
00447         {
00448             m_translate_in = QTextCodec::codecForName("utf8");
00449         }
00450 
00451     if (m_translate_in)
00452         return; /* success */
00453 
00454     /* Test for "latinN" */
00455     if (!m_translate_in)
00456         {
00457             /* Look for "altstringtype" names from latin1 to latin15 */
00458             for(int n1 = 1; n1 <= 15; n1++)
00459                 {
00460                     QString teststring = QString("latin%1").arg(n1);
00461                     prefstringchar = findfiletype(teststring.latin1(), 1,
00462                                       deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00463                     if (prefstringchar >= 0)
00464                         {
00465                             //FIXME: latin1 might be wrong
00466                             m_translate_in = QTextCodec::codecForName( teststring.latin1() );
00467                             break;
00468                         }
00469                 }
00470         }
00471 
00472     /* If nothing found, use latin1 */
00473     if (!m_translate_in)
00474         {
00475             m_translate_in = QTextCodec::codecForName("latin1");
00476         }
00477 }
00478 
00479 bool
00480 ISpellChecker::requestDictionary(const char *szLang)
00481 {
00482     if (!loadDictionaryForLanguage (szLang))
00483         {
00484             // handle a shortened version of the language tag: en_US => en
00485             std::string shortened_dict (szLang);
00486             size_t uscore_pos;
00487 
00488             if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) {
00489                 shortened_dict = shortened_dict.substr(0, uscore_pos);
00490                 if (!loadDictionaryForLanguage (shortened_dict.c_str()))
00491                     return false;
00492             } else
00493                 return false;
00494         }
00495 
00496     m_bSuccessfulInit = true;
00497 
00498     if (prefstringchar < 0)
00499         m_defdupchar = 0;
00500     else
00501         m_defdupchar = prefstringchar;
00502 
00503     return true;
00504 }
KDE Home | KDE Accessibility Home | Description of Access Keys