LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
00001 /***************************************************************************
00002           ofx_preproc.cpp
00003                              -------------------
00004     copyright            : (C) 2002 by Benoit Grégoire
00005     email                : benoitg@coeus.ca
00006 ***************************************************************************/
00012 /***************************************************************************
00013  *                                                                         *
00014  *   This program is free software; you can redistribute it and/or modify  *
00015  *   it under the terms of the GNU General Public License as published by  *
00016  *   the Free Software Foundation; either version 2 of the License, or     *
00017  *   (at your option) any later version.                                   *
00018  *                                                                         *
00019  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <string>
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
#ifdef OS_WIN32
# include <io.h> // for close()
#else
# include <unistd.h> // for close()
#endif
00036 
00037 #ifdef OS_WIN32
00038 # define DIRSEP "\\"
00039 #else
00040 # define DIRSEP "/"
00041 #endif
00042 
00043 #ifdef OS_WIN32
00044 # include "win32.hh"
00045 # include <windows.h> // for GetModuleFileName()
00046 # undef ERROR
00047 # undef DELETE
00048 #endif
00049 
00050 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
00051 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
00052 
00053 using namespace std;
/* Number of entries in DTD_SEARCH_PATH below: the three built-in directories,
 * plus MAKEFILE_DTD_PATH when the build system defines it. Must be kept in
 * sync with the initializer list by hand. */
00057 #ifdef MAKEFILE_DTD_PATH
00058 const int DTD_SEARCH_PATH_NUM = 4;
00059 #else
00060 const int DTD_SEARCH_PATH_NUM = 3;
00061 #endif
00062 
/* Directories probed in order by find_dtd() when looking for a DTD file.
 * NOTE(review): each entry is concatenated with DIRSEP and the file name and
 * handed to ifstream as-is; nothing in find_dtd() expands "~", so that entry
 * presumably only matches a directory literally named "~" -- confirm intent. */
00066 const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
00067 {
00068 #ifdef MAKEFILE_DTD_PATH
00069   MAKEFILE_DTD_PATH ,
00070 #endif
00071   "/usr/local/share/libofx/dtd",
00072   "/usr/share/libofx/dtd",
00073   "~"
00074 };
/* Size in bytes of the fixed char buffers declared in ofx_proc_file() and
 * sanitize_proprietary_tags(). */
00075 const unsigned int READ_BUFFER_SIZE = 1024;
00076 
00081 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
00082 {
00083   LibofxContext *libofx_context;
00084   bool ofx_start = false;
00085   bool ofx_end = false;
00086   bool file_is_xml = false;
00087 
00088   ifstream input_file;
00089   ofstream tmp_file;
00090   char buffer[READ_BUFFER_SIZE];
00091   char *iconv_buffer;
00092   string s_buffer;
00093   char *filenames[3];
00094   char tmp_filename[256];
00095   int tmp_file_fd;
00096 #ifdef HAVE_ICONV
00097   iconv_t conversion_descriptor;
00098 #endif
00099   libofx_context = (LibofxContext*)ctx;
00100 
00101   if (p_filename != NULL && strcmp(p_filename, "") != 0)
00102   {
00103     message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename);
00104 
00105     input_file.open(p_filename);
00106     if (!input_file)
00107     {
00108       message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename));
00109     }
00110 
00111     mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
00112 
00113     message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename));
00114     tmp_file_fd = mkstemp(tmp_filename);
00115     if (tmp_file_fd)
00116     {
00117       tmp_file.open(tmp_filename);
00118       if (!tmp_file)
00119       {
00120         message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename));
00121         return -1;
00122       }
00123     }
00124     else
00125     {
00126       message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename));
00127       return -1;
00128     }
00129 
00130     if (input_file && tmp_file)
00131     {
00132       int header_separator_idx;
00133       string header_name;
00134       string header_value;
00135       string ofx_encoding;
00136       string ofx_charset;
00137       do
00138       {
00139         s_buffer.clear();
00140         bool end_of_line = false;
00141         do
00142         {
00143           input_file.get(buffer, sizeof(buffer), '\n');
00144           //cout<< "got: \"" << buffer<<"\"\n";
00145           s_buffer.append(buffer);
00146 
00147           // Watch out: If input_file is in eof(), any subsequent read or
00148           // peek() will fail and we must exit this loop.
00149           if (input_file.eof())
00150             break;
00151 
00152           //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
00153           if (input_file.fail()) // If no characters were extracted above, the failbit is set.
00154           {
00155             // No characters extracted means that we've reached the newline
00156             // delimiter (because we already checked for EOF). We will check
00157             // for and remove that newline in the next if-clause, but must
00158             // remove the failbit so that peek() will work again.
00159             input_file.clear();
00160           }
00161 
00162           // Is the next character really the newline?
00163           if (input_file.peek() == '\n')
00164           {
00165             // Yes. Then discard that newline character from the stream and
00166             // append it manually to the output string.
00167             input_file.get();
00168             s_buffer.append("\n");
00169             end_of_line = true; // We found the end-of-line.
00170           }
00171         }
00172         // Continue reading as long as we're not at EOF *and* we've not yet
00173         // reached an end-of-line.
00174         while (!input_file.eof() && !end_of_line);
00175 
00176         if (ofx_start == false && (s_buffer.find("<?xml") != string::npos))
00177         {
00178           message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
00179           file_is_xml = true;
00180         }
00181 
00182         int ofx_start_idx;
00183         if (ofx_start == false &&
00184             (
00185               (libofx_context->currentFileType() == OFX &&
00186                ((ofx_start_idx = s_buffer.find("<OFX>")) !=
00187                 string::npos || (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos))
00188               || (libofx_context->currentFileType() == OFC &&
00189                   ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos ||
00190                    (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos))
00191             )
00192            )
00193         {
00194           ofx_start = true;
00195           if (file_is_xml == false)
00196           {
00197             s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
00198           }
00199           message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
00200 
00201           if (file_is_xml == true)
00202           {
00203             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00204             if (putenv(sp_charset_fixed) != 0)
00205             {
00206               message_out(ERROR, "ofx_proc_file(): putenv failed");
00207             }
00208             /* Normally the following would be "xml".
00209              * Unfortunately, opensp's generic api will garble UTF-8 if this is
00210              * set to xml.  So we set any single byte encoding to avoid messing
00211              * up UTF-8.  Unfortunately this means that non-UTF-8 files will not
00212              * get properly translated.  We'd need to manually detect the
00213              * encoding in the XML header and convert the xml with iconv like we
00214              * do for SGML to work around the problem.  Most unfortunate. */
00215             static char sp_encoding[] = "SP_ENCODING=ms-dos";
00216             if (putenv(sp_encoding) != 0)
00217             {
00218               message_out(ERROR, "ofx_proc_file(): putenv failed");
00219             }
00220           }
00221           else
00222           {
00223             static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
00224             if (putenv(sp_charset_fixed) != 0)
00225             {
00226               message_out(ERROR, "ofx_proc_file(): putenv failed");
00227             }
00228             static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8;
00229             if (putenv(sp_encoding) != 0)
00230             {
00231               message_out(ERROR, "ofx_proc_file(): putenv failed");
00232             }
00233 #ifdef HAVE_ICONV
00234             string fromcode;
00235             string tocode;
00236             if (ofx_encoding.compare("USASCII") == 0)
00237             {
00238               if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
00239               {
00240                 //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
00241                 fromcode = "ISO-8859-1";
00242               }
00243               else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
00244               {
00245                 //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
00246                 fromcode = "CP1252";
00247               }
00248               else if (ofx_charset.compare("NONE") == 0)
00249               {
00250                 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00251               }
00252               else
00253               {
00254                 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00255               }
00256             }
00257             else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
00258             {
00259               //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
00260               fromcode = "UTF-8";
00261             }
00262             else
00263             {
00264               fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
00265             }
00266             tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
00267             message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
00268             conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
00269 #endif
00270           }
00271         }
00272         else
00273         {
00274           //We are still in the headers
00275           if ((header_separator_idx = s_buffer.find(':')) != string::npos)
00276           {
00277             //Header processing
00278             header_name.assign(s_buffer.substr(0, header_separator_idx));
00279             header_value.assign(s_buffer.substr(header_separator_idx + 1));
00280             while ( header_value[header_value.length() -1 ] == '\n' ||
00281                     header_value[header_value.length() -1 ] == '\r' )
00282               header_value.erase(header_value.length() - 1);
00283             message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
00284             if (header_name.compare("ENCODING") == 0)
00285             {
00286               ofx_encoding.assign(header_value);
00287             }
00288             if (header_name.compare("CHARSET") == 0)
00289             {
00290               ofx_charset.assign(header_value);
00291             }
00292           }
00293         }
00294 
00295         if (file_is_xml == true || (ofx_start == true && ofx_end == false))
00296         {
00297           if (ofx_start == true)
00298           {
00299             /* The above test won't help us if the <OFX> tag is on the same line
00300              * as the xml header, but as opensp can't be used to parse it anyway
00301              * this isn't a great loss for now.
00302              */
00303             s_buffer = sanitize_proprietary_tags(s_buffer);
00304           }
00305           //cout<< s_buffer<<"\n";
00306           if (file_is_xml == false)
00307           {
00308 #ifdef HAVE_ICONV
00309             size_t inbytesleft = strlen(s_buffer.c_str());
00310             size_t outbytesleft = inbytesleft * 2 - 1;
00311             iconv_buffer = (char*) malloc (inbytesleft * 2);
00312             memset(iconv_buffer, 0, inbytesleft * 2);
00313 #if defined(OS_WIN32) || defined(__sun)
00314             const char * inchar = (const char *)s_buffer.c_str();
00315 #else
00316             char * inchar = (char *)s_buffer.c_str();
00317 #endif
00318             char * outchar = iconv_buffer;
00319             int iconv_retval = iconv (conversion_descriptor,
00320                                       &inchar, &inbytesleft,
00321                                       &outchar, &outbytesleft);
00322             if (iconv_retval == -1)
00323             {
00324               message_out(ERROR, "ofx_proc_file(): Conversion error");
00325             }
00326             s_buffer = iconv_buffer;
00327             free (iconv_buffer);
00328 #endif
00329           }
00330           cout << s_buffer << "\n";
00331           tmp_file.write(s_buffer.c_str(), s_buffer.length());
00332         }
00333 
00334         if (ofx_start == true &&
00335             (
00336               (libofx_context->currentFileType() == OFX &&
00337                ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos ||
00338                 (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos))
00339               || (libofx_context->currentFileType() == OFC &&
00340                   ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos ||
00341                    (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos))
00342             )
00343            )
00344         {
00345           ofx_end = true;
00346           message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC>  has been found");
00347         }
00348 
00349       }
00350       while (!input_file.eof() && !input_file.bad());
00351     }
00352     input_file.close();
00353     tmp_file.close();
00354 #ifdef HAVE_ICONV
00355     if (file_is_xml == false)
00356     {
00357       iconv_close(conversion_descriptor);
00358     }
00359 #endif
00360     char filename_openspdtd[255];
00361     char filename_dtd[255];
00362     char filename_ofx[255];
00363     strncpy(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME).c_str(), 255); //The opensp sgml dtd file
00364     if (libofx_context->currentFileType() == OFX)
00365     {
00366       strncpy(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME).c_str(), 255); //The ofx dtd file
00367     }
00368     else if (libofx_context->currentFileType() == OFC)
00369     {
00370       strncpy(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME).c_str(), 255); //The ofc dtd file
00371     }
00372     else
00373     {
00374       message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00375     }
00376 
00377     if ((string)filename_dtd != "" && (string)filename_openspdtd != "")
00378     {
00379       strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
00380       filenames[0] = filename_openspdtd;
00381       filenames[1] = filename_dtd;
00382       filenames[2] = filename_ofx;
00383       if (libofx_context->currentFileType() == OFX)
00384       {
00385         ofx_proc_sgml(libofx_context, 3, filenames);
00386       }
00387       else if (libofx_context->currentFileType() == OFC)
00388       {
00389         ofc_proc_sgml(libofx_context, 3, filenames);
00390       }
00391       else
00392       {
00393         message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser"));
00394       }
00395       if (remove(tmp_filename) != 0)
00396       {
00397         message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename));
00398       }
00399     }
00400     else
00401     {
00402       message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
00403     }
00404   }
00405   else
00406   {
00407     message_out(ERROR, "ofx_proc_file():No input file specified");
00408   }
00409   return 0;
00410 }
00411 
00412 
00417 string sanitize_proprietary_tags(string input_string)
00418 {
00419   unsigned int i;
00420   size_t input_string_size;
00421   bool strip = false;
00422   bool tag_open = false;
00423   int tag_open_idx = 0; //Are we within < > ?
00424   bool closing_tag_open = false; //Are we within </ > ?
00425   int orig_tag_open_idx = 0;
00426   bool proprietary_tag = false; //Are we within a proprietary element?
00427   bool proprietary_closing_tag = false;
00428   int crop_end_idx = 0;
00429   char buffer[READ_BUFFER_SIZE] = "";
00430   char tagname[READ_BUFFER_SIZE] = "";
00431   int tagname_idx = 0;
00432   char close_tagname[READ_BUFFER_SIZE] = "";
00433 
00434   for (i = 0; i < READ_BUFFER_SIZE; i++)
00435   {
00436     buffer[i] = 0;
00437     tagname[i] = 0;
00438     close_tagname[i] = 0;
00439   }
00440 
00441   input_string_size = input_string.size();
00442 
00443   for (i = 0; i <= input_string_size; i++)
00444   {
00445     if (input_string.c_str()[i] == '<')
00446     {
00447       tag_open = true;
00448       tag_open_idx = i;
00449       if (proprietary_tag == true && input_string.c_str()[i+1] == '/')
00450       {
00451         //We are now in a closing tag
00452         closing_tag_open = true;
00453         //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl;
00454         if (strncmp(tagname, &(input_string.c_str()[i+2]), strlen(tagname)) != 0)
00455         {
00456           //If it is the begining of an other tag
00457           //cout<<"DIFFERENT!"<<endl;
00458           crop_end_idx = i - 1;
00459           strip = true;
00460         }
00461         else
00462         {
00463           //Otherwise, it is the start of the closing tag of the proprietary tag
00464           proprietary_closing_tag = true;
00465         }
00466       }
00467       else if (proprietary_tag == true)
00468       {
00469         //It is the start of a new tag, following a proprietary tag
00470         crop_end_idx = i - 1;
00471         strip = true;
00472       }
00473     }
00474     else if (input_string.c_str()[i] == '>')
00475     {
00476       tag_open = false;
00477       closing_tag_open = false;
00478       tagname[tagname_idx] = 0;
00479       tagname_idx = 0;
00480       if (proprietary_closing_tag == true)
00481       {
00482         crop_end_idx = i;
00483         strip = true;
00484       }
00485     }
00486     else if (tag_open == true && closing_tag_open == false)
00487     {
00488       if (input_string.c_str()[i] == '.')
00489       {
00490         if (proprietary_tag != true)
00491         {
00492           orig_tag_open_idx = tag_open_idx;
00493           proprietary_tag = true;
00494         }
00495       }
00496       tagname[tagname_idx] = input_string.c_str()[i];
00497       tagname_idx++;
00498     }
00499     //cerr <<i<<endl;
00500     if (strip == true && orig_tag_open_idx < input_string.size())
00501     {
00502       input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00503       message_out(INFO, "sanitize_proprietary_tags() (end tag or new tag) removed: " + string(buffer));
00504       input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00505       i = orig_tag_open_idx - 1;
00506       proprietary_tag = false;
00507       proprietary_closing_tag = false;
00508       closing_tag_open = false;
00509       tag_open = false;
00510       strip = false;
00511     }
00512 
00513   }//end for
00514   if (proprietary_tag == true && orig_tag_open_idx < input_string.size())
00515   {
00516     if (crop_end_idx == 0)   //no closing tag
00517     {
00518       crop_end_idx = input_string.size() - 1;
00519     }
00520     input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx);
00521     message_out(INFO, "sanitize_proprietary_tags() (end of line) removed: " + string(buffer));
00522     input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1);
00523   }
00524   return input_string;
00525 }
00526 
00527 
00528 #ifdef OS_WIN32
00529 static std::string get_dtd_installation_directory()
00530 {
00531   // Partial implementation of
00532   // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
00533   char ch_fn[MAX_PATH], *p;
00534   std::string str_fn;
00535 
00536   if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return "";
00537 
00538   if ((p = strrchr(ch_fn, '\\')) != NULL)
00539     * p = '\0';
00540 
00541   p = strrchr(ch_fn, '\\');
00542   if (p && (_stricmp(p + 1, "bin") == 0 ||
00543             _stricmp(p + 1, "lib") == 0))
00544     *p = '\0';
00545 
00546   str_fn = ch_fn;
00547   str_fn += "\\share\\libofx\\dtd";
00548 
00549   return str_fn;
00550 }
00551 #endif
00552 
00553 
00566 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
00567 {
00568   string dtd_path_filename;
00569   char *env_dtd_path;
00570 
00571   dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
00572   if (!dtd_path_filename.empty())
00573   {
00574     dtd_path_filename.append(dtd_filename);
00575     ifstream dtd_file(dtd_path_filename.c_str());
00576     if (dtd_file)
00577     {
00578       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00579       return dtd_path_filename;
00580     }
00581   }
00582 
00583 #ifdef OS_WIN32
00584   dtd_path_filename = get_dtd_installation_directory();
00585   if (!dtd_path_filename.empty())
00586   {
00587     dtd_path_filename.append(DIRSEP);
00588     dtd_path_filename.append(dtd_filename);
00589     ifstream dtd_file(dtd_path_filename.c_str());
00590     if (dtd_file)
00591     {
00592       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00593       return dtd_path_filename;
00594     }
00595   }
00596 #endif
00597   /* Search in environement variable OFX_DTD_PATH */
00598   env_dtd_path = getenv("OFX_DTD_PATH");
00599   if (env_dtd_path)
00600   {
00601     dtd_path_filename.append(env_dtd_path);
00602     dtd_path_filename.append(DIRSEP);
00603     dtd_path_filename.append(dtd_filename);
00604     ifstream dtd_file(dtd_path_filename.c_str());
00605     if (!dtd_file)
00606     {
00607       message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
00608     }
00609     else
00610     {
00611       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00612       return dtd_path_filename;
00613     }
00614   }
00615 
00616   for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
00617   {
00618     dtd_path_filename = DTD_SEARCH_PATH[i];
00619     dtd_path_filename.append(DIRSEP);
00620     dtd_path_filename.append(dtd_filename);
00621     ifstream dtd_file(dtd_path_filename.c_str());
00622     if (!dtd_file)
00623     {
00624       message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
00625     }
00626     else
00627     {
00628       message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00629       return dtd_path_filename;
00630     }
00631   }
00632 
00633   /* Last resort, look in source tree relative path (useful for development) */
00634   dtd_path_filename = "";
00635   dtd_path_filename.append("..");
00636   dtd_path_filename.append(DIRSEP);
00637   dtd_path_filename.append("dtd");
00638   dtd_path_filename.append(DIRSEP);
00639   dtd_path_filename.append(dtd_filename);
00640   ifstream dtd_file(dtd_path_filename.c_str());
00641   if (!dtd_file)
00642   {
00643     message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
00644   }
00645   else
00646   {
00647     message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
00648     return dtd_path_filename;
00649   }
00650 
00651 
00652   message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
00653   return "";
00654 }
00655 
00656