LibOFX
|
00001 /*************************************************************************** 00002 ofx_preproc.cpp 00003 ------------------- 00004 copyright : (C) 2002 by Benoit Gr�oir 00005 email : benoitg@coeus.ca 00006 ***************************************************************************/ 00012 /*************************************************************************** 00013 * * 00014 * This program is free software; you can redistribute it and/or modify * 00015 * it under the terms of the GNU General Public License as published by * 00016 * the Free Software Foundation; either version 2 of the License, or * 00017 * (at your option) any later version. * 00018 * * 00019 ***************************************************************************/ 00020 #include "../config.h" 00021 #include <iostream> 00022 #include <fstream> 00023 #include <cstdlib> 00024 #include <stdio.h> 00025 #include <string> 00026 #include "ParserEventGeneratorKit.h" 00027 #include "libofx.h" 00028 #include "messages.hh" 00029 #include "ofx_sgml.hh" 00030 #include "ofc_sgml.hh" 00031 #include "ofx_preproc.hh" 00032 #include "ofx_utilities.hh" 00033 #ifdef HAVE_ICONV 00034 #include <iconv.h> 00035 #endif 00036 00037 #ifdef OS_WIN32 00038 # define DIRSEP "\\" 00039 #else 00040 # define DIRSEP "/" 00041 #endif 00042 00043 #ifdef OS_WIN32 00044 # include "win32.hh" 00045 # include <windows.h> // for GetModuleFileName() 00046 # undef ERROR 00047 # undef DELETE 00048 #endif 00049 00050 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252" 00051 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8" 00052 00053 using namespace std; 00057 #ifdef MAKEFILE_DTD_PATH 00058 const int DTD_SEARCH_PATH_NUM = 4; 00059 #else 00060 const int DTD_SEARCH_PATH_NUM = 3; 00061 #endif 00062 00066 const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] = 00067 { 00068 #ifdef MAKEFILE_DTD_PATH 00069 MAKEFILE_DTD_PATH , 00070 #endif 00071 "/usr/local/share/libofx/dtd", 00072 "/usr/share/libofx/dtd", 00073 "~" 00074 }; 00075 const unsigned int READ_BUFFER_SIZE = 1024; 00076 00081 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename) 00082 { 00083 LibofxContext *libofx_context; 00084 bool ofx_start = false; 00085 bool ofx_end = false; 00086 bool file_is_xml = false; 00087 00088 ifstream input_file; 00089 ofstream tmp_file; 00090 char buffer[READ_BUFFER_SIZE]; 00091 char *iconv_buffer; 00092 string s_buffer; 00093 char *filenames[3]; 00094 char tmp_filename[256]; 00095 int tmp_file_fd; 00096 #ifdef HAVE_ICONV 00097 iconv_t conversion_descriptor; 00098 #endif 00099 libofx_context = (LibofxContext*)ctx; 00100 00101 if (p_filename != NULL && strcmp(p_filename, "") != 0) 00102 { 00103 message_out(DEBUG, string("ofx_proc_file():Opening file: ") + p_filename); 00104 00105 input_file.open(p_filename); 00106 if (!input_file) 00107 { 00108 message_out(ERROR, "ofx_proc_file():Unable to open the input file " + string(p_filename)); 00109 } 00110 00111 mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename)); 00112 00113 message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + string(tmp_filename)); 00114 tmp_file_fd = mkstemp(tmp_filename); 00115 if (tmp_file_fd) 00116 { 00117 tmp_file.open(tmp_filename); 00118 if (!tmp_file) 00119 { 00120 message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + string(tmp_filename)); 00121 return -1; 00122 } 00123 } 00124 else 00125 { 00126 message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + string(tmp_filename)); 00127 return -1; 00128 } 00129 00130 if (input_file && tmp_file) 00131 { 00132 int header_separator_idx; 00133 string header_name; 00134 string header_value; 00135 string ofx_encoding; 00136 string ofx_charset; 00137 do 00138 { 00139 s_buffer.clear(); 00140 bool end_of_line = false; 00141 do 00142 { 00143 input_file.get(buffer, sizeof(buffer), '\n'); 00144 //cout<< "got: \"" << buffer<<"\"\n"; 00145 s_buffer.append(buffer); 00146 00147 // Watch out: If input_file is in eof(), any subsequent read or 00148 // peek() will fail and we must exit this loop. 00149 if (input_file.eof()) 00150 break; 00151 00152 //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl; 00153 if (input_file.fail()) // If no characters were extracted above, the failbit is set. 00154 { 00155 // No characters extracted means that we've reached the newline 00156 // delimiter (because we already checked for EOF). We will check 00157 // for and remove that newline in the next if-clause, but must 00158 // remove the failbit so that peek() will work again. 00159 input_file.clear(); 00160 } 00161 00162 // Is the next character really the newline? 00163 if (input_file.peek() == '\n') 00164 { 00165 // Yes. Then discard that newline character from the stream and 00166 // append it manually to the output string. 00167 input_file.get(); 00168 s_buffer.append("\n"); 00169 end_of_line = true; // We found the end-of-line. 00170 } 00171 } 00172 // Continue reading as long as we're not at EOF *and* we've not yet 00173 // reached an end-of-line. 00174 while (!input_file.eof() && !end_of_line); 00175 00176 if (ofx_start == false && (s_buffer.find("<?xml") != string::npos)) 00177 { 00178 message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped."); 00179 file_is_xml = true; 00180 } 00181 00182 int ofx_start_idx; 00183 if (ofx_start == false && 00184 ( 00185 (libofx_context->currentFileType() == OFX && 00186 ((ofx_start_idx = s_buffer.find("<OFX>")) != 00187 string::npos || (ofx_start_idx = s_buffer.find("<ofx>")) != string::npos)) 00188 || (libofx_context->currentFileType() == OFC && 00189 ((ofx_start_idx = s_buffer.find("<OFC>")) != string::npos || 00190 (ofx_start_idx = s_buffer.find("<ofc>")) != string::npos)) 00191 ) 00192 ) 00193 { 00194 ofx_start = true; 00195 if (file_is_xml == false) 00196 { 00197 s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header. 00198 } 00199 message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found"); 00200 00201 if (file_is_xml == true) 00202 { 00203 static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1"; 00204 if (putenv(sp_charset_fixed) != 0) 00205 { 00206 message_out(ERROR, "ofx_proc_file(): putenv failed"); 00207 } 00208 /* Normally the following would be "xml". 00209 * Unfortunately, opensp's generic api will garble UTF-8 if this is 00210 * set to xml. So we set any single byte encoding to avoid messing 00211 * up UTF-8. Unfortunately this means that non-UTF-8 files will not 00212 * get properly translated. We'd need to manually detect the 00213 * encoding in the XML header and convert the xml with iconv like we 00214 * do for SGML to work around the problem. Most unfortunate. */ 00215 static char sp_encoding[] = "SP_ENCODING=ms-dos"; 00216 if (putenv(sp_encoding) != 0) 00217 { 00218 message_out(ERROR, "ofx_proc_file(): putenv failed"); 00219 } 00220 } 00221 else 00222 { 00223 static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1"; 00224 if (putenv(sp_charset_fixed) != 0) 00225 { 00226 message_out(ERROR, "ofx_proc_file(): putenv failed"); 00227 } 00228 static char sp_encoding[] = "SP_ENCODING=ms-dos"; //Any single byte encoding will do, we don't want opensp messing up UTF-8; 00229 if (putenv(sp_encoding) != 0) 00230 { 00231 message_out(ERROR, "ofx_proc_file(): putenv failed"); 00232 } 00233 #ifdef HAVE_ICONV 00234 string fromcode; 00235 string tocode; 00236 if (ofx_encoding.compare("USASCII") == 0) 00237 { 00238 if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0) 00239 { 00240 //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well... 00241 fromcode = "ISO-8859-1"; 00242 } 00243 else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0) 00244 { 00245 //Only "1252" is actually a legal value, but since the banks follows the spec SO well... 00246 fromcode = "CP1252"; 00247 } 00248 else if (ofx_charset.compare("NONE") == 0) 00249 { 00250 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING; 00251 } 00252 else 00253 { 00254 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING; 00255 } 00256 } 00257 else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0) 00258 { 00259 //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such... 00260 fromcode = "UTF-8"; 00261 } 00262 else 00263 { 00264 fromcode = LIBOFX_DEFAULT_INPUT_ENCODING; 00265 } 00266 tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING; 00267 message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode); 00268 conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str()); 00269 #endif 00270 } 00271 } 00272 else 00273 { 00274 //We are still in the headers 00275 if ((header_separator_idx = s_buffer.find(':')) != string::npos) 00276 { 00277 //Header processing 00278 header_name.assign(s_buffer.substr(0, header_separator_idx)); 00279 header_value.assign(s_buffer.substr(header_separator_idx + 1)); 00280 while ( header_value[header_value.length() -1 ] == '\n' || 00281 header_value[header_value.length() -1 ] == '\r' ) 00282 header_value.erase(header_value.length() - 1); 00283 message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found"); 00284 if (header_name.compare("ENCODING") == 0) 00285 { 00286 ofx_encoding.assign(header_value); 00287 } 00288 if (header_name.compare("CHARSET") == 0) 00289 { 00290 ofx_charset.assign(header_value); 00291 } 00292 } 00293 } 00294 00295 if (file_is_xml == true || (ofx_start == true && ofx_end == false)) 00296 { 00297 if (ofx_start == true) 00298 { 00299 /* The above test won't help us if the <OFX> tag is on the same line 00300 * as the xml header, but as opensp can't be used to parse it anyway 00301 * this isn't a great loss for now. 00302 */ 00303 s_buffer = sanitize_proprietary_tags(s_buffer); 00304 } 00305 //cout<< s_buffer<<"\n"; 00306 if (file_is_xml == false) 00307 { 00308 #ifdef HAVE_ICONV 00309 size_t inbytesleft = strlen(s_buffer.c_str()); 00310 size_t outbytesleft = inbytesleft * 2 - 1; 00311 iconv_buffer = (char*) malloc (inbytesleft * 2); 00312 memset(iconv_buffer, 0, inbytesleft * 2); 00313 #if defined(OS_WIN32) || defined(__sun) 00314 const char * inchar = (const char *)s_buffer.c_str(); 00315 #else 00316 char * inchar = (char *)s_buffer.c_str(); 00317 #endif 00318 char * outchar = iconv_buffer; 00319 int iconv_retval = iconv (conversion_descriptor, 00320 &inchar, &inbytesleft, 00321 &outchar, &outbytesleft); 00322 if (iconv_retval == -1) 00323 { 00324 message_out(ERROR, "ofx_proc_file(): Conversion error"); 00325 } 00326 s_buffer = iconv_buffer; 00327 free (iconv_buffer); 00328 #endif 00329 } 00330 cout << s_buffer << "\n"; 00331 tmp_file.write(s_buffer.c_str(), s_buffer.length()); 00332 } 00333 00334 if (ofx_start == true && 00335 ( 00336 (libofx_context->currentFileType() == OFX && 00337 ((ofx_start_idx = s_buffer.find("</OFX>")) != string::npos || 00338 (ofx_start_idx = s_buffer.find("</ofx>")) != string::npos)) 00339 || (libofx_context->currentFileType() == OFC && 00340 ((ofx_start_idx = s_buffer.find("</OFC>")) != string::npos || 00341 (ofx_start_idx = s_buffer.find("</ofc>")) != string::npos)) 00342 ) 00343 ) 00344 { 00345 ofx_end = true; 00346 message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found"); 00347 } 00348 00349 } 00350 while (!input_file.eof() && !input_file.bad()); 00351 } 00352 input_file.close(); 00353 tmp_file.close(); 00354 #ifdef HAVE_ICONV 00355 if (file_is_xml == false) 00356 { 00357 iconv_close(conversion_descriptor); 00358 } 00359 #endif 00360 char filename_openspdtd[255]; 00361 char filename_dtd[255]; 00362 char filename_ofx[255]; 00363 strncpy(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME).c_str(), 255); //The opensp sgml dtd file 00364 if (libofx_context->currentFileType() == OFX) 00365 { 00366 strncpy(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME).c_str(), 255); //The ofx dtd file 00367 } 00368 else if (libofx_context->currentFileType() == OFC) 00369 { 00370 strncpy(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME).c_str(), 255); //The ofc dtd file 00371 } 00372 else 00373 { 00374 message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser")); 00375 } 00376 00377 if ((string)filename_dtd != "" && (string)filename_openspdtd != "") 00378 { 00379 strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file 00380 filenames[0] = filename_openspdtd; 00381 filenames[1] = filename_dtd; 00382 filenames[2] = filename_ofx; 00383 if (libofx_context->currentFileType() == OFX) 00384 { 00385 ofx_proc_sgml(libofx_context, 3, filenames); 00386 } 00387 else if (libofx_context->currentFileType() == OFC) 00388 { 00389 ofc_proc_sgml(libofx_context, 3, filenames); 00390 } 00391 else 00392 { 00393 message_out(ERROR, string("ofx_proc_file(): Error unknown file format for the OFX parser")); 00394 } 00395 if (remove(tmp_filename) != 0) 00396 { 00397 message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + string(tmp_filename)); 00398 } 00399 } 00400 else 00401 { 00402 message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting"); 00403 } 00404 } 00405 else 00406 { 00407 message_out(ERROR, "ofx_proc_file():No input file specified"); 00408 } 00409 return 0; 00410 } 00411 00412 00417 string sanitize_proprietary_tags(string input_string) 00418 { 00419 unsigned int i; 00420 size_t input_string_size; 00421 bool strip = false; 00422 bool tag_open = false; 00423 int tag_open_idx = 0; //Are we within < > ? 00424 bool closing_tag_open = false; //Are we within </ > ? 00425 int orig_tag_open_idx = 0; 00426 bool proprietary_tag = false; //Are we within a proprietary element? 00427 bool proprietary_closing_tag = false; 00428 int crop_end_idx = 0; 00429 char buffer[READ_BUFFER_SIZE] = ""; 00430 char tagname[READ_BUFFER_SIZE] = ""; 00431 int tagname_idx = 0; 00432 char close_tagname[READ_BUFFER_SIZE] = ""; 00433 00434 for (i = 0; i < READ_BUFFER_SIZE; i++) 00435 { 00436 buffer[i] = 0; 00437 tagname[i] = 0; 00438 close_tagname[i] = 0; 00439 } 00440 00441 input_string_size = input_string.size(); 00442 00443 for (i = 0; i <= input_string_size; i++) 00444 { 00445 if (input_string.c_str()[i] == '<') 00446 { 00447 tag_open = true; 00448 tag_open_idx = i; 00449 if (proprietary_tag == true && input_string.c_str()[i+1] == '/') 00450 { 00451 //We are now in a closing tag 00452 closing_tag_open = true; 00453 //cout<<"Comparaison: "<<tagname<<"|"<<&(input_string.c_str()[i+2])<<"|"<<strlen(tagname)<<endl; 00454 if (strncmp(tagname, &(input_string.c_str()[i+2]), strlen(tagname)) != 0) 00455 { 00456 //If it is the begining of an other tag 00457 //cout<<"DIFFERENT!"<<endl; 00458 crop_end_idx = i - 1; 00459 strip = true; 00460 } 00461 else 00462 { 00463 //Otherwise, it is the start of the closing tag of the proprietary tag 00464 proprietary_closing_tag = true; 00465 } 00466 } 00467 else if (proprietary_tag == true) 00468 { 00469 //It is the start of a new tag, following a proprietary tag 00470 crop_end_idx = i - 1; 00471 strip = true; 00472 } 00473 } 00474 else if (input_string.c_str()[i] == '>') 00475 { 00476 tag_open = false; 00477 closing_tag_open = false; 00478 tagname[tagname_idx] = 0; 00479 tagname_idx = 0; 00480 if (proprietary_closing_tag == true) 00481 { 00482 crop_end_idx = i; 00483 strip = true; 00484 } 00485 } 00486 else if (tag_open == true && closing_tag_open == false) 00487 { 00488 if (input_string.c_str()[i] == '.') 00489 { 00490 if (proprietary_tag != true) 00491 { 00492 orig_tag_open_idx = tag_open_idx; 00493 proprietary_tag = true; 00494 } 00495 } 00496 tagname[tagname_idx] = input_string.c_str()[i]; 00497 tagname_idx++; 00498 } 00499 //cerr <<i<<endl; 00500 if (strip == true && orig_tag_open_idx < input_string.size()) 00501 { 00502 input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx); 00503 message_out(INFO, "sanitize_proprietary_tags() (end tag or new tag) removed: " + string(buffer)); 00504 input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1); 00505 i = orig_tag_open_idx - 1; 00506 proprietary_tag = false; 00507 proprietary_closing_tag = false; 00508 closing_tag_open = false; 00509 tag_open = false; 00510 strip = false; 00511 } 00512 00513 }//end for 00514 if (proprietary_tag == true && orig_tag_open_idx < input_string.size()) 00515 { 00516 if (crop_end_idx == 0) //no closing tag 00517 { 00518 crop_end_idx = input_string.size() - 1; 00519 } 00520 input_string.copy(buffer, (crop_end_idx - orig_tag_open_idx) + 1, orig_tag_open_idx); 00521 message_out(INFO, "sanitize_proprietary_tags() (end of line) removed: " + string(buffer)); 00522 input_string.erase(orig_tag_open_idx, (crop_end_idx - orig_tag_open_idx) + 1); 00523 } 00524 return input_string; 00525 } 00526 00527 00528 #ifdef OS_WIN32 00529 static std::string get_dtd_installation_directory() 00530 { 00531 // Partial implementation of 00532 // http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory 00533 char ch_fn[MAX_PATH], *p; 00534 std::string str_fn; 00535 00536 if (!GetModuleFileName(NULL, ch_fn, MAX_PATH)) return ""; 00537 00538 if ((p = strrchr(ch_fn, '\\')) != NULL) 00539 * p = '\0'; 00540 00541 p = strrchr(ch_fn, '\\'); 00542 if (p && (_stricmp(p + 1, "bin") == 0 || 00543 _stricmp(p + 1, "lib") == 0)) 00544 *p = '\0'; 00545 00546 str_fn = ch_fn; 00547 str_fn += "\\share\\libofx\\dtd"; 00548 00549 return str_fn; 00550 } 00551 #endif 00552 00553 00566 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename) 00567 { 00568 string dtd_path_filename; 00569 char *env_dtd_path; 00570 00571 dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir(); 00572 if (!dtd_path_filename.empty()) 00573 { 00574 dtd_path_filename.append(dtd_filename); 00575 ifstream dtd_file(dtd_path_filename.c_str()); 00576 if (dtd_file) 00577 { 00578 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename); 00579 return dtd_path_filename; 00580 } 00581 } 00582 00583 #ifdef OS_WIN32 00584 dtd_path_filename = get_dtd_installation_directory(); 00585 if (!dtd_path_filename.empty()) 00586 { 00587 dtd_path_filename.append(DIRSEP); 00588 dtd_path_filename.append(dtd_filename); 00589 ifstream dtd_file(dtd_path_filename.c_str()); 00590 if (dtd_file) 00591 { 00592 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename); 00593 return dtd_path_filename; 00594 } 00595 } 00596 #endif 00597 /* Search in environement variable OFX_DTD_PATH */ 00598 env_dtd_path = getenv("OFX_DTD_PATH"); 00599 if (env_dtd_path) 00600 { 00601 dtd_path_filename.append(env_dtd_path); 00602 dtd_path_filename.append(DIRSEP); 00603 dtd_path_filename.append(dtd_filename); 00604 ifstream dtd_file(dtd_path_filename.c_str()); 00605 if (!dtd_file) 00606 { 00607 message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename); 00608 } 00609 else 00610 { 00611 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename); 00612 return dtd_path_filename; 00613 } 00614 } 00615 00616 for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++) 00617 { 00618 dtd_path_filename = DTD_SEARCH_PATH[i]; 00619 dtd_path_filename.append(DIRSEP); 00620 dtd_path_filename.append(dtd_filename); 00621 ifstream dtd_file(dtd_path_filename.c_str()); 00622 if (!dtd_file) 00623 { 00624 message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename); 00625 } 00626 else 00627 { 00628 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename); 00629 return dtd_path_filename; 00630 } 00631 } 00632 00633 /* Last resort, look in source tree relative path (useful for development) */ 00634 dtd_path_filename = ""; 00635 dtd_path_filename.append(".."); 00636 dtd_path_filename.append(DIRSEP); 00637 dtd_path_filename.append("dtd"); 00638 dtd_path_filename.append(DIRSEP); 00639 dtd_path_filename.append(dtd_filename); 00640 ifstream dtd_file(dtd_path_filename.c_str()); 00641 if (!dtd_file) 00642 { 00643 message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree."); 00644 } 00645 else 00646 { 00647 message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename); 00648 return dtd_path_filename; 00649 } 00650 00651 00652 message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename); 00653 return ""; 00654 } 00655 00656