// SHOGUN v2.0.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <shogun/ui/GUIFeatures.h> 00013 #include <shogun/ui/SGInterface.h> 00014 00015 #include <shogun/lib/config.h> 00016 #include <shogun/io/SGIO.h> 00017 #include <shogun/io/AsciiFile.h> 00018 00019 using namespace shogun; 00020 00021 CGUIFeatures::CGUIFeatures(CSGInterface* ui_) 00022 : CSGObject(), ui(ui_), train_features(NULL), test_features(NULL), 00023 ref_features(NULL) 00024 { 00025 } 00026 00027 CGUIFeatures::~CGUIFeatures() 00028 { 00029 SG_UNREF(train_features); 00030 SG_UNREF(test_features); 00031 SG_UNREF(ref_features); 00032 } 00033 00034 void CGUIFeatures::invalidate_train() 00035 { 00036 CKernel *k = ui->ui_kernel->get_kernel(); 00037 if (k) 00038 k->remove_lhs(); 00039 } 00040 00041 void CGUIFeatures::invalidate_test() 00042 { 00043 CKernel *k = ui->ui_kernel->get_kernel(); 00044 if (k) 00045 k->remove_rhs(); 00046 } 00047 00048 bool CGUIFeatures::load( 00049 char* filename, char* fclass, char* type, char* target, int32_t size, 00050 int32_t comp_features) 00051 { 00052 bool result=false; 00053 CFeatures** f_ptr=NULL; 00054 00055 if (strncmp(target, "TRAIN", 5)==0) 00056 { 00057 f_ptr=&train_features; 00058 invalidate_train(); 00059 } 00060 else if (strncmp(target, "TEST", 4)==0) 00061 { 00062 f_ptr=&test_features; 00063 invalidate_test(); 00064 } 00065 else 00066 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target); 00067 00068 SG_UNREF(*f_ptr); 00069 *f_ptr=NULL; 00070 00071 CAsciiFile* file=new CAsciiFile(filename); 00072 if (strncmp(fclass, "SIMPLE", 
6)==0) 00073 { 00074 if (strncmp(type, "REAL", 4)==0) 00075 { 00076 *f_ptr=new CDenseFeatures<float64_t>(file); 00077 } 00078 else if (strncmp(type, "BYTE", 4)==0) 00079 { 00081 *f_ptr=new CDenseFeatures<uint8_t>(file); 00082 } 00083 else if (strncmp(type, "CHAR", 4)==0) 00084 { 00086 *f_ptr=new CDenseFeatures<char>(file); 00087 } 00088 else if (strncmp(type, "SHORT", 5)==0) 00089 { 00090 *f_ptr=new CDenseFeatures<int16_t>(file); 00091 } 00092 else 00093 { 00094 SG_ERROR("Unknown type.\n"); 00095 return false; 00096 } 00097 } 00098 else if (strncmp(fclass, "SPARSE", 6)==0) 00099 { 00100 SG_NOTIMPLEMENTED; 00101 } 00102 else if (strncmp(fclass, "STRING", 6)==0) 00103 { 00104 if (strncmp(type, "REAL", 4)==0) 00105 { 00106 *f_ptr=new CStringFeatures<float64_t>(file); 00107 } 00108 else if (strncmp(type, "BYTE", 4)==0) 00109 { 00111 *f_ptr=new CStringFeatures<uint8_t>(file, DNA); 00112 } 00113 else if (strncmp(type, "CHAR", 4)==0) 00114 { 00116 *f_ptr=new CStringFeatures<char>(file, DNA); 00117 } 00118 else if (strncmp(type, "SHORT", 5)==0) 00119 { 00120 *f_ptr=new CStringFeatures<int16_t>(file); 00121 } 00122 else if (strncmp(type, "WORD", 4)==0) 00123 { 00124 *f_ptr=new CStringFeatures<uint16_t>(file); 00125 } 00126 else if (strncmp(type, "ULONG", 5)==0) 00127 { 00128 *f_ptr=new CStringFeatures<uint64_t>(file); 00129 } 00130 else 00131 { 00132 SG_ERROR("Unknown type.\n"); 00133 return false; 00134 } 00135 } 00136 SG_UNREF(file); 00137 00138 return result; 00139 } 00140 00141 bool CGUIFeatures::save(char* filename, char* type, char* target) 00142 { 00143 bool result=false; 00144 00145 CFeatures** f_ptr=NULL; 00146 00147 if (strncmp(target, "TRAIN", 5)==0) 00148 { 00149 f_ptr=&train_features; 00150 } 00151 else if (strncmp(target, "TEST", 4)==0) 00152 { 00153 f_ptr=&test_features; 00154 } 00155 else 00156 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target); 00157 00158 if (*f_ptr) 00159 { 00160 try 00161 { 00162 CAsciiFile* file=new CAsciiFile(filename, 
'w'); 00163 if (strncmp(type, "REAL", 4)==0) 00164 { 00165 ((CDenseFeatures<float64_t>*) (*f_ptr))->save(file); 00166 } 00167 else if (strncmp(type, "BYTE", 4)==0) 00168 { 00169 ((CDenseFeatures<uint8_t>*) (*f_ptr))->save(file); 00170 } 00171 else if (strncmp(type, "CHAR", 4)==0) 00172 { 00173 ((CDenseFeatures<char>*) (*f_ptr))->save(file); 00174 } 00175 else if (strncmp(type, "SHORT", 5)==0) 00176 { 00177 ((CDenseFeatures<int16_t>*) (*f_ptr))->save(file); 00178 } 00179 else if (strncmp(type, "WORD", 4)==0) 00180 { 00181 ((CDenseFeatures<uint16_t>*) (*f_ptr))->save(file); 00182 } 00183 else 00184 { 00185 SG_ERROR("Unknown type.\n"); 00186 return false; 00187 } 00188 SG_UNREF(file); 00189 } 00190 catch (...) 00191 { 00192 SG_ERROR("Writing to file %s failed!\n", filename); 00193 } 00194 00195 SG_INFO( "Successfully written features into \"%s\" !\n", filename); 00196 result=true; 00197 00198 } else 00199 SG_ERROR("Set features first.\n"); 00200 00201 return result; 00202 } 00203 00204 bool CGUIFeatures::clean(char* target) 00205 { 00206 if (strncmp(target, "TRAIN", 5)==0) 00207 set_train_features(NULL); 00208 else if (strncmp(target, "TEST", 4)==0) 00209 set_test_features(NULL); 00210 else 00211 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target); 00212 00213 return true; 00214 } 00215 00216 bool CGUIFeatures::reshape(char* target, int32_t num_feat, int32_t num_vec) 00217 { 00218 CFeatures** f_ptr=NULL; 00219 00220 if (strncmp(target, "TRAIN", 5)==0) 00221 { 00222 f_ptr=&train_features; 00223 invalidate_train(); 00224 } 00225 else if (strncmp(target, "TEST", 4)==0) 00226 { 00227 f_ptr=&test_features; 00228 invalidate_test(); 00229 } 00230 else 00231 { 00232 SG_ERROR("Invalid target %s\n", target); 00233 return false; 00234 } 00235 00236 bool result=false; 00237 if (f_ptr) 00238 { 00239 SG_INFO( "reshape data to %d x %d\n", num_feat, num_vec); 00240 result=(*f_ptr)->reshape(num_feat, num_vec); 00241 00242 if (!result) 00243 SG_ERROR("Reshaping 
failed.\n"); 00244 } 00245 00246 return result; 00247 } 00248 00249 CFeatures* CGUIFeatures::get_convert_features(char* target) 00250 { 00251 CFeatures* features; 00252 00253 if (strncmp(target, "TEST", 4)==0) 00254 features=get_test_features(); 00255 else if (strncmp(target, "TRAIN", 5)==0) 00256 features=get_train_features(); 00257 else 00258 return NULL; 00259 00260 if (features->get_feature_class()==C_COMBINED) 00261 features=((CCombinedFeatures*) features)->get_last_feature_obj(); 00262 00263 return features; 00264 } 00265 00266 bool CGUIFeatures::set_convert_features(CFeatures* features, char* target) 00267 { 00268 CFeatures* features_prev; 00269 00270 if (strncmp(target, "TEST", 4)==0) 00271 features_prev=get_test_features(); 00272 else if (strncmp(target, "TRAIN", 5)==0) 00273 features_prev=get_train_features(); 00274 else 00275 return false; 00276 00277 // in case of combined features delete current (==last) feature obj 00278 // pointer from list (feature object got deleted already above) 00279 // and append *f_ptr which holds the newly created feature object 00280 if (features_prev->get_feature_class()==C_COMBINED) 00281 { 00282 CCombinedFeatures* combined=(CCombinedFeatures*) features_prev; 00283 combined->delete_feature_obj(); 00284 combined->append_feature_obj(features); 00285 combined->list_feature_objs(); 00286 } 00287 else // set features to new test/train features 00288 { 00289 if (strncmp(target, "TEST", 4)==0) 00290 set_test_features(features); 00291 else 00292 set_train_features(features); 00293 } 00294 00295 return true; 00296 } 00297 00298 CSparseFeatures<float64_t>* CGUIFeatures::convert_simple_real_to_sparse_real( 00299 CDenseFeatures<float64_t>* src) 00300 { 00301 if (src && 00302 src->get_feature_class()==C_DENSE && 00303 src->get_feature_type()==F_DREAL) 00304 { 00305 //create sparse features with 0 cache 00306 SG_INFO("Attempting to convert dense feature matrix to a sparse one.\n"); 00307 CSparseFeatures<float64_t>* target=new 
CSparseFeatures<float64_t>(0); 00308 int32_t num_f=0; 00309 int32_t num_v=0; 00310 float64_t* feats=src->get_feature_matrix(num_f, num_v); 00311 if (target->set_full_feature_matrix(SGMatrix<float64_t>(feats, num_f, num_v))) 00312 return target; 00313 00314 SG_UNREF(target); 00315 } 00316 else 00317 SG_ERROR("No SIMPLE DREAL features available.\n"); 00318 00319 return NULL; 00320 } 00321 00322 CStringFeatures<char>* CGUIFeatures::convert_simple_char_to_string_char( 00323 CDenseFeatures<char>* src) 00324 { 00325 if (src && src->get_feature_class()==C_DENSE) 00326 { 00327 int32_t num_vec=src->get_num_vectors(); 00328 SGString<char>* strings=SG_MALLOC(SGString<char>, num_vec); 00329 int32_t max_len=-1; 00330 00331 for (int32_t i=0; i<num_vec; i++) 00332 { 00333 bool to_free=false; 00334 int32_t len=0; 00335 char* str=src->get_feature_vector(i, len, to_free); 00336 strings[i].slen=len ; 00337 for (int32_t j=0; j<len; j++) 00338 if (str[j]==0) 00339 { 00340 strings[i].slen=j ; 00341 break ; 00342 } ; 00343 strings[i].string=SG_MALLOC(char, strings[i].slen); 00344 00345 for (int32_t j=0; j<strings[i].slen; j++) 00346 strings[i].string[j]=str[j]; 00347 00348 if (strings[i].slen> max_len) 00349 max_len=strings[i].slen; 00350 00351 src->free_feature_vector(str, i, to_free); 00352 } 00353 00354 CStringFeatures<char>* target=new CStringFeatures<char>(new CAlphabet(DNA)); 00355 target->set_features(strings, num_vec, max_len); 00356 return target; 00357 } 00358 else 00359 SG_ERROR("No features of class/type SIMPLE/CHAR available.\n"); 00360 00361 return NULL; 00362 } 00363 00364 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_word_to_simple_salzberg( 00365 CDenseFeatures<uint16_t>* src) 00366 { 00367 CPluginEstimate* pie=ui->ui_pluginestimate->get_estimator(); 00368 00369 if (src && 00370 src->get_feature_type()==F_WORD && 00371 src->get_feature_class()==C_DENSE && 00372 pie) 00373 { 00374 CDenseFeatures<float64_t>* target=new CDenseFeatures<float64_t>(0); 00375 int32_t 
num_feat=src->get_num_features(); 00376 int32_t num_vec=src->get_num_vectors(); 00377 float64_t* fm=SG_MALLOC(float64_t, num_vec*num_feat); 00378 00379 if (fm) 00380 { 00381 for (int32_t i=0; i<num_vec; i++) 00382 { 00383 int32_t len=0; 00384 bool to_free=false; 00385 uint16_t* vec = src->get_feature_vector(i, len, to_free); 00386 ASSERT(num_feat==len); 00387 00388 for (int32_t j=0; j<num_feat; j++) 00389 fm[i*num_feat+j]= 00390 pie->get_parameterwise_log_odds(vec[j], j); 00391 00392 src->free_feature_vector(vec, i, to_free); 00393 } 00394 target->set_feature_matrix(SGMatrix<float64_t>(fm, num_feat, num_vec)); 00395 00396 } 00397 return target; 00398 } 00399 else 00400 SG_ERROR("No SIMPLE WORD features or PluginEstimator available.\n"); 00401 00402 return NULL; 00403 } 00404 00405 00406 CTOPFeatures* CGUIFeatures::convert_string_word_to_simple_top( 00407 CStringFeatures<uint16_t>* src) 00408 { 00409 CTOPFeatures* tf=NULL; 00410 00411 if (src && 00412 src->get_feature_class()==C_DENSE && 00413 src->get_feature_type()==F_WORD) 00414 { 00415 SG_INFO("Converting to TOP features.\n"); 00416 00417 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg()) 00418 { 00419 ui->ui_hmm->get_pos()->set_observations(src); 00420 ui->ui_hmm->get_neg()->set_observations(src); 00421 00422 bool neglinear=false; 00423 bool poslinear=false; 00424 00425 tf=new CTOPFeatures( 00426 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg(), 00427 neglinear, poslinear); 00428 ASSERT(tf->set_feature_matrix()); 00429 } 00430 else 00431 SG_ERROR("HMMs not correctly assigned!\n"); 00432 } 00433 else 00434 SG_ERROR("No SIMPLE WORD features available.\n"); 00435 00436 return tf; 00437 } 00438 00439 CFKFeatures* CGUIFeatures::convert_string_word_to_simple_fk( 00440 CStringFeatures<uint16_t>* src) 00441 { 00442 CFKFeatures* fkf=NULL; 00443 00444 SG_INFO("Converting to FK features.\n"); 00445 00446 if (ui->ui_hmm->get_pos() && ui->ui_hmm->get_neg()) 00447 { 00448 CStringFeatures<uint16_t>* old_obs_pos= 00449 
ui->ui_hmm->get_pos()->get_observations(); 00450 CStringFeatures<uint16_t>* old_obs_neg= 00451 ui->ui_hmm->get_neg()->get_observations(); 00452 00453 CStringFeatures<uint16_t>* string_feat=src; 00454 ui->ui_hmm->get_pos()->set_observations(string_feat); 00455 ui->ui_hmm->get_neg()->set_observations(string_feat); 00456 00457 fkf=new CFKFeatures( 00458 0, ui->ui_hmm->get_pos(), ui->ui_hmm->get_neg()); 00459 //, neglinear, poslinear); 00460 if (train_features) 00461 fkf->set_opt_a(((CFKFeatures*) train_features)->get_weight_a()); 00462 else 00463 SG_ERROR("Need train features to set optimal a.\n"); 00464 00465 ASSERT(fkf->set_feature_matrix()); 00466 00467 ui->ui_hmm->get_pos()->set_observations(old_obs_pos); 00468 ui->ui_hmm->get_neg()->set_observations(old_obs_neg); 00469 } 00470 else 00471 SG_ERROR("HMMs not correctly assigned!\n"); 00472 00473 return fkf; 00474 } 00475 00476 00477 CDenseFeatures<float64_t>* CGUIFeatures::convert_sparse_real_to_simple_real( 00478 CSparseFeatures<float64_t>* src) 00479 { 00480 if (src && 00481 src->get_feature_class()==C_SPARSE && 00482 src->get_feature_type() == F_DREAL) 00483 { 00484 //create dense features with 0 cache 00485 SG_INFO("Attempting to convert sparse feature matrix to a dense one.\n"); 00486 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0); 00487 if (rf) 00488 { 00489 SGMatrix<float64_t> feats=src->get_full_feature_matrix(); 00490 rf->set_feature_matrix(feats); 00491 return rf; 00492 } 00493 } 00494 else 00495 SG_ERROR("No SPARSE REAL features available.\n"); 00496 00497 return NULL; 00498 } 00499 00500 CExplicitSpecFeatures* CGUIFeatures::convert_string_byte_to_spec_word( 00501 CStringFeatures<uint16_t>* src, bool use_norm) 00502 { 00503 return new CExplicitSpecFeatures(src, use_norm); 00504 } 00505 00506 CDenseFeatures<float64_t>* CGUIFeatures::convert_simple_char_to_simple_align( 00507 CDenseFeatures<char>* src, float64_t gap_cost) 00508 { 00509 if (src && 00510 src->get_feature_class()==C_DENSE && 
00511 src->get_feature_type()==F_CHAR) 00512 { 00513 //create dense features with 0 cache 00514 SG_INFO("Converting CHAR features to REAL ones.\n"); 00515 00516 CDenseFeatures<float64_t>* rf=new CDenseFeatures<float64_t>(0); 00517 if (rf) 00518 { 00519 SG_INFO("Start aligment with gapCost=%1.2f.\n", gap_cost); 00520 /*rf->Align_char_features( 00521 src, (CDenseFeatures<char>*) ref_features, gap_cost);*/ 00522 SG_INFO("Conversion was successful.\n"); 00523 return rf; 00524 } 00525 } 00526 else 00527 SG_ERROR("No SIMPLE CHAR features available.\n"); 00528 00529 SG_ERROR("Conversion failed.\n"); 00530 return NULL; 00531 } 00532 00533 bool CGUIFeatures::set_reference_features(char* target) 00534 { 00535 if (strncmp(target, "TRAIN", 5)==0) 00536 { 00537 SG_UNREF(ref_features); 00538 ref_features=train_features; 00539 train_features=NULL; 00540 invalidate_train(); 00541 return true; 00542 } 00543 else if (strncmp(target, "TEST", 4)==0) 00544 { 00545 SG_UNREF(ref_features); 00546 ref_features=test_features; 00547 test_features=NULL; 00548 invalidate_test(); 00549 return true; 00550 } 00551 00552 return false; 00553 } 00554 00555 void CGUIFeatures::add_train_features(CFeatures* f) 00556 { 00557 ASSERT(f); 00558 invalidate_train(); 00559 00560 if (!train_features) 00561 { 00562 train_features=new CCombinedFeatures(); 00563 SG_REF(train_features); 00564 } 00565 00566 if (train_features->get_feature_class()!=C_COMBINED) 00567 { 00568 CFeatures* first_elem=train_features; 00569 train_features=new CCombinedFeatures(); 00570 SG_REF(train_features); 00571 ((CCombinedFeatures*) train_features)->append_feature_obj(first_elem); 00572 ((CCombinedFeatures*) train_features)->list_feature_objs(); 00573 SG_UNREF(first_elem); 00574 } 00575 00576 bool result=((CCombinedFeatures*) train_features)->append_feature_obj(f); 00577 if (result) 00578 ((CCombinedFeatures*) train_features)->list_feature_objs(); 00579 else 00580 SG_ERROR("appending feature object failed\n"); 00581 } 00582 00583 void 
CGUIFeatures::add_train_dotfeatures(CDotFeatures* f) 00584 { 00585 ASSERT(f); 00586 SG_PRINT("DOTFVEC %d\n", f->get_num_vectors()); 00587 invalidate_train(); 00588 00589 if (!train_features) 00590 { 00591 train_features=new CCombinedDotFeatures(); 00592 SG_REF(train_features); 00593 } 00594 00595 if (train_features->get_feature_class()!=C_COMBINED_DOT) 00596 { 00597 if (!train_features->has_property(FP_DOT)) 00598 SG_ERROR("Trainfeatures not based on DotFeatures.\n"); 00599 00600 CDotFeatures* first_elem=(CDotFeatures*) train_features; 00601 train_features=new CCombinedDotFeatures(); 00602 SG_REF(train_features); 00603 ((CCombinedDotFeatures*) train_features)->append_feature_obj(first_elem); 00604 ((CCombinedDotFeatures*) train_features)->list_feature_objs(); 00605 SG_UNREF(first_elem); 00606 } 00607 00608 bool result=((CCombinedDotFeatures*) train_features)->append_feature_obj(f); 00609 if (result) 00610 ((CCombinedDotFeatures*) train_features)->list_feature_objs(); 00611 else 00612 SG_ERROR("appending dot feature object failed\n"); 00613 } 00614 00615 void CGUIFeatures::add_test_dotfeatures(CDotFeatures* f) 00616 { 00617 ASSERT(f); 00618 invalidate_test(); 00619 00620 if (!test_features) 00621 { 00622 test_features=new CCombinedDotFeatures(); 00623 SG_REF(test_features); 00624 } 00625 00626 if (test_features->get_feature_class()!=C_COMBINED_DOT) 00627 { 00628 if (!test_features->has_property(FP_DOT)) 00629 SG_ERROR("Trainfeatures not based on DotFeatures.\n"); 00630 00631 CDotFeatures* first_elem=(CDotFeatures*) test_features; 00632 test_features=new CCombinedDotFeatures(); 00633 SG_REF(test_features); 00634 ((CCombinedDotFeatures*) test_features)->append_feature_obj(first_elem); 00635 ((CCombinedDotFeatures*) test_features)->list_feature_objs(); 00636 SG_UNREF(first_elem); 00637 } 00638 00639 bool result=((CCombinedDotFeatures*) test_features)->append_feature_obj(f); 00640 if (result) 00641 ((CCombinedDotFeatures*) test_features)->list_feature_objs(); 00642 else 
00643 SG_ERROR("Appending feature object failed.\n"); 00644 } 00645 00646 void CGUIFeatures::add_test_features(CFeatures* f) 00647 { 00648 ASSERT(f); 00649 invalidate_test(); 00650 00651 if (!test_features) 00652 { 00653 test_features=new CCombinedFeatures(); 00654 SG_REF(test_features); 00655 } 00656 00657 if (test_features->get_feature_class()!=C_COMBINED) 00658 { 00659 CFeatures* first_elem=test_features; 00660 test_features=new CCombinedFeatures(); 00661 SG_REF(test_features); 00662 ((CCombinedFeatures*) test_features)->append_feature_obj(first_elem); 00663 ((CCombinedFeatures*) test_features)->list_feature_objs(); 00664 SG_UNREF(first_elem); 00665 } 00666 00667 bool result=((CCombinedFeatures*) test_features)->append_feature_obj(f); 00668 if (result) 00669 ((CCombinedFeatures*) test_features)->list_feature_objs(); 00670 else 00671 SG_ERROR("Appending feature object failed.\n"); 00672 } 00673 00674 bool CGUIFeatures::del_last_feature_obj(char* target) 00675 { 00676 CCombinedFeatures* cf=NULL; 00677 if (strncmp(target, "TRAIN", 5)==0) 00678 { 00679 if (!train_features) 00680 SG_ERROR("No train features available.\n"); 00681 if (train_features->get_feature_class()!=C_COMBINED) 00682 SG_ERROR("Train features are not combined features.\n"); 00683 00684 cf=(CCombinedFeatures*) train_features; 00685 } 00686 else if (strncmp(target, "TEST", 4)==0) 00687 { 00688 if (!test_features) 00689 SG_ERROR("No test features available.\n"); 00690 if (test_features->get_feature_class()!=C_COMBINED) 00691 SG_ERROR("Test features are not combined features.\n"); 00692 00693 cf=(CCombinedFeatures*) test_features; 00694 } 00695 else 00696 SG_ERROR("Unknown target %s, neither TRAIN nor TEST.\n", target); 00697 00698 if (!cf->delete_feature_obj()) 00699 SG_ERROR("No features available to delete.\n"); 00700 00701 return false; 00702 }