SHOGUN
v2.0.0
|
00001 #include <shogun/features/StringFileFeatures.h> 00002 00003 namespace shogun 00004 { 00005 00006 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures() : CStringFeatures<ST>(), file(NULL) 00007 { 00008 } 00009 00010 template <class ST> CStringFileFeatures<ST>::CStringFileFeatures(const char* fname, EAlphabet alpha) 00011 : CStringFeatures<ST>(alpha) 00012 { 00013 file = new CMemoryMappedFile<ST>(fname); 00014 fetch_meta_info_from_file(); 00015 } 00016 00017 template <class ST> CStringFileFeatures<ST>::~CStringFileFeatures() 00018 { 00019 SG_UNREF(file); 00020 CStringFileFeatures<ST>::cleanup(); 00021 } 00022 00023 template <class ST> ST* CStringFileFeatures<ST>::get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length) 00024 { 00025 ST* s = file->get_map(); 00026 for (uint64_t i=offs; i<file_length; i++) 00027 { 00028 ST c=s[i]; 00029 00030 if (c == '\n') 00031 { 00032 ST* line=&s[offs]; 00033 len=i-offs; 00034 offs=i+1; 00035 line_nr++; 00036 return line; 00037 } 00038 else 00039 { 00040 if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c)) 00041 { 00042 CStringFileFeatures<ST>::cleanup(); 00043 CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr); 00044 } 00045 } 00046 } 00047 00048 len=0; 00049 offs=file_length; 00050 return NULL; 00051 } 00052 00053 template <class ST> void CStringFileFeatures<ST>::cleanup() 00054 { 00055 CStringFeatures<ST>::num_vectors=0; 00056 SG_FREE(CStringFeatures<ST>::features); 00057 SG_FREE(CStringFeatures<ST>::symbol_mask_table); 00058 CStringFeatures<ST>::features=NULL; 00059 CStringFeatures<ST>::symbol_mask_table=NULL; 00060 00061 /* start with a fresh alphabet, but instead of emptying the histogram 00062 * create a new object (to leave the alphabet object alone if it is used 00063 * by others) 00064 */ 00065 CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet()); 00066 SG_UNREF(CStringFeatures<ST>::alphabet); 00067 CStringFeatures<ST>::alphabet=alpha; 00068 SG_REF(CStringFeatures<ST>::alphabet); 00069 } 00070 00071 template <class ST> void CStringFileFeatures<ST>::cleanup_feature_vector(int32_t num) 00072 { 00073 CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not" 00074 "supported by StringFileFeatures\n"); 00075 } 00076 00077 template <class ST> void CStringFileFeatures<ST>::fetch_meta_info_from_file(int32_t granularity) 00078 { 00079 CStringFileFeatures<ST>::cleanup(); 00080 uint64_t file_size=file->get_size(); 00081 ASSERT(granularity>=1); 00082 ASSERT(CStringFeatures<ST>::alphabet); 00083 00084 int64_t buffer_size=granularity; 00085 CStringFeatures<ST>::features=SG_MALLOC(SGString<ST>, buffer_size); 00086 00087 uint64_t offs=0; 00088 uint64_t len=0; 00089 CStringFeatures<ST>::max_string_length=0; 00090 CStringFeatures<ST>::num_vectors=0; 00091 00092 while (true) 00093 { 00094 ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size); 00095 00096 if (line) 00097 { 00098 if (CStringFeatures<ST>::num_vectors > buffer_size) 00099 { 00100 CStringFeatures<ST>::features = SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, buffer_size+granularity); 00101 buffer_size+=granularity; 00102 } 00103 00104 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line; 00105 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].slen=len; 00106 CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len); 00107 } 00108 else 00109 break; 00110 } 00111 00112 CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors); 00113 CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length); 00114 CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram()); 00115 CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram()); 00116 00117 if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet()) 00118 CStringFileFeatures<ST>::cleanup(); 00119 00120 CStringFeatures<ST>::features=SG_REALLOC(SGString<ST>, CStringFeatures<ST>::features, CStringFeatures<ST>::num_vectors); 00121 } 00122 00123 template class CStringFileFeatures<bool>; 00124 template class CStringFileFeatures<char>; 00125 template class CStringFileFeatures<int8_t>; 00126 template class CStringFileFeatures<uint8_t>; 00127 template class CStringFileFeatures<int16_t>; 00128 template class CStringFileFeatures<uint16_t>; 00129 template class CStringFileFeatures<int32_t>; 00130 template class CStringFileFeatures<uint32_t>; 00131 template class CStringFileFeatures<int64_t>; 00132 template class CStringFileFeatures<uint64_t>; 00133 template class CStringFileFeatures<float32_t>; 00134 template class CStringFileFeatures<float64_t>; 00135 template class CStringFileFeatures<floatmax_t>; 00136 }