SHOGUN
v2.0.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max Planck Society 00009 */ 00010 #ifndef __BITSTRING_H__ 00011 #define __BITSTRING_H__ 00012 00013 #include <shogun/features/Alphabet.h> 00014 #include <shogun/lib/common.h> 00015 #include <shogun/io/SGIO.h> 00016 #include <shogun/io/MemoryMappedFile.h> 00017 #include <shogun/mathematics/Math.h> 00018 00019 namespace shogun 00020 { 00028 class CBitString : public CSGObject 00029 { 00030 public: 00032 CBitString() { 00033 SG_UNSTABLE("CBitString::CBitString()", "\n"); 00034 00035 alphabet = NULL; 00036 string = NULL; 00037 length = 0; 00038 word_len = 0; 00039 mask = 0; 00040 single_mask = 0; 00041 } 00042 00050 CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0) 00051 { 00052 alphabet=new CAlphabet(alpha); 00053 int32_t nbits=alphabet->get_num_bits(); 00054 word_len = width*nbits; 00055 00056 mask=0; 00057 for (int32_t j=0; j<word_len; j++) 00058 mask=(mask<<1) | (uint64_t) 1; 00059 mask<<=sizeof(uint64_t)*8-word_len; 00060 00061 single_mask=0; 00062 for (int32_t j=0; j<nbits; j++) 00063 mask=(mask<<1) | (uint64_t) 1; 00064 } 00065 00067 ~CBitString() 00068 { 00069 cleanup(); 00070 SG_UNREF(alphabet); 00071 } 00072 00074 void cleanup() 00075 { 00076 SG_FREE(string); 00077 string=NULL; 00078 length=0; 00079 } 00080 00086 void obtain_from_char(char* str, uint64_t len) 00087 { 00088 cleanup(); 00089 uint64_t stream_len=len/sizeof(uint64_t)+1; 00090 string=SG_MALLOC(uint64_t, stream_len); 00091 length=len; 00092 00093 uint64_t w=0; 00094 int32_t nbits=alphabet->get_num_bits(); 00095 uint64_t j=0; 00096 uint64_t nfit=8*sizeof(w)/nbits; 00097 for (uint64_t i=0; i<len; i++) 00098 { 00099 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]); 00100 00101 if (i % nfit == nfit-1) 00102 { 00103 string[j]=w; 00104 j++; 00105 } 00106 } 00107 00108 if (j<stream_len) 00109 { 00110 string[j]=w; 00111 j++; 00112 } 00113 00114 ASSERT(j==stream_len); 00115 } 00116 00123 void load_fasta_file(const char* fname, bool ignore_invalid=false) 00124 { 00125 cleanup(); 00126 00127 uint64_t len=0; 00128 uint64_t offs=0; 00129 00130 CMemoryMappedFile<char> f(fname); 00131 00132 uint64_t id_len=0; 00133 char* id=f.get_line(id_len, offs); 00134 00135 if (!id_len || id[0]!='>') 00136 SG_SERROR("No fasta hunks (lines starting with '>') found\n"); 00137 00138 if (offs==f.get_size()) 00139 SG_SERROR("Empty file?\n"); 00140 00141 char* fasta=NULL; 00142 char* s=NULL; 00143 int32_t spanned_lines=0; 00144 int32_t fasta_len=0; 00145 00146 while (true) 00147 { 00148 s=f.get_line(len, offs); 00149 00150 if (!fasta) 00151 fasta=s; 00152 00153 if (!s || len==0) 00154 SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len); 00155 00156 if (s[0]=='>') 00157 SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n"); 00158 00159 if (offs==f.get_size()) 00160 { 00161 offs-=len+1; // seek to beginning 00162 fasta_len+=len; 00163 00164 uint64_t w=0; 00165 int32_t nbits=alphabet->get_num_bits(); 00166 uint64_t nfit=8*sizeof(w)/nbits; 00167 00168 len = fasta_len-spanned_lines; 00169 uint64_t stream_len=len/(nfit)+1; 00170 string=SG_MALLOC(uint64_t, stream_len); 00171 length=len; 00172 00173 uint64_t idx=0; 00174 int32_t k=0; 00175 00176 for (int32_t j=0; j<fasta_len; j++, k++) 00177 { 00178 if (fasta[j]=='\n') 00179 { 00180 k--; 00181 continue; 00182 } 00183 00184 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]); 00185 00186 if (k % nfit == nfit-1) 00187 { 00188 string[idx]=w; 00189 w=0; 00190 idx++; 00191 } 00192 } 00193 00194 if (idx<stream_len) 00195 string[idx]=w<<(nbits*(nfit - k%nfit)); 00196 00197 break; 00198 } 00199 00200 spanned_lines++; 00201 fasta_len+=len+1; // including '\n' 00202 } 00203 } 00204 00210 void set_string(uint64_t* str, uint64_t len) 00211 { 00212 cleanup(); 00213 string=str; 00214 length=len; 00215 } 00216 00221 void create(uint64_t len) 00222 { 00223 cleanup(); 00224 uint64_t stream_len=len/sizeof(uint64_t)+1; 00225 string=SG_MALLOC(uint64_t, stream_len); 00226 SGVector<uint64_t>::fill_vector(string, (int32_t) stream_len, (uint64_t) 0); 00227 length=len; 00228 } 00229 00230 /* 00231 inline uint64_t condense(uint64_t bits, uint64_t mask) const 00232 { 00233 uint64_t res = 0 ; 00234 uint64_t m=mask ; 00235 uint64_t b=bits ; 00236 char cnt=0 ; 00237 00238 char ar[256][256] ; 00239 00240 for (int i=0; i<8; i++) 00241 { 00242 //fprintf(stdout, "%i %lx %lx %lx %i\n", i, res, m, b, (int)cnt) ; 00243 if (m&1) 00244 res = res>>8 | ar[b&255][m&255] ; 00245 //else 00246 // cnt++ ; 00247 m=m>>8 ; 00248 b=b>>8 ; 00249 } 00250 res=res>>cnt ; 00251 //fprintf(stdout, "%lx %lx %lx\n", res, m, b) ; 00252 00253 //res = res & bits & mask ; 00254 00255 return res ; 00256 } 00257 */ 00258 00263 inline uint64_t operator[](uint64_t index) const 00264 { 00265 ASSERT(index<length); 00266 00267 uint64_t bitindex=alphabet->get_num_bits()*index; 00268 int32_t ws=8*sizeof(uint64_t); 00269 uint64_t i=bitindex/ws; 00270 int32_t j=bitindex % ws; 00271 int32_t missing=word_len-(ws-j); 00272 00273 //SG_SPRINT("i=%lld j=%d ws=%d word_len=%d missing=%d left=%llx shift=%d\n", i, j, ws, word_len, missing, ( string[i] << j ) & mask, ws-word_len); 00274 uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len); 00275 00276 if (missing>0) 00277 res|= ( string[i+1] >> (ws-missing) ); 00278 00279 //res = condense(res, 1<<31|1<<24|1<<10|1<<8|1<<4|1<<2|1<<1|1) ; 00280 00281 return res; 00282 } 00283 00289 inline void set_binary_word(uint16_t word, uint64_t index) 00290 { 00291 ASSERT(index<length); 00292 00293 uint64_t bitindex=alphabet->get_num_bits()*index; 00294 int32_t ws=8*sizeof(uint64_t); 00295 uint64_t i=bitindex/ws; 00296 int32_t j=bitindex % ws; 00297 int32_t missing=word_len-(ws-j); 00298 00299 uint64_t sl = j-word_len; 00300 uint64_t ml; 00301 uint64_t wl; 00302 uint64_t mr; 00303 uint64_t wr; 00304 00305 if (sl>0) 00306 { 00307 ml=mask<<sl; 00308 wl=word<<sl ; 00309 } 00310 else 00311 { 00312 sl=-sl; 00313 ml=mask>>sl; 00314 wl=word>>sl; 00315 } 00316 00317 string[i] = ( string[i] & (~ml) ) | ( wl & ml); 00318 00319 if (missing>0) 00320 { 00321 mr = mask<<missing; 00322 wr = word<<missing; 00323 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr); 00324 } 00325 00326 } 00327 00329 inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; } 00330 00332 inline virtual const char* get_name() const { return "BitString"; } 00333 00334 private: 00336 CAlphabet* alphabet; 00338 uint64_t* string; 00340 uint64_t length; 00342 int32_t word_len; 00344 uint64_t mask; 00346 uint64_t single_mask; 00347 }; 00348 } 00349 #endif //__BITSTRING_H__