// SHOGUN v2.0.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Gunnar Raetsch 00008 * Written (W) 1999-2009 Soeren Sonnenburg 00009 * Written (W) 2008-2009 Jonas Behr 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef __CDYNPROG_H__ 00014 #define __CDYNPROG_H__ 00015 00016 #include <shogun/mathematics/Math.h> 00017 #include <shogun/lib/common.h> 00018 #include <shogun/base/SGObject.h> 00019 #include <shogun/io/SGIO.h> 00020 #include <shogun/lib/config.h> 00021 #include <shogun/structure/PlifMatrix.h> 00022 #include <shogun/structure/PlifBase.h> 00023 #include <shogun/structure/Plif.h> 00024 #include <shogun/structure/IntronList.h> 00025 #include <shogun/structure/SegmentLoss.h> 00026 #include <shogun/features/StringFeatures.h> 00027 #include <shogun/features/SparseFeatures.h> 00028 #include <shogun/distributions/Distribution.h> 00029 #include <shogun/lib/DynamicArray.h> 00030 #include <shogun/lib/DynamicObjectArray.h> 00031 #include <shogun/lib/Time.h> 00032 00033 #include <stdio.h> 00034 #include <limits.h> 00035 00036 namespace shogun 00037 { 00038 template <class T> class CSparseFeatures; 00039 class CIntronList; 00040 class CPlifMatrix; 00041 class CSegmentLoss; 00042 00043 template <class T> class CDynamicArray; 00044 00045 //#define DYNPROG_TIMING 00046 00047 #ifdef USE_BIGSTATES 00048 typedef uint16_t T_STATES ; 00049 #else 00050 typedef uint8_t T_STATES ; 00051 #endif 00052 typedef T_STATES* P_STATES ; 00053 00054 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00055 00056 struct segment_loss_struct 00057 { 00059 int32_t maxlookback; 00061 int32_t seqlen; 00063 int32_t *segments_changed; 00065 float64_t *num_segment_id; 00067 int32_t *length_segment_id ; 00068 
}; 00069 #endif 00070 00076 class CDynProg : public CSGObject 00077 { 00078 public: 00083 CDynProg(int32_t p_num_svms=8); 00084 virtual ~CDynProg(); 00085 00086 // model related functions 00092 void set_num_states(int32_t N); 00093 00095 int32_t get_num_states(); 00096 00098 int32_t get_num_svms(); 00099 00105 void init_content_svm_value_array(const int32_t p_num_svms); 00106 00114 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes); 00115 00122 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs); 00123 00128 void resize_lin_feat(int32_t num_new_feat); 00133 void set_p_vector(SGVector<float64_t> p); 00134 00139 void set_q_vector(SGVector<float64_t> q); 00140 00145 void set_a(SGMatrix<float64_t> a); 00146 00151 void set_a_id(SGMatrix<int32_t> a); 00152 00157 void set_a_trans_matrix(SGMatrix<float64_t> a_trans); 00158 00163 void init_mod_words_array(SGMatrix<int32_t> p_mod_words_array); 00164 00170 bool check_svm_arrays(); 00171 00176 void set_observation_matrix(SGNDArray<float64_t> seq); 00177 00184 int32_t get_num_positions(); 00185 00195 void set_content_type_array(SGMatrix<float64_t> seg_path); 00196 00201 void set_pos(SGVector<int32_t> pos); 00202 00208 void set_orf_info(SGMatrix<int32_t> orf_info); 00209 00214 void set_gene_string(SGVector<char> genestr); 00215 00216 00221 void set_dict_weights(SGMatrix<float64_t> dictionary_weights); 00222 00227 void best_path_set_segment_loss(SGMatrix<float64_t> segment_loss); 00228 00235 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m); 00236 00238 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2); 00239 00244 void set_plif_matrices(CPlifMatrix* pm); 00245 00246 // best_path result retrieval functions 00251 SGVector<float64_t> get_scores(); 00252 00257 SGMatrix<int32_t> get_states(); 00258 00263 SGMatrix<int32_t> get_positions(); 
00264 00265 00274 void compute_nbest_paths(int32_t max_num_signals, 00275 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences); 00276 00278 00290 void best_path_trans_deriv( 00291 int32_t* my_state_seq, int32_t *my_pos_seq, 00292 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals); 00293 00294 // additional best_path_trans_deriv functions 00299 void set_my_state_seq(int32_t* my_state_seq); 00300 00305 void set_my_pos_seq(int32_t* my_pos_seq); 00306 00314 void get_path_scores(float64_t** my_scores, int32_t* seq_len); 00315 00323 void get_path_losses(float64_t** my_losses, int32_t* seq_len); 00324 00325 00327 inline T_STATES get_N() const 00328 { 00329 return m_N ; 00330 } 00331 00336 inline void set_q(T_STATES offset, float64_t value) 00337 { 00338 m_end_state_distribution_q[offset]=value; 00339 } 00340 00345 inline void set_p(T_STATES offset, float64_t value) 00346 { 00347 m_initial_state_distribution_p[offset]=value; 00348 } 00349 00356 inline void set_a(T_STATES line_, T_STATES column, float64_t value) 00357 { 00358 m_transition_matrix_a.element(line_,column)=value; // look also best_path! 
00359 } 00360 00366 inline float64_t get_q(T_STATES offset) const 00367 { 00368 return m_end_state_distribution_q[offset]; 00369 } 00370 00376 inline float64_t get_q_deriv(T_STATES offset) const 00377 { 00378 return m_end_state_distribution_q_deriv[offset]; 00379 } 00380 00386 inline float64_t get_p(T_STATES offset) const 00387 { 00388 return m_initial_state_distribution_p[offset]; 00389 } 00390 00396 inline float64_t get_p_deriv(T_STATES offset) const 00397 { 00398 return m_initial_state_distribution_p_deriv[offset]; 00399 } 00400 00404 void precompute_content_values(); 00405 00412 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 00413 { 00414 m_lin_feat.get_array_size(dim1, dim2); 00415 return m_lin_feat.get_array(); 00416 } 00425 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 00426 { 00427 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true); 00428 } 00433 void create_word_string(); 00434 00437 void precompute_stop_codons(); 00438 00445 inline float64_t get_a(T_STATES line_, T_STATES column) const 00446 { 00447 return m_transition_matrix_a.element(line_, column); // look also best_path()! 00448 } 00449 00456 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const 00457 { 00458 return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()! 
00459 } 00461 00466 void set_intron_list(CIntronList* intron_list, int32_t num_plifs); 00467 00469 CSegmentLoss* get_segment_loss_object() 00470 { 00471 return m_seg_loss_obj; 00472 } 00473 00480 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len) 00481 { 00482 m_long_transitions = use_long_transitions; 00483 m_long_transition_threshold = threshold; 00484 SG_DEBUG("ignoring max_len\n") ; 00485 //m_long_transition_max = max_len; 00486 } 00487 00488 protected: 00489 00490 /* helper functions */ 00491 00501 void lookup_content_svm_values(const int32_t from_state, 00502 const int32_t to_state, const int32_t from_pos, const int32_t to_pos, 00503 float64_t* svm_values, int32_t frame); 00504 00512 inline void lookup_tiling_plif_values(const int32_t from_state, 00513 const int32_t to_state, const int32_t len, float64_t* svm_values); 00514 00519 inline int32_t find_frame(const int32_t from_state); 00520 00529 inline int32_t raw_intensities_interval_query( 00530 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type); 00531 00532 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00533 00534 struct svm_values_struct 00535 { 00537 int32_t maxlookback; 00539 int32_t seqlen; 00540 00542 int32_t* start_pos; 00544 float64_t ** svm_values_unnormalized; 00546 float64_t * svm_values; 00548 bool *** word_used; 00550 int32_t **num_unique_words; 00551 }; 00552 #endif // DOXYGEN_SHOULD_SKIP_THIS 00553 00562 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to); 00563 00565 inline virtual const char* get_name() const { return "DynProg"; } 00566 00567 private: 00568 00569 T_STATES trans_list_len; 00570 T_STATES **trans_list_forward; 00571 T_STATES *trans_list_forward_cnt; 00572 float64_t **trans_list_forward_val; 00573 int32_t **trans_list_forward_id; 00574 bool mem_initialized; 00575 00576 #ifdef DYNPROG_TIMING 00577 CTime MyTime; 00578 CTime MyTime2; 00579 CTime MyTime3; 00580 00581 float64_t 
segment_init_time; 00582 float64_t segment_pos_time; 00583 float64_t segment_clean_time; 00584 float64_t segment_extend_time; 00585 float64_t orf_time; 00586 float64_t content_time; 00587 float64_t content_penalty_time; 00588 float64_t content_svm_values_time ; 00589 float64_t content_plifs_time ; 00590 float64_t svm_init_time; 00591 float64_t svm_pos_time; 00592 float64_t inner_loop_time; 00593 float64_t inner_loop_max_time ; 00594 float64_t svm_clean_time; 00595 float64_t long_transition_time ; 00596 #endif 00597 00598 00599 protected: 00604 00605 int32_t m_N; 00606 00608 CDynamicArray<int32_t> m_transition_matrix_a_id; // 2d 00609 CDynamicArray<float64_t> m_transition_matrix_a; // 2d 00610 CDynamicArray<float64_t> m_transition_matrix_a_deriv; // 2d 00611 00613 CDynamicArray<float64_t> m_initial_state_distribution_p; 00614 CDynamicArray<float64_t> m_initial_state_distribution_p_deriv; 00615 00617 CDynamicArray<float64_t> m_end_state_distribution_q; 00618 CDynamicArray<float64_t> m_end_state_distribution_q_deriv; 00619 00621 00623 int32_t m_num_degrees; 00625 int32_t m_num_svms; 00626 00628 CDynamicArray<int32_t> m_word_degree; 00630 CDynamicArray<int32_t> m_cum_num_words; 00632 int32_t * m_cum_num_words_array; 00634 CDynamicArray<int32_t> m_num_words; 00636 int32_t* m_num_words_array; 00638 CDynamicArray<int32_t> m_mod_words; // 2d 00640 int32_t* m_mod_words_array; 00642 CDynamicArray<bool> m_sign_words; 00644 bool* m_sign_words_array; 00646 CDynamicArray<int32_t> m_string_words; 00648 int32_t* m_string_words_array; 00649 00651 // CDynamicArray<int32_t> m_svm_pos_start; 00653 CDynamicArray<int32_t> m_num_unique_words; 00655 bool m_svm_arrays_clean; 00657 int32_t m_max_a_id; 00658 00659 // input arguments 00661 CDynamicArray<float64_t> m_observation_matrix; //3d 00663 CDynamicArray<int32_t> m_pos; 00665 int32_t m_seq_len; 00667 CDynamicArray<int32_t> m_orf_info; // 2d 00669 CDynamicArray<float64_t> m_segment_sum_weights; // 2d 00671 CDynamicObjectArray 
m_plif_list; // CPlifBase* 00673 CDynamicObjectArray m_PEN; // 2d, CPlifBase* 00675 CDynamicObjectArray m_PEN_state_signals; // 2d, CPlifBase* 00677 CDynamicArray<char> m_genestr; 00692 uint16_t*** m_wordstr; 00694 CDynamicArray<float64_t> m_dict_weights; // 2d 00696 CDynamicArray<float64_t> m_segment_loss; // 3d 00698 CDynamicArray<int32_t> m_segment_ids; 00700 CDynamicArray<float64_t> m_segment_mask; 00702 CDynamicArray<int32_t> m_my_state_seq; 00704 CDynamicArray<int32_t> m_my_pos_seq; 00706 CDynamicArray<float64_t> m_my_scores; 00708 CDynamicArray<float64_t> m_my_losses; 00709 00712 CSegmentLoss* m_seg_loss_obj; 00713 00714 // output arguments 00716 CDynamicArray<float64_t> m_scores; 00718 CDynamicArray<int32_t> m_states; // 2d 00720 CDynamicArray<int32_t> m_positions; // 2d 00721 00723 CSparseFeatures<float64_t>* m_seq_sparse1; 00725 CSparseFeatures<float64_t>* m_seq_sparse2; 00727 CPlifMatrix* m_plif_matrices; 00728 00732 CDynamicArray<bool> m_genestr_stop; 00733 00736 CIntronList* m_intron_list; 00737 00739 int32_t m_num_intron_plifs; 00740 00745 CDynamicArray<float64_t> m_lin_feat; // 2d 00746 00748 float64_t *m_raw_intensities; 00750 int32_t* m_probe_pos; 00752 int32_t* m_num_probes_cum; 00754 int32_t* m_num_lin_feat_plifs_cum; 00756 int32_t m_num_raw_data; 00757 00759 bool m_long_transitions ; 00762 int32_t m_long_transition_threshold ; 00767 //int32_t m_long_transition_max ; 00768 00772 static int32_t word_degree_default[4]; 00773 00777 static int32_t cum_num_words_default[5]; 00778 00781 static int32_t frame_plifs[3]; 00782 00785 static int32_t num_words_default[4]; 00786 00788 static int32_t mod_words_default[32]; 00789 00791 static bool sign_words_default[16]; 00792 00794 static int32_t string_words_default[16]; 00795 }; 00796 } 00797 #endif