SHOGUN
v2.0.0
|
00001 /* 00002 * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights 00003 * embodied in the content of this file are licensed under the BSD 00004 * (revised) open source license. 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * Written (W) 2011 Shashwat Lal Das 00012 * Adaptation of Vowpal Wabbit v5.1. 00013 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society. 00014 */ 00015 00016 #ifndef _VW_PARSER_H__ 00017 #define _VW_PARSER_H__ 00018 00019 #include <shogun/base/SGObject.h> 00020 #include <shogun/io/SGIO.h> 00021 #include <shogun/lib/Hash.h> 00022 #include <shogun/classifier/vw/vw_common.h> 00023 #include <shogun/classifier/vw/cache/VwCacheWriter.h> 00024 00025 namespace shogun 00026 { 00028 enum E_VW_PARSER_TYPE 00029 { 00030 T_VW = 1, 00031 T_SVMLIGHT = 2, 00032 T_DENSE = 3 00033 }; 00034 00046 class CVwParser: public CSGObject 00047 { 00048 public: 00052 CVwParser(); 00053 00059 CVwParser(CVwEnvironment* env_to_use); 00060 00064 virtual ~CVwParser(); 00065 00071 CVwEnvironment* get_env() 00072 { 00073 SG_REF(env); 00074 return env; 00075 } 00076 00082 void set_env(CVwEnvironment* env_to_use) 00083 { 00084 env = env_to_use; 00085 SG_REF(env); 00086 } 00087 00094 void set_cache_parameters(char * fname, EVwCacheType type = C_NATIVE) 00095 { 00096 init_cache(fname, type); 00097 } 00098 00104 EVwCacheType get_cache_type() 00105 { 00106 return cache_type; 00107 } 00108 00114 void set_write_cache(bool wr_cache) 00115 { 00116 write_cache = wr_cache; 00117 if (wr_cache) 00118 init_cache(NULL); 00119 else 00120 if (cache_writer) 00121 SG_UNREF(cache_writer); 00122 } 00123 00129 bool get_write_cache() 00130 { 00131 return write_cache; 00132 } 00133 00139 void set_mm(float64_t label) 00140 { 00141 env->min_label = CMath::min(env->min_label, label); 00142 if (label != FLT_MAX) 00143 env->max_label = CMath::max(env->max_label, label); 00144 } 00145 00152 void noop_mm(float64_t label) { } 00153 00160 void set_minmax(float64_t label) 00161 { 00162 set_mm(label); 00163 } 00164 00173 int32_t read_features(CIOBuffer* buf, VwExample*& ex); 00174 00183 int32_t read_svmlight_features(CIOBuffer* buf, VwExample*& ae); 00184 00193 int32_t read_dense_features(CIOBuffer* buf, VwExample*& ae); 00194 00200 virtual const char* get_name() const { return "VwParser"; } 00201 00202 protected: 00209 void init_cache(char * fname, EVwCacheType type = C_NATIVE); 00210 00219 void feature_value(substring &s, v_array<substring>& name, float32_t &v); 00220 00229 void tokenize(char delim, substring s, v_array<substring> &ret); 00230 00241 inline char* safe_index(char *start, char v, char *max) 00242 { 00243 while (start != max && *start != v) 00244 start++; 00245 return start; 00246 } 00247 00248 public: 00250 hash_func_t hasher; 00251 00252 protected: 00254 CVwEnvironment* env; 00256 CVwCacheWriter* cache_writer; 00258 EVwCacheType cache_type; 00260 bool write_cache; 00261 00262 private: 00264 v_array<substring> channels; 00265 v_array<substring> words; 00266 v_array<substring> name; 00267 }; 00268 00269 } 00270 #endif // _VW_PARSER_H__