
Source Code for Module Bio.NetCatch

# Copyright 2002 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
NetCatch enables the user to scan a list of labelled urls and select
a subset to read into a file.

Functions:
get_urls_by_label
get_urls_by_index
get_urls_by_range
select_output_file
"""
import sys
import os
import urllib
from tempfile import mktemp
import sgmllib
import string
from Bio import File

def is_absolute_url( candidate ):
    """Return 1 if candidate has both a scheme and a host, 0 otherwise."""
    ( url_type, url ) = urllib.splittype( candidate )
    if( url_type == None ):
        return 0
    ( url_host, url ) = urllib.splithost( url )
    if( url_host == None ):
        return 0
    return 1
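# Illustrative behaviour of is_absolute_url; the urls below are
# hypothetical examples, not taken from the original source:
#
#     >>> is_absolute_url( 'http://www.expasy.org/enzyme/3.2.1.1' )
#     1
#     >>> is_absolute_url( 'enzyme/3.2.1.1' )
#     0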

"""
ExtractUrls.py

Scans a file in html format and builds a dictionary of urls
"""

class ExtractUrls( sgmllib.SGMLParser ):

    def __init__( self ):
        sgmllib.SGMLParser.__init__( self )
        self.reset()

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.urls = {}
        self._inlink = 0
        self._pending_url = ''
        self.text = ''

    def __str__( self ):
        output = ''
        for key in self.urls.keys():
            val = self.urls[ key ]
            output = output + '%s : %s\n' % ( key, val )
        return output

    def extract_urls(self, handle):
        self.feed(handle)
        return self.urls

    def feed(self, handle):
        """feed(self, handle)

        Feed in data for scanning.  handle is a file-like object
        containing html.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        text = uhandle.read()
        sgmllib.SGMLParser.feed( self, text )

    def handle_data(self, data):
        # accumulate link text only while inside an anchor tag
        if( self._inlink ):
            self.text = self.text + data

    def start_a( self, attrs ):
        self._inlink = 1
        for key, val in attrs:
            if key.lower() == 'href':
                self._pending_url = val

    def end_a( self ):
        # store the accumulated link text (spaces become underscores)
        # as the label for the pending href
        self._inlink = 0
        key = self.text
        self.text = ''
        if not key == '':
            key = key.replace( ' ', '_' )
            self.urls[ key ] = self._pending_url

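# Usage sketch for ExtractUrls; the html snippet and url are hypothetical:
#
#     >>> from StringIO import StringIO
#     >>> html = '<a href="http://example.org/pig.html">pig amylase</a>'
#     >>> ExtractUrls().extract_urls( StringIO( html ) )
#     {'pig_amylase': 'http://example.org/pig.html'}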

class Url:

    def __init__( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self.label = label
        self.url = url
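# Url is a simple label/url pair; the asserts reject relative urls and
# non-string labels (hypothetical values shown):
#
#     >>> Url( 'pig', 'http://example.org/pig.html' ).url
#     'http://example.org/pig.html'
#     >>> Url( 'pig', 'pig.html' )
#     Traceback (most recent call last):
#         ...
#     AssertionError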

class NetCatch:
    """
    Decorator for a dictionary of links.  Each link is indexed by its label.
    Allows the user to select links of interest and read each selection into
    its own file.  The filename is constructed by appending the label with an
    extension of html.

    Files can be selected by index, range or label.  The destination directory
    defaults to the current directory.  The user can specify another
    directory by passing a list of path segments to the constructor.

    net_catch = NetCatch()
    net_catch = NetCatch( [ 'amylase', 'species' ] )
    net_catch.get_all_urls()
    net_catch.get_urls_by_label( [ 'pig', 'dog', 'cow' ] )
    net_catch.get_urls_by_index( [ 1, 4, 6, 9 ] )
    net_catch.get_urls_by_range( 2, 5 )
    """

    def __init__( self, path_segments = [] ):
        self._urls = {}
        self._labels = []
        assert type( path_segments ) == type( [] )
        self.path_segments = path_segments
        self._build_path()

    def _build_path( self ):
        base_path = os.path.join( '' )
        for segment in self.path_segments:
            base_path = os.path.join( base_path, segment )
        self.base_path = base_path
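
    # For example (hypothetical segments): NetCatch( [ 'amylase', 'species' ] )
    # gives base_path == 'amylase/species' on POSIX systems, while the
    # default NetCatch() leaves base_path == '' (the current directory).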

    def __str__( self ):
        i = 0
        output = ''
        for label in self._labels:
            output = output + '%d %s: %s\n' % ( i, label, self._urls[ label ] )
            i = i + 1
        return output

    def import_dict( self, href_dict ):
        for ( key, val ) in href_dict.items():
            self.add_url( key, val )

    def add_url( self, label, url ):
        assert is_absolute_url( url )
        assert type( label ) == type( '' )
        self._labels.append( label )
        self._urls[ label ] = url

    def get_all_urls( self ):
        url_opener = urllib.URLopener()
        i = 0
        for label in self._labels:
            base_path = self.base_path
            name = '%s%d.htm' % ( label, i )
            full_path = os.path.join( base_path, name )
            out_handle = open( full_path, "wb" )
            i = i + 1
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            # close the response handle for this url before moving on
            url_handle.close()
            out_handle.close()
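
    # Note on the naming scheme above: assuming labels were added in the
    # order 'pig', 'dog' with base_path == '', get_all_urls() writes
    # 'pig0.htm' then 'dog1.htm', one numbered file per stored url.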

    def get_urls_by_label( self, labels ):
        url_opener = urllib.URLopener()
        for label in labels:
            name = '%s.htm' % ( label )
            full_path = os.path.join( self.base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()

    def get_urls_by_index( self, indices ):
        url_opener = urllib.URLopener()
        for index in indices:
            label = self._labels[ index ]
            name = '%s.htm' % label
            full_path = os.path.join( self.base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()

    def get_urls_by_range( self, low, hi ):
        url_opener = urllib.URLopener()
        for index in range( low, hi ):
            label = self._labels[ index ]
            name = '%s.htm' % label
            full_path = os.path.join( self.base_path, name )
            out_handle = open( full_path, "wb" )
            url = self._urls[ label ]
            url_handle = url_opener.open( url )
            contents = url_handle.read()
            out_handle.write( contents )
            url_handle.close()
            out_handle.close()
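
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal end-to-end run, assuming a reachable page of links; the url
# and the number of links are hypothetical:
#
#     import urllib
#     from Bio.NetCatch import ExtractUrls, NetCatch
#
#     handle = urllib.urlopen( 'http://example.org/amylase/species.html' )
#     urls = ExtractUrls().extract_urls( handle )
#
#     net_catch = NetCatch()
#     net_catch.import_dict( urls )        # register every label/url pair
#     net_catch.get_urls_by_range( 0, 2 )  # saves the first two as .htm files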