Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # Version 2.0 
  8  # 
  9  # (c) 2003 Kristian Rother 
 10  # This work was supported by the German Ministry of Education 
 11  # and Research (BMBF). Project http://www.bcbio.de 
 12  #  
 13  # Contact the author 
 14  #    homepage : http://www.rubor.de/bioinf 
 15  #    email    : krother@genesilico.pl 
 16  # 
 17  # 
 18  # This Code is released under the conditions of the Biopython license. 
 19  # It may be distributed freely with respect to the original author. 
 20  # Any maintainer of the BioPython code may change this notice 
 21  # when appropriate. 
 22  # 
 23  # Last modified on Fri, Oct 24th 2006, Warszawa 
 24  # 
 25  # Removed 'write' options from retrieve_pdb_file method: it is not used. 
 26  # Also added a 'dir' options (pdb file is put in this directory if given), 
 27  # and an 'exist' option (test if the file is already there). This method 
 28  # now returns the name of the downloaded uncompressed file. 
 29  # 
 30  # -Thomas, 1/06/04 
 31  # 
 32  # 
 33  # Including bugfixes from Sunjoong Lee (9/2006) 
 34  # 
 35   
 36  """Access the PDB over the internet (for example to download structures).""" 
 37   
 38  import os 
 39  import shutil 
 40  import urllib 
 41   
 42   
class PDBList:
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If You want to use this module from inside a proxy, add
    the proxy variable to Your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.
    # (above URL verified with a XXXX.pdb appended on 2 Sept 2008)

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree (default: the working
                    directory at the time of the call)
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries
                             (default: an 'obsolete' directory below pdb,
                             created here if it does not exist yet)
        @type obsolete_pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; resolved per call, because a default of
        # os.getcwd() in the signature would be frozen at import time
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    @staticmethod
    def _open_url(url):
        """Open url with the urlopen of the running interpreter."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            return urllib.urlopen(url)  # Python 2
        return urlopen(url)

    @staticmethod
    def _as_text(data):
        """Return data as a native string.

        Native strings are passed through untouched; anything else (the
        bytes that Python 3 reads from a URL) is decoded as ASCII, with
        undecodable bytes replaced.
        """
        if isinstance(data, str):
            return data
        return data.decode('ascii', 'replace')

    @staticmethod
    def _write_code_list(entries, listfile):
        """Write the given PDB codes to listfile, one code per line."""
        outfile = open(listfile, 'w')
        try:
            outfile.writelines([code + '\n' for code in entries])
        finally:
            outfile.close()

    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line. Blank lines are skipped.

        @return: the PDB codes, in file order
        @rtype: list of strings
        """
        answer = []
        handle = self._open_url(url)
        try:
            for line in handle:
                pdb = self._as_text(line).strip()
                if not pdb:
                    continue
                # sanity check against mis-reading the data
                assert len(pdb) == 4, "unexpected PDB code %r" % pdb
                answer.append(pdb)
        finally:
            handle.close()
        return answer

    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directories with changed entries from the PDB server and
        returns a list holding three lists of PDB codes: the new, the
        modified and the obsolete entries of the most recent status
        directory. The directory with the largest numerical name is used.
        Raises IndexError if the listing contains no such directory.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        handle = self._open_url(status_url)
        try:
            rows = handle.readlines()
        finally:
            handle.close()

        # the last field of each listing row is the entry name; the dated
        # directories are the names consisting of digits only
        dated = []
        for row in rows:
            fields = self._as_text(row).split()
            if fields and fields[-1].isdigit():
                dated.append(fields[-1])
        recent = sorted(dated, key=int)[-1]

        path = status_url + '%s/' % recent
        # retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieves a big file containing all the
        PDB entries and some annotation to them.
        Returns a list of PDB codes in the index file.
        """
        print("retrieving index file. Takes about 5 MB.")
        handle = self._open_url(
            self.pdb_server + '/pub/pdb/derived_data/index/entries.idx')
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        # skip the two header lines and extract the four-letter codes
        return [self._as_text(line)[:4] for line in lines[2:]
                if len(line) > 4]

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...
        """
        obsolete = []
        handle = self._open_url(
            self.pdb_server + '/pub/pdb/data/status/obsolete.dat')
        try:
            for line in handle:
                line = self._as_text(line)
                if not line.startswith("OBSLTE "):
                    continue
                pdb = line.split()[2]
                # sanity check against mis-reading the data
                assert len(pdb) == 4, "unexpected PDB code %r" % pdb
                obsolete.append(pdb)
        finally:
            handle.close()
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                          uncompress="gunzip", pdir=None):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The name of the uncompressed local file is returned.
        If obsolete is 1, the file will be by default saved in a special
        file tree. If the file is already there and the overwrite
        attribute is not set, nothing is downloaded.

        @param pdb_code: four-letter PDB code (case-insensitive)
        @type pdb_code: string

        @param compression: suffix of the remote file, '.Z' or '.gz'
        @type compression: string

        @param uncompress: command (optionally with arguments) called to
                           uncompress the file; the name of the downloaded
                           file is passed to it as the last argument
        @type uncompress: string

        @param pdir: put the file in this directory (default: create a
                     PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        import shlex
        import subprocess

        # get the structure
        code = pdb_code.lower()
        archive_name = "pdb%s.ent%s" % (code, compression)
        if obsolete:
            remote_dir = '/pub/pdb/data/structures/obsolete/pdb/'
        else:
            remote_dir = '/pub/pdb/data/structures/divided/pdb/'
        url = self.pdb_server + remote_dir + '%s/%s' % (code[1:3],
                                                        archive_name)

        # in which dir to put the pdb file?
        if pdir is not None:
            # Put in specified directory
            path = pdir
        else:
            if obsolete:
                base = self.obsolete_pdb
            else:
                base = self.local_pdb
            if self.flat_tree:
                path = base
            else:
                # Put in PDB style directory tree
                path = os.path.join(base, code[1:3])

        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_name)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # check whether the file exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print("file exists, not retrieved %s" % final_file)
                return final_file

        # Retrieve the file
        print('retrieving %s' % url)
        handle = self._open_url(url)
        try:
            data = handle.read()
        finally:
            handle.close()
        outfile = open(filename, 'wb')
        try:
            outfile.write(data)
        finally:
            outfile.close()

        # Only reached with a stale copy when overwriting was requested;
        # uncompress tools such as gunzip refuse to replace an existing
        # file, so make room for the fresh one.
        if os.path.exists(final_file):
            os.remove(final_file)

        # uncompress the file; an argument list (no shell) keeps paths
        # with spaces or shell metacharacters intact
        try:
            status = subprocess.call(shlex.split(uncompress) + [filename])
        except OSError:
            # the uncompress command could not be started
            status = -1
        if status != 0:
            print("could not uncompress %s" % filename)

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        Files of entries that became obsolete are moved from the local
        tree to the obsolete tree.
        You can call this module as a weekly cronjob.
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # move the obsolete files to a special folder
        for pdb_code in obsolete:
            # local files are stored under the lower-cased code
            # (see retrieve_pdb_file)
            code = pdb_code.lower()
            basename = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, basename)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], basename)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, basename)

            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.mkdir(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)

        # Write the list
        if listfile:
            self._write_code_list(entries, listfile)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present in the local obsolete
        PDB copy.

        Writes a list file containing all PDB codes (optional, if listfile is
        given).
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # Write the list
        if listfile:
            self._write_code_list(entries, listfile)

    # this is actually easter egg code not used by any of the methods
    # maybe someone will find it useful.
    #
    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieves a (big) file containing all the sequences of PDB entries
        and writes it to a file.

        @param savefile: name of the local file to write
        @type savefile: string
        """
        print("retrieving sequence file. Takes about 15 MB.")
        handle = self._open_url(
            self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt')
        try:
            lines = handle.readlines()
        finally:
            handle.close()
        outfile = open(savefile, 'w')
        try:
            outfile.writelines([self._as_text(line) for line in lines])
        finally:
            outfile.close()
if __name__ == '__main__':
    # Command-line front end: prints the usage text, builds a PDBList for
    # the requested local path and dispatches on the first argument.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # everything after the path is treated as an option flag
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no path given: work in the current directory, without a tree
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the files go below the obsolete
            # tree of pl. The only parameter of download_obsolete_entries
            # is the name of an optional list file, so pdb_path (a
            # directory) must not be passed to it.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)