1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 """Access the PDB over the internet (for example to download structures)."""
37
38 import os
39 import shutil
40 import urllib
41
42
44 """
45 This class provides quick access to the structure lists on the
46 PDB server or its mirrors. The structure lists contain
47 four-letter PDB codes, indicating that structures are
48 new, have been modified or are obsolete. The lists are released
49 on a weekly basis.
50
51 It also provides a function to retrieve PDB files from the server.
52 To use it properly, prepare a directory /pdb or the like,
53 where PDB files are stored.
54
55 If You want to use this module from inside a proxy, add
56 the proxy variable to Your environment, e.g. in Unix
57 export HTTP_PROXY='http://realproxy.charite.de:888'
58 (This can also be added to ~/.bashrc)
59 """
60
61 PDB_REF="""
62 The Protein Data Bank: a computer-based archival file for macromolecular structures.
63 F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
64 J. Mol. Biol. 112 pp. 535-542 (1977)
65 http://www.pdb.org/.
66 """
67
68 alternative_download_url = "http://www.rcsb.org/pdb/files/"
69
70
71
def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None, obsolete_pdb=None):
    """Initialize the class with the default server or a custom one.

    @param server: base URL of the PDB server or mirror to download from
    @type server: string

    @param pdb: root directory of the local PDB file tree
        (default: the current working directory at the time of the call)
    @type pdb: string

    @param obsolete_pdb: root directory for obsolete entries
        (default: an 'obsolete' directory below the local PDB tree,
        created if it does not exist yet)
    @type obsolete_pdb: string
    """
    # remote pdb server
    self.pdb_server = server

    # local pdb file tree; the working directory is looked up here rather
    # than in the signature, where it would be frozen when the function
    # is defined and ignore any later os.chdir()
    if pdb is None:
        pdb = os.getcwd()
    self.local_pdb = pdb

    # local file tree for obsolete pdb files; only the default location
    # is created automatically
    if obsolete_pdb:
        self.obsolete_pdb = obsolete_pdb
    else:
        self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
        if not os.access(self.obsolete_pdb, os.F_OK):
            os.makedirs(self.obsolete_pdb)

    # switches normally set from the command-line options
    self.overwrite = 0
    self.flat_tree = 0
91
92
94 """Retrieves a list of pdb codes in the weekly pdb status file
95 from the given URL. Used by get_recent_files.
96
97 Typical contents of the list files parsed by this method is now
98 very simply one PDB name per line.
99 """
100 handle = urllib.urlopen(url)
101 answer = []
102 for line in handle:
103 pdb = line.strip()
104 assert len(pdb)==4
105 answer.append(pdb)
106 handle.close()
107 return answer
108
109
111 """Returns three lists of the newest weekly files (added,mod,obsolete).
112
113 Reads the directories with changed entries from the PDB server and
114 returns a tuple of three URL's to the files of new, modified and
115 obsolete entries from the most recent list. The directory with the
116 largest numerical name is used.
117 Returns None if something goes wrong.
118
119 Contents of the data/status dir (20031013 would be used);
120 drwxrwxr-x 2 1002 sysadmin 512 Oct 6 18:28 20031006
121 drwxrwxr-x 2 1002 sysadmin 512 Oct 14 02:14 20031013
122 -rw-r--r-- 1 1002 sysadmin 1327 Mar 12 2001 README
123
124
125 """
126 url = urllib.urlopen(self.pdb_server+'/pub/pdb/data/status/')
127
128
129
130
131 recent = filter(str.isdigit,
132 (x.split()[-1] for x in url.readlines())
133 )[-1]
134 path = self.pdb_server+'/pub/pdb/data/status/%s/'%(recent)
135
136 added = self.get_status_list(path+'added.pdb')
137 modified = self.get_status_list(path+'modified.pdb')
138 obsolete = self.get_status_list(path+'obsolete.pdb')
139 return [added,modified,obsolete]
140
142 """Retrieves a big file containing all the
143 PDB entries and some annotation to them.
144 Returns a list of PDB codes in the index file.
145 """
146 print "retrieving index file. Takes about 5 MB."
147 url = urllib.urlopen(self.pdb_server+'/pub/pdb/derived_data/index/entries.idx')
148
149
150
151 return [line[:4] for line in url.readlines()[2:] if len(line) > 4]
152
154 """Returns a list of all obsolete entries ever in the PDB.
155
156 Returns a list of all obsolete pdb codes that have ever been
157 in the PDB.
158
159 Gets and parses the file from the PDB server in the format
160 (the first pdb_code column is the one used). The file looks
161 like this:
162
163 LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
164 OBSLTE 31-JUL-94 116L 216L
165 ...
166 OBSLTE 29-JAN-96 1HFT 2HFT
167 OBSLTE 21-SEP-06 1HFV 2J5X
168 OBSLTE 21-NOV-03 1HG6
169 OBSLTE 18-JUL-84 1HHB 2HHB 3HHB
170 OBSLTE 08-NOV-96 1HID 2HID
171 OBSLTE 01-APR-97 1HIU 2HIU
172 OBSLTE 14-JAN-04 1HKE 1UUZ
173 ...
174
175 """
176 handle = urllib.urlopen(self.pdb_server+'/pub/pdb/data/status/obsolete.dat')
177
178
179 obsolete = []
180 for line in handle:
181 if not line.startswith("OBSLTE ") : continue
182 pdb = line.split()[2]
183 assert len(pdb)==4
184 obsolete.append(pdb)
185 handle.close()
186 return obsolete
187
188
189
def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                      uncompress="gunzip", pdir=None):
    """Retrieves a PDB structure file from the PDB server and
    stores it in a local file tree.

    The compressed file is downloaded into the target directory and the
    'uncompress' command is then run on it. If the uncompressed file is
    already present and self.overwrite is not set, nothing is downloaded.
    The name of the uncompressed file is returned in both cases; it is
    not checked whether uncompressing actually succeeded.

    @param pdb_code: PDB code of the entry (converted to lower case)
    @type pdb_code: string

    @param obsolete: if true, fetch from the server's obsolete tree and
        store below self.obsolete_pdb instead of self.local_pdb
    @type obsolete: int

    @param compression: suffix of the compressed file on the server,
        '.Z' or '.gz'
    @type compression: string

    @param uncompress: command called to uncompress the file; it may
        carry whitespace-separated options
    @type uncompress: string

    @param pdir: put the file in this directory (default: create a PDB-style directory tree)
    @type pdir: string

    @return: filename
    @rtype: string
    """
    import subprocess

    code = pdb_code.lower()
    archive_name = "pdb%s.ent%s" % (code, compression)

    # the server groups entries by the two middle characters of the code
    if obsolete:
        remote_dir = '/pub/pdb/data/structures/obsolete/pdb/'
    else:
        remote_dir = '/pub/pdb/data/structures/divided/pdb/'
    url = "%s%s%s/%s" % (self.pdb_server, remote_dir, code[1:3],
                         archive_name)

    # where does the final PDB file get saved?
    if pdir is not None:
        path = pdir
    else:
        if obsolete:
            path = self.obsolete_pdb
        else:
            path = self.local_pdb
        if not self.flat_tree:
            # mirror the server layout locally
            path = os.path.join(path, code[1:3])

    if not os.access(path, os.F_OK):
        os.makedirs(path)

    filename = os.path.join(path, archive_name)
    # the file name after the compression suffix has been stripped
    final_file = os.path.join(path, "pdb%s.ent" % code)

    # skip download if the file already exists
    if not self.overwrite and os.path.exists(final_file):
        print("file exists, not retrieved %s" % final_file)
        return final_file

    print('retrieving %s' % url)
    remote = urllib.urlopen(url)
    try:
        data = remote.read()
    finally:
        remote.close()

    # close the file explicitly so that everything is on disk before the
    # external command reads it
    archive = open(filename, 'wb')
    try:
        archive.write(data)
    finally:
        archive.close()

    # pass the file name as its own argument instead of through a shell
    # string, so directories containing spaces or shell characters work
    try:
        subprocess.call(uncompress.split() + [filename])
    except OSError:
        print("could not run %s on %s" % (uncompress, filename))

    return final_file
255
256
258 """
259 I guess this is the 'most wanted' function from this module.
260 It gets the weekly lists of new and modified pdb entries and
261 automatically downloads the according PDB files.
262 You can call this module as a weekly cronjob.
263 """
264 assert os.path.isdir(self.local_pdb)
265 assert os.path.isdir(self.obsolete_pdb)
266
267 new, modified, obsolete = self.get_recent_changes()
268
269 for pdb_code in new+modified:
270 try:
271
272 self.retrieve_pdb_file(pdb_code)
273 except Exception:
274 print 'error %s\n' % pdb_code
275
276
277
278
279 for pdb_code in obsolete:
280 if self.flat_tree:
281 old_file = os.path.join(self.local_pdb,
282 'pdb%s.ent' % pdb_code)
283 new_dir = self.obsolete_pdb
284 else:
285 old_file = os.path.join(self.local_pdb, pdb_code[1:3],
286 'pdb%s.ent' % pdb_code)
287 new_dir = os.path.join(self.obsolete_pdb, pdb_code[1:3])
288 new_file = os.path.join(new_dir, 'pdb%s.ent' % pdb_code)
289 if os.path.isfile(old_file):
290 if not os.path.isdir(new_dir):
291 os.mkdir(new_dir)
292 try:
293 shutil.move(old_file, new_file)
294 except Exception:
295 print "Could not move %s to obsolete folder" % old_file
296 elif os.path.isfile(new_file):
297 print "Obsolete file %s already moved" % old_file
298 else:
299 print "Obsolete file %s is missing" % old_file
300
301
303 """Retrieve all PDB entries not present in the local PDB copy.
304
305 Writes a list file containing all PDB codes (optional, if listfile is
306 given).
307 """
308 entries = self.get_all_entries()
309 for pdb_code in entries:
310 self.retrieve_pdb_file(pdb_code)
311
312 if listfile:
313 outfile = open(listfile, 'w')
314 outfile.writelines((x+'\n' for x in entries))
315 outfile.close()
316
318 """Retrieve all obsolete PDB entries not present in the local obsolete
319 PDB copy.
320
321 Writes a list file containing all PDB codes (optional, if listfile is
322 given).
323 """
324 entries = self.get_all_obsolete()
325 for pdb_code in entries:
326 self.retrieve_pdb_file(pdb_code, obsolete=1)
327
328
329 if listfile:
330 outfile = open(listfile, 'w')
331 outfile.writelines((x+'\n' for x in entries))
332 outfile.close()
333
334
335
336
337
339 """Retrieves a (big) file containing all the sequences of PDB entries
340 and writes it to a file.
341 """
342 print "retrieving sequence file. Takes about 15 MB."
343 url = urllib.urlopen(self.pdb_server +
344 '/pub/pdb/derived_data/pdb_seqres.txt')
345 lines = url.readlines()
346 outfile = open(savefile, 'w')
347 outfile.writelines(lines)
348 outfile.close()
349
350
if __name__ == '__main__':
    # Command-line front end: the first argument selects the action, the
    # second the local PDB directory, anything after that is an option.

    import sys

    doc = """PDBList.py
(c) Kristian Rother 2003, Contributed to BioPython

Usage:
PDBList.py update <pdb_path> [options] - write weekly PDB updates to
local pdb tree.
PDBList.py all <pdb_path> [options] - write all PDB entries to
local pdb tree.
PDBList.py obsol <pdb_path> [options] - write all obsolete PDB
entries to local pdb tree.
PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

Options:
-d A single directory will be used as <pdb_path>, not a tree.
-o Overwrite existing structure files.
"""
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # unknown options are ignored
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no path given: work in the current directory, without subfolders
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the method's only argument is the
            # name of an optional list file, so the PDB directory must not
            # be passed here (it would be opened for writing as a file)
            pl.download_obsolete_entries(listfile=None)

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)
403