1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 """Parse the header of a PDB file."""
25
26 import re
27
28
37
39
40
41 references=[]
42 actref=""
43 for l in inl:
44 if re.search("\AREMARK 1",l):
45 if re.search("\AREMARK 1 REFERENCE",l):
46 if actref!="":
47 actref=re.sub("\s\s+"," ",actref)
48 if actref!=" ":
49 references.append(actref)
50 actref=""
51 else:
52 actref+=l[19:72].lower()
53
54 if actref!="":
55 actref=re.sub("\s\s+"," ",actref)
56 if actref!=" ":
57 references.append(actref)
58 return references
59
60
61
78
79
81 """Chops lines ending with ' 1CSA 14' and the like."""
82 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z","",line)
83
85 """Chops lines ending with ' 14-JUL-97 1CSA' and the like."""
86 return re.sub("\s\s\s\s+.*\Z","",line)
87
89 """Makes A Lowercase String With Capitals."""
90 l=line.lower()
91 s=""
92 i=0
93 nextCap=1
94 while i<len(l):
95 c=l[i]
96 if c>='a' and c<='z' and nextCap:
97 c=c.upper()
98 nextCap=0
99 elif c==' ' or c=='.' or c==',' or c==';' or c==':' or c=='\t' or\
100 c=='-' or c=='_':
101 nextCap=1
102 s+=c
103 i+=1
104 return s
105
def parse_pdb_header(infile):
    """Return the header lines of a PDB file as a dictionary.

    Lines are collected until the first ATOM, HETATM or MODEL record and
    then handed to ``_parse_pdb_header_list``.

    Dictionary keys are: head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author and compound.

    Args:
        infile: either a file name (string), which is opened and closed
            here, or an iterable of lines such as an already open file
            handle, which is left open for the caller.

    Returns:
        The dictionary built by ``_parse_pdb_header_list``.
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3

    do_close = isinstance(infile, string_types)
    handle = open(infile, 'r') if do_close else infile

    header = []
    try:
        for line in handle:
            # Compare the record name without its padding so the check does
            # not depend on the exact number of trailing blanks.
            if line[0:6].rstrip() in ("ATOM", "HETATM", "MODEL"):
                break
            header.append(line)
    finally:
        # Only close what was opened here, even if iteration fails.
        if do_close:
            handle.close()
    return _parse_pdb_header_list(header)
130
132
def _parse_pdb_header_list(header):
    """Build the header dictionary from a list of PDB header lines.

    Each line is dispatched on its record name (first six characters,
    stripped); the record text is taken from column 10 onwards.

    Args:
        header: list of header lines (strings).

    Returns:
        A dictionary with the keys ``name``, ``head``, ``deposition_date``,
        ``release_date``, ``structure_method``, ``resolution``,
        ``structure_reference``, ``journal_reference``, ``author``,
        ``compound`` and ``source``; ``keywords`` and ``journal`` are added
        only when KEYWDS / JRNL records are present.  ``resolution`` is a
        float, or ``None`` when a REMARK 2 RESOLUTION record could not be
        converted to a number.
    """
    pdb_dict = {
        'name': "",
        'head': '',
        'deposition_date': "1909-01-08",
        'release_date': "1909-01-08",
        'structure_method': "unknown",
        'resolution': 0.0,
        'structure_reference': "unknown",
        'journal_reference': "unknown",
        'author': "",
        'compound': {'1': {'misc': ''}},
        'source': {'1': {'misc': ''}},
    }

    pdb_dict['structure_reference'] = _get_references(header)
    pdb_dict['journal_reference'] = _get_journal(header)

    # COMPND and SOURCE each track their own current molecule id and the
    # key that continuation lines are appended to.
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    for raw_line in header:
        line = re.sub(r"[\s\n\r]*\Z", "", raw_line)
        key = line[:6].strip()
        tail = line[10:].strip()

        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            # 'name' is pre-seeded with "", so every TITLE line is appended
            # with a leading blank.
            pdb_dict['name'] += " " + name
        elif key == "HEADER":
            date_match = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if date_match is not None:
                pdb_dict['deposition_date'] = _format_date(
                    _nice_case(date_match.group()))
            pdb_dict['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            text = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            # Look for an E.C. number (four dot-separated integers).
            ec_match = re.search(r'\d+\.\d+\.\d+\.\d+', text)
            if ec_match:
                pdb_dict['compound'][comp_molid]['ec_number'] = ec_match.group()
                text = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", text)
            tok = text.split(":")
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == 'mol_id':
                    pdb_dict['compound'][cval] = {'misc': ''}
                    comp_molid = cval
                    last_comp_key = "misc"
                else:
                    pdb_dict['compound'][comp_molid][ckey] = cval
                    last_comp_key = ckey
            else:
                # Continuation line: extend the most recent key.
                pdb_dict['compound'][comp_molid][last_comp_key] += tok[0] + " "
        elif key == "SOURCE":
            text = re.sub(r";\s*\Z", "", _chop_end_codes(tail)).lower()
            tok = text.split(":")
            if len(tok) >= 2:
                ckey = tok[0]
                cval = re.sub(r"\A\s*", "", tok[1])
                if ckey == 'mol_id':
                    pdb_dict['source'][cval] = {'misc': ''}
                    src_molid = cval
                    last_src_key = "misc"
                else:
                    pdb_dict['source'][src_molid][ckey] = cval
                    last_src_key = ckey
            else:
                # Continuation line: extend the most recent key.
                pdb_dict['source'][src_molid][last_src_key] += tok[0] + " "
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdb_dict:
                pdb_dict['keywords'] += " " + kwd
            else:
                pdb_dict['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # Drop anything after a run of seven or more whitespace chars.
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            pdb_dict['structure_method'] = expd.lower()
        elif key == "REVDAT":
            date_match = re.search(r"\d\d-\w\w\w-\d\d", tail)
            if date_match is not None:
                pdb_dict['release_date'] = _format_date(
                    _nice_case(date_match.group()))
        elif key == "JRNL":
            if 'journal' in pdb_dict:
                pdb_dict['journal'] += tail
            else:
                pdb_dict['journal'] = tail
        elif key == "AUTHOR":
            # 'author' is pre-seeded with "", so appending is always valid.
            pdb_dict['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            # Whitespace-tolerant so the match does not depend on the exact
            # column padding between the fields.
            resolution_tag = r"REMARK\s+2\s+RESOLUTION."
            if re.search(resolution_tag, raw_line):
                value = _chop_end_codes(re.sub(resolution_tag, '', raw_line))
                value = re.sub(r"\s+ANGSTROM.*", "", value)
                try:
                    pdb_dict['resolution'] = float(value)
                except ValueError:
                    # Record present but not numeric.
                    pdb_dict['resolution'] = None
        # CAVEAT and all other record types are ignored.

    if pdb_dict['structure_method'] == 'unknown':
        resolution = pdb_dict['resolution']
        # resolution may be None (unparsable); None cannot be ordered
        # against a float on Python 3.
        if resolution is not None and resolution > 0.0:
            pdb_dict['structure_method'] = 'x-ray diffraction'
    return pdb_dict
255
if __name__ == '__main__':
    # Command-line use: parse the PDB file named by the first argument and
    # print every header entry, each preceded by a separator line.
    import sys

    filename = sys.argv[1]
    # The context manager closes the file even if parsing raises.
    with open(filename, 'r') as handle:
        data_dict = parse_pdb_header(handle)

    # Single-argument print(...) and dict.items() behave the same on
    # Python 2 and Python 3.
    for k, y in data_dict.items():
        print("-" * 40)
        print(k)
        print(y)
270