1
2
3
4
5
6
7
8
9 """Bio.SeqIO support for the "uniprot-xml" file format.
10
11 See also:
12
13 http://www.uniprot.org
14
15 The UniProt XML format essentially replaces the old plain text file format
16 originally introduced by SwissProt ("swiss" format in Bio.SeqIO).
17 """
18 import sys
19
20 from Bio import Seq
21 from Bio import SeqFeature
22 from Bio import Alphabet
23 from Bio.SeqRecord import SeqRecord
24 try:
25 from cStringIO import StringIO
26 except ImportError:
27 from StringIO import StringIO
28 import warnings
29 try:
30 if (3,0,0) <= sys.version_info[:3] <= (3,1,2):
31
32 from xml.etree import ElementTree as ElementTree
33 else:
34 from xml.etree import cElementTree as ElementTree
35 except ImportError:
36 try:
37 from xml.etree import ElementTree as ElementTree
38 except ImportError:
39
40 try:
41 from lxml import etree as ElementTree
42 except ImportError:
43 try:
44 import cElementTree as ElementTree
45 except ImportError:
46 try:
47 from elementtree import ElementTree
48 except ImportError:
49 ElementTree = None
50
51
52
53
54
55
56
57
# XML namespace of UniProt documents in ElementTree's "{uri}" prefix form;
# it is prepended to local tag names when comparing against element.tag.
NS = "{http://uniprot.org/uniprot}"
# Template for Reference.journal, filled in when a citation provides the
# journal name, volume, first/last page and publication date.
REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)"
60
62 '''Generator Function
63 parses an XML entry at a time from any UniProt XML file
64 returns a SeqRecord for each iteration
65
66 This generator can be used in Bio.SeqIO
67
68 return_raw_comments = True --> comment fields are returned as complete xml to allow further processing
69 skip_parsing_errors = True --> if parsing errors are found, skip to next entry
70 '''
71 if isinstance(alphabet, Alphabet.NucleotideAlphabet):
72 raise ValueError, "Wrong alphabet %r" % alphabet
73 if isinstance(alphabet, Alphabet.Gapped):
74 if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
75 raise ValueError, "Wrong alphabet %r" % alphabet
76
77 if not hasattr(handle, "read"):
78 if type(handle)==type(''):
79 handle=StringIO(handle)
80 else:
81 raise Exception('An XML-containing handler or an XML string must be passed')
82
83 if ElementTree is None:
84 from Bio import MissingExternalDependencyError
85 raise MissingExternalDependencyError(
86 "No ElementTree module was found. "
87 "Use Python 2.5+, lxml or elementtree if you "
88 "want to use Bio.SeqIO.UniprotIO.")
89
90 for event, elem in ElementTree.iterparse(handle, events=("start", "end")):
91 if event=="end" and elem.tag == NS + "entry":
92 yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
93 elem.clear()
94
'''Parse a UniProt XML entry into a SeqRecord.

Set return_raw_comments=True to get back the complete comment fields in XML format.
The alphabet can be changed if needed; the default is the protein alphabet.
'''
101 self.entry=elem
102 self.alphabet=alphabet
103 self.return_raw_comments=return_raw_comments
104
114
def _parse_name(element):
    '''Use the entry name as the record name and add it to dbxrefs.'''
    entry_name = element.text
    self.ParsedSeqRecord.name = entry_name
    # The cross-reference is prefixed with the dataset name held in self.dbname.
    self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + entry_name)
120
def _parse_accession(element):
    '''Store an accession in the annotations and add it to dbxrefs.'''
    accession = element.text
    append_to_annotations('accessions', accession)
    # The cross-reference is prefixed with the dataset name held in self.dbname.
    self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + accession)
125
def _parse_protein(element):
    '''Parse protein names.

    Every child of a recommendedName / alternativeName element becomes an
    annotation keyed '<group>_<tag>'; the first fullName met also becomes
    the record description.
    '''
    name_groups = (NS + 'recommendedName', NS + 'alternativeName')
    description_found = False
    for group in element:
        if group.tag not in name_groups:
            # 'component' and 'domain' children (and anything else) are ignored.
            continue
        group_label = group.tag.replace(NS, '')
        for name_el in group:
            ann_key = '%s_%s' % (group_label, name_el.tag.replace(NS, ''))
            append_to_annotations(ann_key, name_el.text)
            if not description_found and name_el.tag == NS + 'fullName':
                self.ParsedSeqRecord.description = name_el.text
                description_found = True
142
def _parse_gene(element):
    '''Store gene names as 'gene_<tag>_<type>' annotations.

    The primary name is stored as a plain value, every other name type is
    appended to a list. Children without a 'type' attribute are skipped.
    '''
    for gene_name in element:
        if 'type' not in gene_name.attrib:
            continue
        name_type = gene_name.attrib['type']
        ann_key = 'gene_%s_%s' % (gene_name.tag.replace(NS, ''), name_type)
        if name_type == 'primary':
            self.ParsedSeqRecord.annotations[ann_key] = gene_name.text
        else:
            append_to_annotations(ann_key, gene_name.text)
151
def _parse_geneLocation(element):
    '''Store the 'type' attribute of a geneLocation element.'''
    location_type = element.attrib['type']
    append_to_annotations('geneLocation', location_type)
154
def _parse_organism(element):
    '''Parse organism names, cross-references and lineage.

    The 'organism' annotation is 'scientific (common)' when both names are
    present, otherwise whichever one exists, otherwise ''. Names of any
    other type go to 'organism_name', taxa to 'taxonomy'.
    '''
    names = {'scientific': '', 'common': ''}
    for child in element:
        if child.tag == NS + 'name':
            # Names without text are ignored altogether.
            if child.text:
                name_type = child.attrib['type']
                if name_type in names:
                    names[name_type] = child.text
                else:
                    append_to_annotations("organism_name", child.text)
        elif child.tag == NS + 'dbReference':
            self.ParsedSeqRecord.dbxrefs.append(child.attrib['type'] + ':' + child.attrib['id'])
        elif child.tag == NS + 'lineage':
            for taxon in child:
                if taxon.tag == NS + 'taxon':
                    append_to_annotations('taxonomy', taxon.text)

    scientific = names['scientific']
    common = names['common']
    if scientific and common:
        organism = '%s (%s)' % (scientific, common)
    else:
        organism = scientific or common
    self.ParsedSeqRecord.annotations['organism'] = organism
180
def _parse_organismHost(element):
    '''Store the name children of an organismHost as 'organism_host'.'''
    host_name_tag = NS + 'name'
    for child in element:
        if child.tag != host_name_tag:
            continue
        append_to_annotations("organism_host", child.text)
185
def _parse_keyword(element):
    '''Store the text of a keyword element under 'keywords'.'''
    keyword = element.text
    append_to_annotations('keywords', keyword)
188
def _parse_comment(element):
    '''Parse a comment element into 'comment_*' annotations.

    Comment fields are very heterogeneous: each type has its own
    (frequently changing) schema. Storing all the contained data would need
    more complex data structures, such as nested dictionaries. This is left
    to the end user, by optionally setting:

    return_raw_comments=True

    in which case the original XML is returned in the annotation fields
    (key 'comment_<type>_xml').

    Available comment types as of December 2009:
        "allergen"
        "alternative products"
        "biotechnology"
        "biophysicochemical properties"
        "catalytic activity"
        "caution"
        "cofactor"
        "developmental stage"
        "disease"
        "domain"
        "disruption phenotype"
        "enzyme regulation"
        "function"
        "induction"
        "miscellaneous"
        "pathway"
        "pharmaceutical"
        "polymorphism"
        "PTM"
        "RNA editing"
        "similarity"
        "subcellular location"
        "sequence caution"
        "subunit"
        "tissue specificity"
        "toxic dose"
        "online information"
        "mass spectrometry"
        "interaction"
    '''
    # Types whose payload is just the text of their <text> descendants.
    simple_comments = ["allergen",
                       "biotechnology",
                       "biophysicochemical properties",
                       "catalytic activity",
                       "caution",
                       "cofactor",
                       "developmental stage",
                       "disease",
                       "domain",
                       "disruption phenotype",
                       "enzyme regulation",
                       "function",
                       "induction",
                       "miscellaneous",
                       "pathway",
                       "pharmaceutical",
                       "polymorphism",
                       "PTM",
                       "RNA editing",
                       "similarity",
                       "subunit",
                       "tissue specificity",
                       "toxic dose",
                       ]

    comment_type = element.attrib['type']
    # Annotation keys use the type with its blanks removed.
    compact_type = comment_type.replace(' ', '')

    if comment_type in simple_comments:
        ann_key = 'comment_%s' % compact_type
        for text_element in element.getiterator(NS + 'text'):
            if text_element.text:
                append_to_annotations(ann_key, text_element.text)
    elif comment_type == 'subcellular location':
        for subloc_element in element.getiterator(NS + 'subcellularLocation'):
            for el in subloc_element:
                if el.text:
                    ann_key = 'comment_%s_%s' % (compact_type, el.tag.replace(NS, ''))
                    append_to_annotations(ann_key, el.text)
    elif comment_type == 'interaction':
        ann_key = 'comment_%s_intactId' % comment_type
        for interact_element in element.getiterator(NS + 'interactant'):
            append_to_annotations(ann_key, interact_element.attrib['intactId'])
    elif comment_type == 'alternative products':
        ann_key = 'comment_%s_isoform' % compact_type
        for alt_element in element.getiterator(NS + 'isoform'):
            for id_element in alt_element.getiterator(NS + 'id'):
                append_to_annotations(ann_key, id_element.text)
    elif comment_type == 'mass spectrometry':
        ann_key = 'comment_%s' % compact_type
        start = end = 0
        for loc_element in element.getiterator(NS + 'location'):
            # getiterator() may hand back an iterator rather than a list,
            # so materialise the matches before indexing into them.
            pos_els = list(loc_element.getiterator(NS + 'position'))
            try:
                if pos_els:
                    end = int(pos_els[0].attrib['position'])
                    start = end - 1
                else:
                    begin_els = list(loc_element.getiterator(NS + 'begin'))
                    end_els = list(loc_element.getiterator(NS + 'end'))
                    start = int(begin_els[0].attrib['position']) - 1
                    end = int(end_els[0].attrib['position'])
            except (IndexError, KeyError, ValueError):
                # Missing or non-numeric positions: keep whatever was
                # parsed so far (best effort, as before).
                pass
        mass = element.attrib['mass']
        # The method used to be read from the 'mass' attribute by mistake.
        # A comment without a 'method' attribute yields an empty method
        # instead of raising.
        method = element.attrib.get('method', '')
        if start == end == 0:
            append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
        else:
            append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
    elif comment_type == 'sequence caution':
        # Not parsed; only available through return_raw_comments.
        pass
    elif comment_type == 'online information':
        ann_key = 'comment_%s' % compact_type
        for link_element in element.getiterator(NS + 'link'):
            for _nested_link in link_element.getiterator(NS + 'link'):
                append_to_annotations(ann_key, '%s@%s' % (element.attrib['name'], link_element.attrib['uri']))

    # Return the raw XML of the comment as well, if requested.
    if self.return_raw_comments:
        ann_key = 'comment_%s_xml' % compact_type
        append_to_annotations(ann_key, ElementTree.tostring(element))
309
310
def _parse_dbReference(element):
    '''Add a database cross-reference to dbxrefs.

    PDB references additionally have their property children decoded, e.g.

    <dbReference type="PDB" key="11" id="2GEZ">
      <property value="X-ray" type="method"/>
      <property value="2.60 A" type="resolution"/>
      <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
    </dbReference>

    A SeqFeature is assembled for every chain range, but nothing in this
    function attaches it to the record; other properties are not stored.
    '''
    db_type = element.attrib['type']
    db_id = element.attrib['id']
    self.ParsedSeqRecord.dbxrefs.append(db_type + ':' + db_id)
    if db_type != 'PDB':
        return

    method = ""
    resolution = ""
    for prop in element:
        if prop.tag != NS + 'property':
            continue
        prop_kind = prop.attrib['type']
        if prop_kind == 'method':
            method = prop.attrib['value']
        elif prop_kind == 'resolution':
            resolution = prop.attrib['value']
        elif prop_kind == 'chains':
            for chain_spec in prop.attrib['value'].split(','):
                pair = chain_spec.strip().split('=')
                if pair[1] == '-':
                    continue
                bounds = pair[1].split('-')
                feature = SeqFeature.SeqFeature()
                feature.type = db_type
                feature.qualifiers['name'] = db_id
                feature.qualifiers['method'] = method
                feature.qualifiers['resolution'] = resolution
                feature.qualifiers['chains'] = pair[0].split('/')
                # Convert the 1-based inclusive range to 0-based half-open.
                start = int(bounds[0]) - 1
                end = int(bounds[1])
                feature.location = SeqFeature.FeatureLocation(start, end)
349
def _parse_reference(element):
    '''Parse a reference element into a SeqFeature.Reference.

    The citation supplies title, authors, journal data and PubMed/MEDLINE
    identifiers (its dbReference children are also added to dbxrefs);
    scope and source/tissue children are folded into the comment. The
    finished Reference is appended to the 'references' annotation.
    '''
    reference = SeqFeature.Reference()
    authors = []
    scopes = []
    tissues = []
    journal_name = ''
    pub_type = ''
    pub_date = ''
    # Defined up front so they exist even if no citation child is seen.
    j_volume = j_first = j_last = ''
    for ref_element in element:
        if ref_element.tag == NS + 'citation':
            citation = ref_element.attrib
            pub_type = citation['type']
            if pub_type == 'submission':
                pub_type += ' to the ' + citation['db']
            if 'name' in citation:
                journal_name = citation['name']
            pub_date = citation.get('date', '')
            j_volume = citation.get('volume', '')
            j_first = citation.get('first', '')
            j_last = citation.get('last', '')
            for cit_element in ref_element:
                if cit_element.tag == NS + 'title':
                    reference.title = cit_element.text
                elif cit_element.tag == NS + 'authorList':
                    for person_element in cit_element:
                        authors.append(person_element.attrib['name'])
                elif cit_element.tag == NS + 'dbReference':
                    xref_type = cit_element.attrib['type']
                    xref_id = cit_element.attrib['id']
                    self.ParsedSeqRecord.dbxrefs.append(xref_type + ':' + xref_id)
                    # Both tests look at the cross-reference itself; the
                    # MEDLINE one used to inspect the citation's type and
                    # therefore never matched.
                    if xref_type == 'PubMed':
                        reference.pubmed_id = xref_id
                    elif xref_type == 'MEDLINE':
                        reference.medline_id = xref_id
        elif ref_element.tag == NS + 'scope':
            scopes.append(ref_element.text)
        elif ref_element.tag == NS + 'source':
            for source_element in ref_element:
                if source_element.tag == NS + 'tissue':
                    tissues.append(source_element.text)

    if scopes:
        scopes_str = 'Scope: ' + ', '.join(scopes)
    else:
        scopes_str = ''
    if tissues:
        tissues_str = 'Tissue: ' + ', '.join(tissues)
    else:
        tissues_str = ''

    reference.location = []
    reference.authors = ', '.join(authors)
    if journal_name:
        # Use the full journal template only when every part is available.
        if pub_date and j_volume and j_first and j_last:
            reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
        else:
            reference.journal = journal_name
    reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
    append_to_annotations('references', reference)
418
def _parse_position(element, offset=0):
    '''Turn a position-like element into a SeqFeature position object.

    The 'position' attribute (plus offset) gives the coordinate, or None
    when the attribute is absent. The 'status' attribute selects the
    class: absent -> ExactPosition, 'unknown' -> UnknownPosition,
    'greater than' -> AfterPosition, 'less than' -> BeforePosition,
    'uncertain' -> UncertainPosition. Any other status raises
    NotImplementedError.
    '''
    raw_position = element.attrib.get('position')
    if raw_position is None:
        position = None
    else:
        position = int(raw_position) + offset

    status = element.attrib.get('status', '')
    if status == 'unknown':
        assert position is None
        return SeqFeature.UnknownPosition()
    if not status:
        return SeqFeature.ExactPosition(position)

    class_names = {
        'greater than': 'AfterPosition',
        'less than': 'BeforePosition',
        'uncertain': 'UncertainPosition',
    }
    if status not in class_names:
        raise NotImplementedError("Position status %r" % status)
    return getattr(SeqFeature, class_names[status])(position)
438
def _parse_feature(element):
    '''Convert a feature element into a SeqFeature appended to the record.

    All attributes are copied into the qualifiers; 'type' (default '') and
    'id' (when present) are also set on the feature. A location child
    yields the FeatureLocation, built either from a single position or
    from a begin/end pair; any other child is stored as a qualifier keyed
    by its tag without the namespace.
    '''
    feature = SeqFeature.SeqFeature()
    for k, v in element.attrib.items():
        feature.qualifiers[k] = v
    feature.type = element.attrib.get('type', '')
    if 'id' in element.attrib:
        feature.id = element.attrib['id']
    for feature_element in element:
        if feature_element.tag == NS + 'location':
            position_elements = feature_element.findall(NS + 'position')
            if position_elements:
                # A single residue: the start is shifted to 0-based, the
                # end keeps the 1-based value.
                single = position_elements[0]
                start_position = _parse_position(single, -1)
                end_position = _parse_position(single)
            else:
                begin_element = feature_element.findall(NS + 'begin')[0]
                start_position = _parse_position(begin_element, -1)
                end_element = feature_element.findall(NS + 'end')[0]
                end_position = _parse_position(end_element)
            feature.location = SeqFeature.FeatureLocation(start_position, end_position)
        else:
            try:
                feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
            except AttributeError:
                # NOTE(review): presumably guards against children whose
                # tag is not a string (e.g. comment nodes under lxml);
                # such children are skipped - confirm.
                pass
    self.ParsedSeqRecord.features.append(feature)
470
def _parse_proteinExistence(element):
    '''Store the 'type' attribute of a proteinExistence element.'''
    existence_type = element.attrib['type']
    append_to_annotations('proteinExistence', existence_type)
473
def _parse_evidence(element):
    '''Store every attribute of an evidence element under its own name.'''
    for attr_name, attr_value in element.attrib.items():
        append_to_annotations(attr_name, attr_value)
478
def _parse_sequence(element):
    '''Store the sequence attributes and build the record's Seq.

    Attributes become 'sequence_<name>' annotations, with length, mass and
    version converted to int. Whitespace is stripped from the sequence
    text before it is wrapped in a Seq with the parser's alphabet.
    '''
    integer_attributes = ("length", "mass", "version")
    annotations = self.ParsedSeqRecord.annotations
    for attr_name, attr_value in element.attrib.items():
        ann_key = 'sequence_%s' % attr_name
        if attr_name in integer_attributes:
            annotations[ann_key] = int(attr_value)
        else:
            annotations[ann_key] = attr_value
    residues = ''.join(element.text.split())
    self.ParsedSeqRecord.seq = Seq.Seq(residues, self.alphabet)
487
488
489 '''Initialize SeqRecord '''
490 self.ParsedSeqRecord=SeqRecord('', id='')
491
492 '''Entry attribs parsing '''
493 if self.entry.attrib.has_key('dataset'):
494 self.dbname=self.entry.attrib['dataset']
495 else:
496 self.dbname='UnknownDataset'
497 '''add attribs to annotations '''
498 for k, v in self.entry.attrib.items():
499 if k in ("version"):
500 '''original'''
501
502 '''to cope with swissProt plain text parser. this can cause errors
503 if the attrib has the same name of an other annotation'''
504 self.ParsedSeqRecord.annotations[k] = int(v)
505 else:
506
507 self.ParsedSeqRecord.annotations[k] = v
508
509 '''Top-to-bottom entry children parsing '''
510 for element in self.entry.getchildren():
511 if element.tag==NS + 'name':
512 _parse_name(element)
513 elif element.tag==NS + 'accession':
514 _parse_accession(element)
515 elif element.tag==NS + 'protein':
516 _parse_protein(element)
517 elif element.tag==NS + 'gene':
518 _parse_gene(element)
519 elif element.tag==NS + 'geneLocation':
520 _parse_geneLocation(element)
521 elif element.tag==NS + 'organism':
522 _parse_organism(element)
523 elif element.tag==NS + 'organismHost':
524 _parse_organismHost(element)
525 elif element.tag==NS + 'keyword':
526 _parse_keyword(element)
527 elif element.tag==NS + 'comment':
528 _parse_comment(element)
529 elif element.tag==NS + 'dbReference':
530 _parse_dbReference(element)
531 elif element.tag==NS + 'reference':
532 _parse_reference(element)
533 elif element.tag==NS + 'feature':
534 _parse_feature(element)
535 elif element.tag==NS + 'proteinExistence':
536 _parse_proteinExistence(element)
537 elif element.tag==NS + 'evidence':
538 _parse_evidence(element)
539 elif element.tag==NS + 'sequence':
540 _parse_sequence(element)
541 else:
542 pass
543
544 self.ParsedSeqRecord.dbxrefs=list(set(self.ParsedSeqRecord.dbxrefs))
545 self.ParsedSeqRecord.dbxrefs.sort()
546
547
548 if not self.ParsedSeqRecord.id:
549 self.ParsedSeqRecord.id=self.ParsedSeqRecord.annotations['accessions'][0]
550
551
552
553 return self.ParsedSeqRecord
554