Package Bio :: Package SeqIO :: Module UniprotIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.UniprotIO

  1  # Copyright 2010 by Andrea Pierleoni 
  2  # Revisions copyright 2010 by Peter Cock 
  3  # All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8   
  9  """Bio.SeqIO support for the "uniprot-xml" file format. 
 10   
 11  See also: 
 12   
 13  http://www.uniprot.org 
 14   
 15  The UniProt XML format essentially replaces the old plain text file format 
 16  originally introduced by SwissProt ("swiss" format in Bio.SeqIO). 
 17  """ 
 18  import sys 
 19   
 20  from Bio import Seq 
 21  from Bio import SeqFeature 
 22  from Bio import Alphabet 
 23  from Bio.SeqRecord import SeqRecord 
 24  try: 
 25      from cStringIO import StringIO 
 26  except ImportError: 
 27      from StringIO import StringIO 
 28  import warnings 
 29  try: 
 30      if (3,0,0) <= sys.version_info[:3] <= (3,1,2): 
 31          #workaround for bug in python 3 to 3.1.2  see http://bugs.python.org/issue9257 
 32          from xml.etree import ElementTree as ElementTree 
 33      else: 
 34          from xml.etree import cElementTree as ElementTree 
 35  except ImportError: 
 36      try: 
 37          from xml.etree import ElementTree as ElementTree 
 38      except ImportError: 
 39          # Python 2.4 -- check for 3rd-party implementations 
 40          try: 
 41              from lxml import etree as ElementTree 
 42          except ImportError: 
 43              try: 
 44                  import cElementTree as ElementTree 
 45              except ImportError: 
 46                  try: 
 47                      from elementtree import ElementTree 
 48                  except ImportError: 
 49                      ElementTree = None 
 50                      #TODO - Clean this up after we drop Python 2.4, 
 51                      #for now delay the error so the tests pass on Python 2.4 
 52                      #from Bio import MissingPythonDependencyError 
 53                      #raise MissingPythonDependencyError( 
 54                      #        "No ElementTree module was found. " 
 55                      #        "Use Python 2.5+, lxml or elementtree if you " 
 56                      #        "want to use Bio.SeqIO.UniprotIO.") 
 57   
 58  NS = "{http://uniprot.org/uniprot}" 
 59  REFERENCE_JOURNAL = "%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)" 
 60   
def UniprotIterator(handle, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
    """Iterate over a UniProt XML file, yielding one SeqRecord per entry.

    This generator can be used in Bio.SeqIO.

    Arguments:
     - handle - an object with a ``read`` method, or a string holding the
       XML itself (a string is wrapped in a StringIO).
     - alphabet - alphabet handed on to Parser for the sequence; nucleotide
       alphabets (also when wrapped in Alphabet.Gapped) are rejected.
     - return_raw_comments - if True, comment fields are also returned as
       complete XML to allow further processing (handed on to Parser).

    Raises ValueError for a nucleotide alphabet, Exception if handle is
    neither readable nor a string, and MissingExternalDependencyError if no
    ElementTree implementation could be imported.  Because this is a
    generator function, these checks run when iteration starts, not when the
    function is called.
    """
    if isinstance(alphabet, Alphabet.NucleotideAlphabet):
        raise ValueError("Wrong alphabet %r" % alphabet)
    if isinstance(alphabet, Alphabet.Gapped):
        if isinstance(alphabet.alphabet, Alphabet.NucleotideAlphabet):
            raise ValueError("Wrong alphabet %r" % alphabet)

    if not hasattr(handle, "read"):
        if isinstance(handle, str):
            handle = StringIO(handle)
        else:
            raise Exception('An XML-containing handler or an XML string must be passed')

    if ElementTree is None:
        from Bio import MissingExternalDependencyError
        raise MissingExternalDependencyError(
            "No ElementTree module was found. "
            "Use Python 2.5+, lxml or elementtree if you "
            "want to use Bio.SeqIO.UniprotIO.")

    entry_tag = NS + "entry"
    # Only completed elements are of interest, so "start" events are not requested.
    for event, elem in ElementTree.iterparse(handle, events=("end",)):
        if elem.tag == entry_tag:
            yield Parser(elem, alphabet=alphabet, return_raw_comments=return_raw_comments).parse()
            # Release the children of the entry that has just been converted.
            elem.clear()
94
class Parser(object):
    """Parse a single UniProt XML entry element into a SeqRecord.

    Arguments:
     - elem - the ElementTree element of one entry; parse() asserts that its
       tag is the UniProt-namespaced 'entry' tag.
     - alphabet - alphabet given to the Seq built from the sequence element
       (default is a protein alphabet).
     - return_raw_comments - if True, every comment element is additionally
       stored, serialised as XML, under a 'comment_<type>_xml' annotation.
    """

    # Comment types whose 'text' descendants are stored verbatim.
    # ("RNA editing" positions are not parsed.)
    _SIMPLE_COMMENT_TYPES = (
        "allergen",
        "biotechnology",
        "biophysicochemical properties",
        "catalytic activity",
        "caution",
        "cofactor",
        "developmental stage",
        "disease",
        "domain",
        "disruption phenotype",
        "enzyme regulation",
        "function",
        "induction",
        "miscellaneous",
        "pathway",
        "pharmaceutical",
        "polymorphism",
        "PTM",
        "RNA editing",
        "similarity",
        "subunit",
        "tissue specificity",
        "toxic dose",
    )

    def __init__(self, elem, alphabet=Alphabet.ProteinAlphabet(), return_raw_comments=False):
        self.entry = elem
        self.alphabet = alphabet
        self.return_raw_comments = return_raw_comments

    def parse(self):
        """Convert the stored entry element and return the resulting SeqRecord.

        The record is also kept as ``self.ParsedSeqRecord``.  Entry attributes
        become annotations, each recognised child element is handed to a
        dedicated helper, dbxrefs are de-duplicated and sorted, and the first
        accession is used as the record id when no id has been set.
        """
        assert self.entry.tag == NS + 'entry'

        record = SeqRecord('', id='')
        self.ParsedSeqRecord = record

        def iter_tag(element, tag):
            """Iterate over element and its descendants that carry the given tag."""
            # Element.iter() superseded getiterator(), which newer ElementTree
            # releases no longer provide; older implementations only have
            # getiterator().
            if hasattr(element, 'iter'):
                return element.iter(tag)
            return element.getiterator(tag)

        def append_to_annotations(key, value):
            """Add value to the list stored under key, skipping duplicates."""
            values = record.annotations.setdefault(key, [])
            if value not in values:
                values.append(value)

        def _parse_name(element):
            # The entry name is used as record name and also added to dbxrefs.
            record.name = element.text
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_accession(element):
            # 'accessions' key chosen to cope with the SwissProt plain text parser.
            append_to_annotations('accessions', element.text)
            record.dbxrefs.append(self.dbname + ':' + element.text)

        def _parse_protein(element):
            """Parse protein names; the first fullName becomes the description."""
            name_tags = (NS + 'recommendedName', NS + 'alternativeName')
            descr_set = False
            for protein_element in element:
                # 'component' and 'domain' children are not parsed.
                if protein_element.tag not in name_tags:
                    continue
                group = protein_element.tag.replace(NS, '')
                for rec_name in protein_element:
                    ann_key = '%s_%s' % (group, rec_name.tag.replace(NS, ''))
                    append_to_annotations(ann_key, rec_name.text)
                    if rec_name.tag == NS + 'fullName' and not descr_set:
                        record.description = rec_name.text
                        descr_set = True

        def _parse_gene(element):
            for genename_element in element:
                if 'type' not in genename_element.attrib:
                    continue
                name_type = genename_element.attrib['type']
                ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''), name_type)
                if name_type == 'primary':
                    # The primary name is stored as a plain value, not a list.
                    record.annotations[ann_key] = genename_element.text
                else:
                    append_to_annotations(ann_key, genename_element.text)

        def _parse_geneLocation(element):
            append_to_annotations('geneLocation', element.attrib['type'])

        def _parse_organism(element):
            organism_name = com_name = sci_name = ''
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    if organism_element.text:
                        name_type = organism_element.attrib['type']
                        if name_type == 'scientific':
                            sci_name = organism_element.text
                        elif name_type == 'common':
                            com_name = organism_element.text
                        else:
                            # e.g. synonym
                            append_to_annotations("organism_name", organism_element.text)
                elif organism_element.tag == NS + 'dbReference':
                    record.dbxrefs.append(organism_element.attrib['type'] + ':' + organism_element.attrib['id'])
                elif organism_element.tag == NS + 'lineage':
                    for taxon_element in organism_element:
                        if taxon_element.tag == NS + 'taxon':
                            append_to_annotations('taxonomy', taxon_element.text)
            if sci_name and com_name:
                organism_name = '%s (%s)' % (sci_name, com_name)
            elif sci_name:
                organism_name = sci_name
            elif com_name:
                organism_name = com_name
            record.annotations['organism'] = organism_name

        def _parse_organismHost(element):
            for organism_element in element:
                if organism_element.tag == NS + 'name':
                    append_to_annotations("organism_host", organism_element.text)

        def _parse_keyword(element):
            append_to_annotations('keywords', element.text)

        def _parse_comment(element):
            """Store the parts of a comment element that fit in flat annotations.

            Comment fields are very heterogeneous: each type has its own
            (frequently changed) schema.  Storing everything would need more
            complex data structures such as nested dictionaries.  That is left
            to the end user, who can set return_raw_comments=True to get the
            original XML back in the annotations as well.

            Comment types available in December 2009: the types listed in
            _SIMPLE_COMMENT_TYPES plus "alternative products",
            "subcellular location", "sequence caution", "online information",
            "mass spectrometry" and "interaction".
            """
            comment_type = element.attrib['type']
            compact_type = comment_type.replace(' ', '')

            if comment_type in self._SIMPLE_COMMENT_TYPES:
                ann_key = 'comment_%s' % compact_type
                for text_element in iter_tag(element, NS + 'text'):
                    if text_element.text:
                        append_to_annotations(ann_key, text_element.text)
            elif comment_type == 'subcellular location':
                for subloc_element in iter_tag(element, NS + 'subcellularLocation'):
                    for el in subloc_element:
                        if el.text:
                            ann_key = 'comment_%s_%s' % (compact_type, el.tag.replace(NS, ''))
                            append_to_annotations(ann_key, el.text)
            elif comment_type == 'interaction':
                ann_key = 'comment_%s_intactId' % comment_type
                for interact_element in iter_tag(element, NS + 'interactant'):
                    append_to_annotations(ann_key, interact_element.attrib['intactId'])
            elif comment_type == 'alternative products':
                ann_key = 'comment_%s_isoform' % compact_type
                for alt_element in iter_tag(element, NS + 'isoform'):
                    for id_element in iter_tag(alt_element, NS + 'id'):
                        append_to_annotations(ann_key, id_element.text)
            elif comment_type == 'mass spectrometry':
                ann_key = 'comment_%s' % compact_type
                start = end = 0
                for loc_element in iter_tag(element, NS + 'location'):
                    pos_els = list(iter_tag(loc_element, NS + 'position'))
                    # Positions may be undefined or erroneously mapped; such a
                    # location is skipped and start/end are only updated
                    # together, so a half-parsed range is never reported.
                    try:
                        if pos_els:
                            new_end = int(pos_els[0].attrib['position'])
                            new_start = new_end - 1
                        else:
                            begin_el = list(iter_tag(loc_element, NS + 'begin'))[0]
                            end_el = list(iter_tag(loc_element, NS + 'end'))[0]
                            new_start = int(begin_el.attrib['position']) - 1
                            new_end = int(end_el.attrib['position'])
                    except (KeyError, ValueError, IndexError):
                        continue
                    start, end = new_start, new_end
                mass = element.attrib['mass']
                # The method comes from its own attribute (the mass used to be
                # repeated here); a missing attribute yields an empty string
                # rather than a new failure mode.
                method = element.attrib.get('method', '')
                if start == end == 0:
                    append_to_annotations(ann_key, 'undefined:%s|%s' % (mass, method))
                else:
                    append_to_annotations(ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
            elif comment_type == 'sequence caution':
                pass  # not parsed: little information, complex structure
            elif comment_type == 'online information':
                ann_key = 'comment_%s' % compact_type
                for link_element in iter_tag(element, NS + 'link'):
                    append_to_annotations(ann_key, '%s@%s' % (element.attrib['name'], link_element.attrib['uri']))

            # Return the raw XML of the comment as well, if requested.
            if self.return_raw_comments:
                ann_key = 'comment_%s_xml' % compact_type
                append_to_annotations(ann_key, ElementTree.tostring(element))

        def _parse_dbReference(element):
            """Add the cross reference to dbxrefs.

            For PDB references such as

                dbReference type="PDB" key="11" id="2GEZ"
                    property value="X-ray" type="method"
                    property value="2.60 A" type="resolution"
                    property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"

            a SeqFeature is built per chain group, but it is not attached to
            the record (see the TODO below).  Other property children cannot
            be fitted into a SeqRecord with a simple list; at least Ensembl
            and EMBL parsing could be improved to add entries to dbxrefs.
            """
            db_type = element.attrib['type']
            record.dbxrefs.append(db_type + ':' + element.attrib['id'])
            if db_type != 'PDB':
                return
            method = ""
            resolution = ""
            for ref_element in element:
                if ref_element.tag != NS + 'property':
                    continue
                dat_type = ref_element.attrib['type']
                if dat_type == 'method':
                    method = ref_element.attrib['value']
                elif dat_type == 'resolution':
                    resolution = ref_element.attrib['value']
                elif dat_type == 'chains':
                    for chunk in ref_element.attrib['value'].split(','):
                        pair = chunk.strip().split('=')
                        if pair[1] != '-':
                            # TODO - How best to store these, do SeqFeatures make sense?
                            feature = SeqFeature.SeqFeature()
                            feature.type = db_type
                            feature.qualifiers['name'] = element.attrib['id']
                            feature.qualifiers['method'] = method
                            feature.qualifiers['resolution'] = resolution
                            feature.qualifiers['chains'] = pair[0].split('/')
                            bounds = pair[1].split('-')
                            start = int(bounds[0]) - 1
                            end = int(bounds[1])
                            feature.location = SeqFeature.FeatureLocation(start, end)
                            # Deliberately not attached to the record yet:
                            # record.features.append(feature)

        def _parse_reference(element):
            reference = SeqFeature.Reference()
            authors = []
            scopes = []
            tissues = []
            journal_name = ''
            pub_type = ''
            pub_date = ''
            j_volume = j_first = j_last = ''
            for ref_element in element:
                if ref_element.tag == NS + 'citation':
                    citation = ref_element.attrib
                    pub_type = citation['type']
                    if pub_type == 'submission':
                        pub_type += ' to the ' + citation['db']
                    if 'name' in citation:
                        journal_name = citation['name']
                    pub_date = citation.get('date', '')
                    j_volume = citation.get('volume', '')
                    j_first = citation.get('first', '')
                    j_last = citation.get('last', '')
                    for cit_element in ref_element:
                        if cit_element.tag == NS + 'title':
                            reference.title = cit_element.text
                        elif cit_element.tag == NS + 'authorList':
                            for person_element in cit_element:
                                authors.append(person_element.attrib['name'])
                        elif cit_element.tag == NS + 'dbReference':
                            xref_type = cit_element.attrib['type']
                            xref_id = cit_element.attrib['id']
                            record.dbxrefs.append(xref_type + ':' + xref_id)
                            # Both identifiers are selected by the type of the
                            # cross reference itself (the MEDLINE test used to
                            # look at the enclosing citation's type instead).
                            if xref_type == 'PubMed':
                                reference.pubmed_id = xref_id
                            elif xref_type == 'MEDLINE':
                                reference.medline_id = xref_id
                elif ref_element.tag == NS + 'scope':
                    scopes.append(ref_element.text)
                elif ref_element.tag == NS + 'source':
                    for source_element in ref_element:
                        if source_element.tag == NS + 'tissue':
                            tissues.append(source_element.text)
            scopes_str = ''
            if scopes:
                scopes_str = 'Scope: ' + ', '.join(scopes)
            tissues_str = ''
            if tissues:
                tissues_str = 'Tissue: ' + ', '.join(tissues)

            # Locations cannot be parsed since they are written as free text
            # inside the scopes, so all references go into the annotations.
            reference.location = []
            reference.authors = ', '.join(authors)
            if journal_name:
                if pub_date and j_volume and j_first and j_last:
                    reference.journal = REFERENCE_JOURNAL % dict(name=journal_name,
                        volume=j_volume, first=j_first, last=j_last, pub_date=pub_date)
                else:
                    reference.journal = journal_name
            reference.comment = ' | '.join((pub_type, pub_date, scopes_str, tissues_str))
            append_to_annotations('references', reference)

        def _parse_position(element, offset=0):
            """Return the SeqFeature position object for a position/begin/end element."""
            try:
                position = int(element.attrib['position']) + offset
            except KeyError:
                position = None
            status = element.attrib.get('status', '')
            if status == 'unknown':
                assert position is None
                return SeqFeature.UnknownPosition()
            elif not status:
                return SeqFeature.ExactPosition(position)
            elif status == 'greater than':
                return SeqFeature.AfterPosition(position)
            elif status == 'less than':
                return SeqFeature.BeforePosition(position)
            elif status == 'uncertain':
                return SeqFeature.UncertainPosition(position)
            else:
                raise NotImplementedError("Position status %r" % status)

        def _parse_feature(element):
            feature = SeqFeature.SeqFeature()
            for k, v in element.attrib.items():
                feature.qualifiers[k] = v
            feature.type = element.attrib.get('type', '')
            if 'id' in element.attrib:
                feature.id = element.attrib['id']
            for feature_element in element:
                if feature_element.tag == NS + 'location':
                    position_elements = feature_element.findall(NS + 'position')
                    if position_elements:
                        # A single position: start is offset by -1, end is the
                        # position itself.
                        single = position_elements[0]
                        start_position = _parse_position(single, -1)
                        end_position = _parse_position(single)
                    else:
                        begin_element = feature_element.findall(NS + 'begin')[0]
                        start_position = _parse_position(begin_element, -1)
                        end_element = feature_element.findall(NS + 'end')[0]
                        end_position = _parse_position(end_element)
                    feature.location = SeqFeature.FeatureLocation(start_position, end_position)
                else:
                    try:
                        feature.qualifiers[feature_element.tag.replace(NS, '')] = feature_element.text
                    except (AttributeError, TypeError):
                        # Skip unparsable tags (presumably nodes whose tag is
                        # not a plain string).
                        pass
            record.features.append(feature)

        def _parse_proteinExistence(element):
            append_to_annotations('proteinExistence', element.attrib['type'])

        def _parse_evidence(element):
            for k, v in element.attrib.items():
                append_to_annotations(k, v)

        def _parse_sequence(element):
            for k, v in element.attrib.items():
                if k in ("length", "mass", "version"):
                    record.annotations['sequence_%s' % k] = int(v)
                else:
                    record.annotations['sequence_%s' % k] = v
            # Drop all whitespace (line breaks, spacing) from the sequence text.
            seq = ''.join(element.text.split())
            record.seq = Seq.Seq(seq, self.alphabet)

        # ============================================ #
        # Entry attributes.
        # 'UnknownDataset' should not happen.
        self.dbname = self.entry.attrib.get('dataset', 'UnknownDataset')
        # Attributes are stored under their own names (not prefixed with
        # "entry_") to cope with the SwissProt plain text parser; this can
        # cause errors if an attribute has the same name as another annotation.
        for k, v in self.entry.attrib.items():
            if k == "version":
                record.annotations[k] = int(v)
            else:
                record.annotations[k] = v

        # Top-to-bottom parsing of the entry children; unknown tags are ignored.
        handlers = {
            NS + 'name': _parse_name,
            NS + 'accession': _parse_accession,
            NS + 'protein': _parse_protein,
            NS + 'gene': _parse_gene,
            NS + 'geneLocation': _parse_geneLocation,
            NS + 'organism': _parse_organism,
            NS + 'organismHost': _parse_organismHost,
            NS + 'keyword': _parse_keyword,
            NS + 'comment': _parse_comment,
            NS + 'dbReference': _parse_dbReference,
            NS + 'reference': _parse_reference,
            NS + 'feature': _parse_feature,
            NS + 'proteinExistence': _parse_proteinExistence,
            NS + 'evidence': _parse_evidence,
            NS + 'sequence': _parse_sequence,
        }
        for element in self.entry:
            handler = handlers.get(element.tag)
            if handler is not None:
                handler(element)

        # Remove duplicate dbxrefs and keep them in sorted order.
        record.dbxrefs = sorted(set(record.dbxrefs))

        # Use the first accession as id.
        if not record.id:
            record.id = record.annotations['accessions'][0]

        return record