Package Pyblio :: Package Parsers :: Package Semantic :: Module PubMed
[hide private]
[frames] | no frames]

Source Code for Module Pyblio.Parsers.Semantic.PubMed

  1  # This file is part of pybliographer 
  2  #  
  3  # Copyright (C) 1998-2006 Frederic GOBRY 
  4  # Email : gobry@pybliographer.org 
  5  #           
  6  # This program is free software; you can redistribute it and/or 
  7  # modify it under the terms of the GNU General Public License 
  8  # as published by the Free Software Foundation; either version 2  
  9  # of the License, or (at your option) any later version. 
 10  #    
 11  # This program is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details.  
 15  #  
 16  # You should have received a copy of the GNU General Public License 
 17  # along with this program; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA. 
 19  #  
 20  """ 
 21  Parser for the XML format returned by PubMed's Web API 
 22  """ 
 23   
 24  import logging 
 25   
 26  from gettext import gettext as _ 
 27   
 28  from Pyblio import Attribute, Store, Compat 
 29   
 30  _DEBUG = False  # debug switch: when True, Reader.parse() dumps each MedlineCitation element via Compat.ElementTree.dump()
 31   
class Reader(object):
    """Parse records as returned by PubMed's web service.

    parse() walks every ``PubmedArticle/MedlineCitation`` element of the
    supplied tree.  For each child element it calls the method named
    ``do_<tag>`` (falling back to do_default()), and each handler stores
    what it understands into ``self.record``.  Children of ``Article`` are
    dispatched the same way to ``do_Article_<tag>`` methods.
    """

    log = logging.getLogger('pyblio.import.pubmed')

    # Journal title found in MedlineJournalInfo/MedlineTA for the record
    # currently being parsed.  The class-level default guarantees that
    # record_end() can always read the attribute, even when the record has
    # no MedlineJournalInfo element.
    _fallback_journal = None

    def uid(self):
        """Generate the display name of a record.

        Used when outputting a warning for instance.  Returns
        ``'PMID:<pmid>'`` when the record already has a pmid, and the repr
        of the record's key otherwise.
        """
        try:
            return 'PMID:' + self.record['pmid'][0]
        except KeyError:
            return repr(self.record.key)

    # Supported fields

    def do_default(self, node):
        """Called when no specific handler exists for ``node.tag``.

        Only logs a warning; the element is otherwise ignored.
        """
        # Logger.warn() is a deprecated alias; warning() is the supported
        # spelling.  The emitted message is unchanged.
        self.log.warning('%s: unhandled attribute %s',
                         self.uid(), repr(node.tag))
        return

    def do_MedlineJournalInfo(self, node):
        """Remember the MedlineTA text for use by record_end()."""
        # this can contain the journal title, but as a fallback
        self._fallback_journal = node.findtext('./MedlineTA')

    def do_Article(self, node):
        """Dispatch every child of ``Article`` to ``do_Article_<tag>``."""
        for child in node:
            fn = getattr(self, 'do_Article_' + child.tag,
                         self.do_default)
            fn(child)
        return

    def do_Article_ArticleTitle(self, node):
        """Store the element text as the record's ``title``."""
        self.record.add('title', node.text, Attribute.Text)

    def do_Article_Abstract(self, node):
        """Store the text of the first ``AbstractText`` child as ``abstract``.

        An ``Abstract`` element without any ``AbstractText`` child is
        skipped instead of raising AttributeError on ``None.text``.
        """
        abstract = node.find('./AbstractText')
        if abstract is None:
            return
        self.record.add('abstract', abstract.text, Attribute.Text)

    def do_Article_Journal(self, node):
        """Store the journal title, ISSN, volume, issue, year and month.

        Every sub-element is optional; missing ones are silently skipped.
        """
        def maybe(dst, key, conv):
            # add field `dst` only when the sub-element `key` is present
            v = node.find(key)
            if v is not None:
                self.record.add(dst, v.text, conv)

        # optionally, the title can come from the MedlineTA field
        maybe('journal', 'Title', Attribute.Text)
        maybe('journal.issn', 'ISSN', Attribute.ID)

        maybe('journal.volume', 'JournalIssue/Volume', Attribute.Text)
        maybe('journal.issue', 'JournalIssue/Issue', Attribute.Text)

        maybe('journal.year', 'JournalIssue/PubDate/Year', Attribute.Text)
        maybe('journal.month', 'JournalIssue/PubDate/Month', Attribute.Text)

    def do_Article_AuthorList(self, node):
        """Add one ``author`` person per ``Author`` child element."""
        def v(n, k):
            # text of sub-element `k` of `n`, or None when it is absent
            l = n.find(k)
            if l is not None:
                return l.text
            return None

        # believe it if you want, but some records specify a
        # "ForeName", and others a "FirstName"...
        for au in node.findall('./Author'):
            person = Attribute.Person(
                last=v(au, './LastName'),
                first=v(au, './ForeName') or v(au, './FirstName'))
            self.record.add('author', person)

    @staticmethod
    def _expand_pages(pages):
        """Expand an abbreviated page range into a full one.

        pubmed will return abbreviated page ranges (1234-45 meaning
        1234-1245).  We transform them into full ranges, as this is only
        some kind of space saving convention.

        Anything that is not exactly two integers separated by a dash, or
        that cannot be turned into an increasing range, is returned
        unchanged.
        """
        # Strip the parts: int() tolerates surrounding blanks, but the
        # string surgery below does not ('1234 - 45' used to end up in
        # int('12 45') and raise an uncaught ValueError).
        parts = [part.strip() for part in pages.split('-')]
        if len(parts) != 2:
            return pages
        try:
            first, last = [int(part) for part in parts]
        except ValueError:
            return pages
        if last >= first:
            # already a regular range
            return pages

        left, right = parts
        if len(right) >= len(left):
            # There is no shared prefix to borrow from the left side.  With
            # a longer right side the slice below would use a negative
            # index and build a bogus number (100-0050 -> 100-100050).
            return pages

        # we could play with logs to find out the actual cut
        # point, but using the textual representation is
        # probably more natural
        full_right = left[:len(left) - len(right)] + right
        if int(full_right) > first:
            return '%s-%s' % (left, full_right)
        return pages

    def do_Article_Pagination(self, node):
        """Store ``MedlinePgn`` as ``journal.pages``, expanding short ranges.

        Nothing is stored when the element is missing or has no text.
        """
        v = node.find('./MedlinePgn')
        if v is not None and v.text:
            self.record.add('journal.pages', self._expand_pages(v.text),
                            Attribute.Text)

    def do_PMID(self, node):
        """Store the element text as the record's ``pmid`` identifier."""
        self.record.add('pmid', node.text, Attribute.ID)

    # Parsing logic and hooks

    def record_begin(self):
        """Hook called by parse() right after a new empty record is created."""
        pass

    def record_end(self):
        """Hook called by parse() once all children of a citation are handled.

        Fills in the journal title from the MedlineTA fallback when needed.
        """
        # in some cases, the journal title wasn't in the Journal node,
        # but can be recovered from the MedlineTA field.
        # NOTE(review): is_complete() is defined outside this file;
        # presumably it is false when only journal.* sub-fields were stored
        # without a title - confirm against Pyblio.Attribute.
        j = self.record.get('journal')
        if j and not j[0].is_complete() and self._fallback_journal:
            self.record.add('journal', self._fallback_journal, Attribute.Text)

    def parse(self, fd, db, rs=None):
        """Import every MedlineCitation found in ``fd`` into ``db``.

        fd -- parsed XML tree; only its findall() method is used here
        db -- database receiving the records through db.add()
        rs -- result set the new record keys are added to.  When None, a
              new one named 'Imported from PubMed' is created via
              db.rs.new().

        Returns the result set holding the keys of all imported records.
        """
        if rs is None:
            rs = db.rs.new()
            rs.name = _('Imported from PubMed')

        self.db = db

        for item in fd.findall('./PubmedArticle/MedlineCitation'):
            self.record = Store.Record()
            # Forget the fallback title of the previous record, so that it
            # cannot leak into a record lacking MedlineJournalInfo.  Done
            # here rather than in record_begin() so that subclasses
            # overriding that hook still get the reset.
            self._fallback_journal = None
            self.record_begin()

            if _DEBUG:
                Compat.ElementTree.dump(item)
            for child in item:
                fn = getattr(self, 'do_' + child.tag,
                             self.do_default)
                fn(child)

            self.record_end()

            k = db.add(self.record)
            rs.add(k)

        return rs
165