Trees | Indices | Help |
---|
|
1 # Copyright 2008 by Michiel de Hoon. All rights reserved. 2 # This code is part of the Biopython distribution and governed by its 3 # license. Please see the LICENSE file that should have been included 4 # as part of this package. 5 6 """Parser for XML results returned by NCBI's Entrez Utilities. This 7 parser is used by the read() function in Bio.Entrez, and is not intended 8 to be used directly. 9 """ 10 11 # The question is how to represent an XML file as Python objects. Some 12 # XML files returned by NCBI look like lists, others look like dictionaries, 13 # and others look like a mix of lists and dictionaries. 14 # 15 # My approach is to classify each possible element in the XML as a plain 16 # string, an integer, a list, a dictionary, or a structure. The latter is a 17 # dictionary where the same key can occur multiple times; in Python, it is 18 # represented as a dictionary where that key occurs once, pointing to a list 19 # of values found in the XML file. 20 # 21 # The parser then goes through the XML and creates the appropriate Python 22 # object for each element. The different levels encountered in the XML are 23 # preserved on the Python side. So a subelement of a subelement of an element 24 # is a value in a dictionary that is stored in a list which is a value in 25 # some other dictionary (or a value in a list which itself belongs to a list 26 # which is a value in a dictionary, and so on). Attributes encountered in 27 # the XML are stored as a dictionary in a member .attributes of each element, 28 # and the tag name is saved in a member .tag. 29 # 30 # To decide which kind of Python object corresponds to each element in the 31 # XML, the parser analyzes the DTD referred to at the top of (almost) every 32 # XML file returned by the Entrez Utilities. This is preferred over a hand- 33 # written solution, since the number of DTDs is rather large and their 34 # contents may change over time. 
About half the code in this parser deals 35 # with parsing the DTD, and the other half with the XML itself. 36 # 37 # One type of query (EFetch on the Journals database) returns an XML without 38 # an associated DTD. These files are handled using the hand-written 39 # SerialSet.py 40 41 42 import os.path 43 from xml.parsers import expat 44 45 # The following four classes are used to add a member .attributes to integers, 46 # strings, lists, and dictionaries, respectively. 47 49 51 53 55 57 58 # A StructureElement is like a dictionary, but some of its keys can have 59 # multiple values associated with them. These values are stored in a list 60 # under each key.7263 dict.__init__(self) 64 for key in keys: 65 dict.__setitem__(self, key, []) 66 self.listkeys = keys7426476 self.stack = [] 77 self.errors = [] 78 self.integers = [] 79 self.strings = [] 80 self.lists = [] 81 self.dictionaries = [] 82 self.structures = {} 83 self.items = [] 84 self.dtd_dir = dtd_dir 85 self.initialized = False8688 """Set up the parser and let it parse the XML results""" 89 self.parser = expat.ParserCreate() 90 self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) 91 self.parser.StartElementHandler = self.startElement 92 self.parser.EndElementHandler = self.endElement 93 self.parser.CharacterDataHandler = self.characters 94 self.parser.ExternalEntityRefHandler = self.external_entity_ref_handler 95 self.parser.ParseFile(handle) 96 self.parser = None 97 return self.object98100 if not self.initialized: 101 # This XML file does not have a DTD; load its definitions here 102 # using the first element in the XML. As far as I know, this 103 # occurs only for EFetch results downloaded from the Journals 104 # database. 
105 self.load_definitions(name) 106 self.content = "" 107 if name in self.lists: 108 object = ListElement() 109 elif name in self.dictionaries: 110 object = DictionaryElement() 111 elif name in self.structures: 112 object = StructureElement(self.structures[name]) 113 elif name in self.items: # Only appears in ESummary 114 name = str(attrs["Name"]) # convert from Unicode 115 del attrs["Name"] 116 itemtype = str(attrs["Type"]) # convert from Unicode 117 del attrs["Type"] 118 if itemtype=="Structure": 119 object = DictionaryElement() 120 elif name in ("ArticleIds", "History"): 121 object = StructureElement(["pubmed", "medline"]) 122 elif itemtype=="List": 123 object = ListElement() 124 else: 125 object = StringElement() 126 object.itemname = name 127 object.itemtype = itemtype 128 elif name in self.strings + self.errors + self.integers: 129 self.attributes = attrs 130 return 131 if object!="": 132 object.tag = name 133 if attrs: 134 object.attributes = dict(attrs) 135 if len(self.stack)!=0: 136 current = self.stack[-1] 137 try: 138 current.append(object) 139 except AttributeError: 140 current[name] = object 141 self.stack.append(object)142144 value = self.content 145 if name in self.errors: 146 if value=="": 147 return 148 else: 149 raise RuntimeError(value) 150 elif name in self.integers: 151 value = IntegerElement(value) 152 elif name in self.strings: 153 # Convert Unicode strings to plain strings if possible 154 try: 155 value = StringElement(value) 156 except UnicodeEncodeError: 157 value = UnicodeElement(value) 158 elif name in self.items: 159 self.object = self.stack.pop() 160 if self.object.itemtype in ("List", "Structure"): 161 return 162 elif self.object.itemtype=="Integer": 163 value = IntegerElement(value) 164 else: 165 # Convert Unicode strings to plain strings if possible 166 try: 167 value = StringElement(value) 168 except UnicodeEncodeError: 169 value = UnicodeElement(value) 170 name = self.object.itemname 171 else: 172 self.object = self.stack.pop() 
173 return 174 value.tag = name 175 if self.attributes: 176 value.attributes = dict(self.attributes) 177 del self.attributes 178 current = self.stack[-1] 179 try: 180 current.append(value) 181 except AttributeError: 182 current[name] = value183 186188 """This callback function is called for each element declaration: 189 <!ELEMENT name (...)> 190 encountered in a DTD. The purpose of this function is to determine 191 whether this element should be regarded as a string, integer, list 192 dictionary, structure, or error.""" 193 if name.upper()=="ERROR": 194 self.errors.append(name) 195 return 196 if name=='Item' and model==(expat.model.XML_CTYPE_MIXED, 197 expat.model.XML_CQUANT_REP, 198 None, ((expat.model.XML_CTYPE_NAME, 199 expat.model.XML_CQUANT_NONE, 200 'Item', 201 () 202 ), 203 ) 204 ): 205 # Special case. As far as I can tell, this only occurs in the 206 # eSummary DTD. 207 self.items.append(name) 208 return 209 # First, remove ignorable parentheses around declarations 210 while (model[0] in (expat.model.XML_CTYPE_SEQ, 211 expat.model.XML_CTYPE_CHOICE) 212 and model[1] in (expat.model.XML_CQUANT_NONE, 213 expat.model.XML_CQUANT_OPT) 214 and len(model[3])==1): 215 model = model[3][0] 216 # PCDATA declarations correspond to strings 217 if model[0] in (expat.model.XML_CTYPE_MIXED, 218 expat.model.XML_CTYPE_EMPTY): 219 self.strings.append(name) 220 return 221 # List-type elements 222 if (model[0] in (expat.model.XML_CTYPE_CHOICE, 223 expat.model.XML_CTYPE_SEQ) and 224 model[1] in (expat.model.XML_CQUANT_PLUS, 225 expat.model.XML_CQUANT_REP)): 226 self.lists.append(name) 227 return 228 # This is the tricky case. Check which keys can occur multiple 229 # times. If only one key is possible, and it can occur multiple 230 # times, then this is a list. If more than one key is possible, 231 # but none of them can occur multiple times, then this is a 232 # dictionary. Otherwise, this is a structure. 
233 # In 'single' and 'multiple', we keep track which keys can occur 234 # only once, and which can occur multiple times. 235 single = [] 236 multiple = [] 237 # The 'count' function is called recursively to make sure all the 238 # children in this model are counted. Error keys are ignored; 239 # they raise an exception in Python. 240 def count(model): 241 quantifier, name, children = model[1:] 242 if name==None: 243 if quantifier in (expat.model.XML_CQUANT_PLUS, 244 expat.model.XML_CQUANT_REP): 245 for child in children: 246 multiple.append(child[2]) 247 else: 248 for child in children: 249 count(child) 250 elif name.upper()!="ERROR": 251 if quantifier in (expat.model.XML_CQUANT_NONE, 252 expat.model.XML_CQUANT_OPT): 253 single.append(name) 254 elif quantifier in (expat.model.XML_CQUANT_PLUS, 255 expat.model.XML_CQUANT_REP): 256 multiple.append(name)257 count(model) 258 if len(single)==0 and len(multiple)==1: 259 self.lists.append(name) 260 elif len(multiple)==0: 261 self.dictionaries.append(name) 262 else: 263 self.structures.update({name: multiple})266 """The purpose of this function is to load the DTD locally, instead 267 of downloading it from the URL specified in the XML. Using the local 268 DTD results in much faster parsing. If the DTD is not found locally, 269 we try to download it. In practice, this may fail though, if the XML 270 relies on many interrelated DTDs. 
If new DTDs appear, putting them in 271 Bio/Entrez/DTDs will allow the parser to see them.""" 272 self.initialized = True 273 location, filename = os.path.split(systemId) 274 path = os.path.join(self.dtd_dir, filename) 275 try: 276 handle = open(path) 277 except IOError: 278 import warnings, urllib 279 warnings.warn("DTD file %s not found in Biopython installation; trying to retrieve it from NCBI" % filename) 280 handle = urllib.urlopen(systemId) 281 parser = self.parser.ExternalEntityParserCreate(context) 282 parser.ElementDeclHandler = self.elementDecl 283 parser.ParseFile(handle) 284 return 1285287 """This function is only needed if the XML does not specify a DTD. 288 As far as I can tell, this only occurs for EFetch results from the 289 Journals database. Use a hand-written set of definitions instead.""" 290 self.initialized = True 291 if filename=="SerialSet": 292 # EFetch results from the Journals database 293 import SerialSet as module 294 else: 295 import warnings 296 warnings.warn("No parser available for %s; skipping its elements" % filename) 297 return 298 self.errors.extend(module.errors) 299 self.integers.extend(module.integers) 300 self.strings.extend(module.strings) 301 self.lists.extend(module.lists) 302 self.dictionaries.extend(module.dictionaries) 303 self.structures.update(module.structures) 304 self.items.extend(module.items)305
Trees | Indices | Help |
---|
Generated by Epydoc 3.0.1 on Thu Dec 25 10:43:52 2008 | http://epydoc.sourceforge.net |