Package dns :: Module tokenizer
[hide private]
[frames | no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) 2003-2007, 2009-2011 Nominum, Inc. 
  2  # 
  3  # Permission to use, copy, modify, and distribute this software and its 
  4  # documentation for any purpose with or without fee is hereby granted, 
  5  # provided that the above copyright notice and this permission notice 
  6  # appear in all copies. 
  7  # 
  8  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
  9  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 10  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 11  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 12  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 13  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 14  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 15   
 16  """Tokenize DNS master file format""" 
 17   
 18  import cStringIO 
 19  import sys 
 20   
 21  import dns.exception 
 22  import dns.name 
 23  import dns.ttl 
 24   
 25  _DELIMITERS = { 
 26      ' ' : True, 
 27      '\t' : True, 
 28      '\n' : True, 
 29      ';' : True, 
 30      '(' : True, 
 31      ')' : True, 
 32      '"' : True } 
 33   
 34  _QUOTING_DELIMITERS = { '"' : True } 
 35   
 36  EOF = 0 
 37  EOL = 1 
 38  WHITESPACE = 2 
 39  IDENTIFIER = 3 
 40  QUOTED_STRING = 4 
 41  COMMENT = 5 
 42  DELIMITER = 6 
 43   
# Raised by Tokenizer._unget_char() and Tokenizer.unget(), whose unget
# buffers each hold a single item.
class UngetBufferFull(dns.exception.DNSException):
    """Raised when an attempt is made to unget a token when the unget
    buffer is full."""
48
class Token(object):
    """A DNS master file format token.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Initialize a token instance.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """
        self.ttype = ttype
        self.value = value
        self.has_escape = has_escape

    def is_eof(self):
        """Is this an EOF token?"""
        return self.ttype == EOF

    def is_eol(self):
        """Is this an EOL token?"""
        return self.ttype == EOL

    def is_whitespace(self):
        """Is this a WHITESPACE token?"""
        return self.ttype == WHITESPACE

    def is_identifier(self):
        """Is this an IDENTIFIER token?"""
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        """Is this a QUOTED_STRING token?"""
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        """Is this a COMMENT token?"""
        return self.ttype == COMMENT

    def is_delimiter(self):
        """Is this a DELIMITER token?"""
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        """Is this either an EOL or an EOF token?"""
        return (self.ttype == EOL or self.ttype == EOF)

    def __eq__(self, other):
        # Only the type and the value take part in comparisons;
        # has_escape is deliberately ignored, as in __ne__ below.
        if not isinstance(other, Token):
            return False
        return (self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        if not isinstance(other, Token):
            return True
        return (self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three decimal digits is replaced by the
        character with that code; a backslash followed by any other
        character is replaced by that character.  If the token has no
        escapes, the token itself is returned.  Otherwise a new token of
        the same type is returned, with has_escape left at its default
        of False.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a decimal escape is malformed
        or denotes a value greater than 255
        """
        if not self.has_escape:
            return self
        value = self.value
        length = len(value)
        pieces = []
        i = 0
        while i < length:
            c = value[i]
            i += 1
            if c == '\\':
                if i >= length:
                    raise dns.exception.UnexpectedEnd
                c = value[i]
                i += 1
                if c.isdigit():
                    # \DDD form: two more digits must follow.
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c2 = value[i]
                    i += 1
                    if i >= length:
                        raise dns.exception.UnexpectedEnd
                    c3 = value[i]
                    i += 1
                    if not (c2.isdigit() and c3.isdigit()):
                        raise dns.exception.SyntaxError
                    code = int(c) * 100 + int(c2) * 10 + int(c3)
                    # \DDD denotes a single octet.  Python 2's chr() also
                    # rejects anything above 255 with a bare ValueError,
                    # so report bad input as a syntax error instead.
                    if code > 255:
                        raise dns.exception.SyntaxError
                    c = chr(code)
            pieces.append(c)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        return iter((self.ttype, self.value))

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        elif i == 1:
            return self.value
        else:
            raise IndexError
157
class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are returned as L{Token} objects.  For compatibility with
    older code a token can also be used like a (type, value) tuple,
    where I{type} is an int, and I{value} is a string.  The valid types
    are EOF, EOL, WHITESPACE, IDENTIFIER, QUOTED_STRING, COMMENT, and
    DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current delimiter dictionary.
    @type delimiters: dict
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  If None, '<string>', '<stdin>' or '<file>' is used
        depending on what I{f} is.
        @type filename: string
        """

        if isinstance(f, str):
            f = cStringIO.StringIO(f)
            if filename is None:
                filename = '<string>'
        elif filename is None:
            if f is sys.stdin:
                filename = '<stdin>'
            else:
                filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        An ungotten character, if any, is returned first.  At end of
        input the empty string is returned.

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    # Counted only when read from the file, so an
                    # ungotten newline is not counted twice.
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is dropped when the
            # caller did not ask for that kind of token.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: from here on only '"' is a
                            # delimiter.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote, ungotten when the quoted
                            # string token was returned.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of the line or of the input.
                        while 1:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            # Leave the terminator for the next get().
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            # Inside parentheses the newline ending the
                            # comment is just whitespace.
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the current token; keep it for
                    # the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Check the character just read (the original
                        # code re-tested c here, so a truncated escape
                        # was reported as a syntax error).
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        code = int(c) * 100 + int(c2) * 10 + int(c3)
                        # \DDD denotes a single octet; Python 2's chr()
                        # would otherwise raise a bare ValueError.
                        if code > 255:
                            raise dns.exception.SyntaxError
                        c = chr(code)
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def next(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: Token object
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError: the token is not an identifier
        made up entirely of digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer in
        the range 0..4294967295
        @rtype: int
        """

        # int() yields a long automatically when the value needs one, so
        # the shared get_int() helper is sufficient here.
        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token value is passed to dns.name.from_text() without being
        unescaped first.

        @param origin: passed through to dns.name.from_text()
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        The unescaped token value is converted with dns.ttl.from_text(),
        and whatever that function returns is returned.

        @raises dns.exception.SyntaxError: the token is not an identifier
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)
548