Package dns :: Module tokenizer
[hide private]
[frames | no frames]

Source Code for Module dns.tokenizer

  1  # Copyright (C) 2003-2007, 2009-2011 Nominum, Inc. 
  2  # 
  3  # Permission to use, copy, modify, and distribute this software and its 
  4  # documentation for any purpose with or without fee is hereby granted, 
  5  # provided that the above copyright notice and this permission notice 
  6  # appear in all copies. 
  7  # 
  8  # THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES 
  9  # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 
 10  # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR 
 11  # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 
 12  # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 
 13  # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 
 14  # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
 15   
 16  """Tokenize DNS master file format""" 
 17   
 18  import io 
 19  import sys 
 20   
 21  import dns.exception 
 22  import dns.name 
 23  import dns.ttl 
 24   
 25  _DELIMITERS = frozenset(' \t\n;()"') 
 26  _QUOTING_DELIMITERS = frozenset('"') 
 27   
 28  EOF = 0 
 29  EOL = 1 
 30  WHITESPACE = 2 
 31  IDENTIFIER = 3 
 32  QUOTED_STRING = 4 
 33  COMMENT = 5 
 34  DELIMITER = 6 
 35   
class UngetBufferFull(dns.exception.DNSException):
    """An unget was attempted while the one-slot unget buffer (for a
    character or for a token) was already occupied."""
40
class Token(object):
    """One token of a DNS master file.

    @ivar ttype: The token type
    @type ttype: int
    @ivar value: The token value
    @type value: string
    @ivar has_escape: Does the token value contain escapes?
    @type has_escape: bool
    """

    def __init__(self, ttype, value='', has_escape=False):
        """Create a token.

        @param ttype: The token type
        @type ttype: int
        @param value: The token value
        @type value: string
        @param has_escape: Does the token value contain escapes?
        @type has_escape: bool
        """

        self.ttype, self.value, self.has_escape = ttype, value, has_escape

    # Type predicates: each compares ttype against one module-level
    # token type code.

    def is_eof(self):
        return self.ttype == EOF

    def is_eol(self):
        return self.ttype == EOL

    def is_whitespace(self):
        return self.ttype == WHITESPACE

    def is_identifier(self):
        return self.ttype == IDENTIFIER

    def is_quoted_string(self):
        return self.ttype == QUOTED_STRING

    def is_comment(self):
        return self.ttype == COMMENT

    def is_delimiter(self):
        return self.ttype == DELIMITER

    def is_eol_or_eof(self):
        return self.ttype in (EOL, EOF)

    def __eq__(self, other):
        # Only the type and value take part in comparisons; has_escape
        # is ignored.  Non-Token objects never compare equal.
        return (isinstance(other, Token) and
                self.ttype == other.ttype and
                self.value == other.value)

    def __ne__(self, other):
        return (not isinstance(other, Token) or
                self.ttype != other.ttype or
                self.value != other.value)

    def __str__(self):
        return '%d "%s"' % (self.ttype, self.value)

    def unescape(self):
        """Return a token whose value has its backslash escapes resolved.

        A backslash followed by three digits becomes the character with
        that decimal code; a backslash followed by any other character
        becomes that character.  If the token has no escapes, the token
        itself is returned.

        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: the value ends in the middle
        of an escape
        @raises dns.exception.SyntaxError: a digit escape is not made of
        three digits
        """

        if not self.has_escape:
            return self
        text = self.value
        end = len(text)
        pieces = []
        pos = 0
        while pos < end:
            ch = text[pos]
            pos += 1
            if ch == '\\':
                if pos >= end:
                    raise dns.exception.UnexpectedEnd
                ch = text[pos]
                pos += 1
                if ch.isdigit():
                    # Two more characters must be present before their
                    # content is examined.
                    if pos + 1 >= end:
                        raise dns.exception.UnexpectedEnd
                    tens = text[pos]
                    ones = text[pos + 1]
                    pos += 2
                    if not tens.isdigit() or not ones.isdigit():
                        raise dns.exception.SyntaxError
                    ch = chr(int(ch) * 100 + int(tens) * 10 + int(ones))
            pieces.append(ch)
        return Token(self.ttype, ''.join(pieces))

    # compatibility for old-style tuple tokens

    def __len__(self):
        return 2

    def __iter__(self):
        pair = (self.ttype, self.value)
        return iter(pair)

    def __getitem__(self, i):
        if i == 0:
            return self.ttype
        if i == 1:
            return self.value
        raise IndexError
149
class Tokenizer(object):
    """A DNS master file format tokenizer.

    Tokens are returned as L{Token} objects, which can also be used like
    (type, value) tuples, where I{type} is an int, and I{value} is a
    string.  The valid types are EOF, EOL, WHITESPACE, IDENTIFIER,
    QUOTED_STRING, COMMENT, and DELIMITER.

    @ivar file: The file to tokenize
    @type file: file
    @ivar ungotten_char: The most recently ungotten character, or None.
    @type ungotten_char: string
    @ivar ungotten_token: The most recently ungotten token, or None.
    @type ungotten_token: Token object
    @ivar multiline: The current multiline level.  This value is increased
    by one every time a '(' delimiter is read, and decreased by one every time
    a ')' delimiter is read.
    @type multiline: int
    @ivar quoting: This variable is true if the tokenizer is currently
    reading a quoted string.
    @type quoting: bool
    @ivar eof: This variable is true if the tokenizer has encountered EOF.
    @type eof: bool
    @ivar delimiters: The current set of delimiter characters.
    @type delimiters: frozenset
    @ivar line_number: The current line number
    @type line_number: int
    @ivar filename: A filename that will be returned by the L{where} method.
    @type filename: string
    """

    def __init__(self, f=sys.stdin, filename=None):
        """Initialize a tokenizer instance.

        @param f: The file to tokenize.  The default is sys.stdin.
        This parameter may also be a string, in which case the tokenizer
        will take its input from the contents of the string.
        @type f: file or string
        @param filename: the name of the filename that the L{where} method
        will return.  If None, a placeholder ('<string>', '<stdin>' or
        '<file>') is chosen according to the kind of input.
        @type filename: string
        """

        if isinstance(f, str):
            f = io.StringIO(f)
            if filename is None:
                filename = '<string>'
        elif filename is None:
            if f is sys.stdin:
                filename = '<stdin>'
            else:
                filename = '<file>'
        self.file = f
        self.ungotten_char = None
        self.ungotten_token = None
        self.multiline = 0
        self.quoting = False
        self.eof = False
        self.delimiters = _DELIMITERS
        self.line_number = 1
        self.filename = filename

    def _get_char(self):
        """Read a character from input.

        The ungotten character, if any, is returned first.  At end of
        input the empty string is returned, and the file is not read
        again afterwards.

        @rtype: string
        """

        if self.ungotten_char is None:
            if self.eof:
                c = ''
            else:
                c = self.file.read(1)
                if c == '':
                    self.eof = True
                elif c == '\n':
                    self.line_number += 1
        else:
            c = self.ungotten_char
            self.ungotten_char = None
        return c

    def where(self):
        """Return the current location in the input.

        @rtype: (string, int) tuple.  The first item is the filename of
        the input, the second is the current line number.
        """

        return (self.filename, self.line_number)

    def _unget_char(self, c):
        """Unget a character.

        The unget buffer for characters is only one character large; it is
        an error to try to unget a character when the unget buffer is not
        empty.

        @param c: the character to unget
        @type c: string
        @raises UngetBufferFull: there is already an ungotten char
        """

        if self.ungotten_char is not None:
            raise UngetBufferFull
        self.ungotten_char = c

    def skip_whitespace(self):
        """Consume input until a non-whitespace character is encountered.

        The non-whitespace character is then ungotten, and the number of
        whitespace characters consumed is returned.

        If the tokenizer is in multiline mode, then newlines are whitespace.

        @rtype: int
        """

        skipped = 0
        while True:
            c = self._get_char()
            if c != ' ' and c != '\t':
                if (c != '\n') or not self.multiline:
                    self._unget_char(c)
                    return skipped
            skipped += 1

    def get(self, want_leading=False, want_comment=False):
        """Get the next token.

        @param want_leading: If True, return a WHITESPACE token if the
        first character read is whitespace.  The default is False.
        @type want_leading: bool
        @param want_comment: If True, return a COMMENT token if the
        first token read is a comment.  The default is False.
        @type want_comment: bool
        @rtype: Token object
        @raises dns.exception.UnexpectedEnd: input ended prematurely
        @raises dns.exception.SyntaxError: input was badly formed
        """

        if self.ungotten_token is not None:
            token = self.ungotten_token
            self.ungotten_token = None
            # An ungotten whitespace or comment token is only handed back
            # if the caller asked for that kind of token; otherwise it is
            # dropped and the next token is read from the input.
            if token.is_whitespace():
                if want_leading:
                    return token
            elif token.is_comment():
                if want_comment:
                    return token
            else:
                return token
        skipped = self.skip_whitespace()
        if want_leading and skipped > 0:
            return Token(WHITESPACE, ' ')
        token = ''
        ttype = IDENTIFIER
        has_escape = False
        while True:
            c = self._get_char()
            if c == '' or c in self.delimiters:
                if c == '' and self.quoting:
                    raise dns.exception.UnexpectedEnd
                if token == '' and ttype != QUOTED_STRING:
                    if c == '(':
                        self.multiline += 1
                        self.skip_whitespace()
                        continue
                    elif c == ')':
                        if self.multiline <= 0:
                            raise dns.exception.SyntaxError
                        self.multiline -= 1
                        self.skip_whitespace()
                        continue
                    elif c == '"':
                        if not self.quoting:
                            # Opening quote: until the closing quote,
                            # only '"' acts as a delimiter.
                            self.quoting = True
                            self.delimiters = _QUOTING_DELIMITERS
                            ttype = QUOTED_STRING
                            continue
                        else:
                            # Closing quote, left ungotten by the call
                            # that returned the quoted string.
                            self.quoting = False
                            self.delimiters = _DELIMITERS
                            self.skip_whitespace()
                            continue
                    elif c == '\n':
                        return Token(EOL, '\n')
                    elif c == ';':
                        # Collect the comment text up to, but not
                        # including, the end of the line or input.
                        while True:
                            c = self._get_char()
                            if c == '\n' or c == '':
                                break
                            token += c
                        if want_comment:
                            # Put the terminator back so the next call
                            # reports the EOL/EOF itself.
                            self._unget_char(c)
                            return Token(COMMENT, token)
                        elif c == '':
                            if self.multiline:
                                raise dns.exception.SyntaxError('unbalanced parentheses')
                            return Token(EOF)
                        elif self.multiline:
                            # Inside parentheses the newline ending the
                            # comment is just whitespace; keep reading.
                            self.skip_whitespace()
                            token = ''
                            continue
                        else:
                            return Token(EOL, '\n')
                    else:
                        # This code exists in case we ever want a
                        # delimiter to be returned.  It never produces
                        # a token currently.
                        token = c
                        ttype = DELIMITER
                else:
                    # The delimiter ends the token collected so far; leave
                    # it for the next call.
                    self._unget_char(c)
                break
            elif self.quoting:
                if c == '\\':
                    c = self._get_char()
                    if c == '':
                        raise dns.exception.UnexpectedEnd
                    if c.isdigit():
                        # \DDD escape: three decimal digits naming a
                        # character code.
                        c2 = self._get_char()
                        if c2 == '':
                            raise dns.exception.UnexpectedEnd
                        c3 = self._get_char()
                        # Check the character just read (c3).  Testing c
                        # here could never be true, so truncated input
                        # was reported as a syntax error rather than as
                        # an unexpected end of input.
                        if c3 == '':
                            raise dns.exception.UnexpectedEnd
                        if not (c2.isdigit() and c3.isdigit()):
                            raise dns.exception.SyntaxError
                        c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
                elif c == '\n':
                    raise dns.exception.SyntaxError('newline in quoted string')
            elif c == '\\':
                #
                # It's an escape.  Put it and the next character into
                # the token; it will be checked later for goodness.
                #
                token += c
                has_escape = True
                c = self._get_char()
                if c == '' or c == '\n':
                    raise dns.exception.UnexpectedEnd
            token += c
        if token == '' and ttype != QUOTED_STRING:
            if self.multiline:
                raise dns.exception.SyntaxError('unbalanced parentheses')
            ttype = EOF
        return Token(ttype, token, has_escape)

    def unget(self, token):
        """Unget a token.

        The unget buffer for tokens is only one token large; it is
        an error to try to unget a token when the unget buffer is not
        empty.

        @param token: the token to unget
        @type token: Token object
        @raises UngetBufferFull: there is already an ungotten token
        """

        if self.ungotten_token is not None:
            raise UngetBufferFull
        self.ungotten_token = token

    def __next__(self):
        """Return the next item in an iteration.

        Iteration stops when an EOF token is read.

        @rtype: Token object
        """

        token = self.get()
        if token.is_eof():
            raise StopIteration
        return token

    def __iter__(self):
        return self

    # Helpers

    def get_int(self):
        """Read the next token and interpret it as an integer.

        @raises dns.exception.SyntaxError: the token is not an identifier
        made up only of digits
        @rtype: int
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        if not token.value.isdigit():
            raise dns.exception.SyntaxError('expecting an integer')
        return int(token.value)

    def get_uint8(self):
        """Read the next token and interpret it as an 8-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer
        in the range 0..255
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 255:
            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
        return value

    def get_uint16(self):
        """Read the next token and interpret it as a 16-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer
        in the range 0..65535
        @rtype: int
        """

        value = self.get_int()
        if value < 0 or value > 65535:
            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
        return value

    def get_uint32(self):
        """Read the next token and interpret it as a 32-bit unsigned
        integer.

        @raises dns.exception.SyntaxError: the token is not an integer
        in the range 0..4294967295
        @rtype: int
        """

        value = self.get_int()
        # The largest unsigned 32-bit value is 2**32 - 1; 4294967296 itself
        # does not fit and must be rejected.
        if value < 0 or value > 4294967295:
            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
        return value

    def get_string(self, origin=None):
        """Read the next token and interpret it as a string.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError: the token is neither an
        identifier nor a quoted string
        @rtype: string
        """

        token = self.get().unescape()
        if not (token.is_identifier() or token.is_quoted_string()):
            raise dns.exception.SyntaxError('expecting a string')
        return token.value

    def get_identifier(self, origin=None):
        """Read the next token and raise an exception if it is not an identifier.

        @param origin: not used by this method
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: string
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return token.value

    def get_name(self, origin=None):
        """Read the next token and interpret it as a DNS name.

        The token text is passed to dns.name.from_text without being
        unescaped first.

        @param origin: passed through to dns.name.from_text
        @raises dns.exception.SyntaxError: the token is not an identifier
        @rtype: dns.name.Name object"""

        token = self.get()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.name.from_text(token.value, origin)

    def get_eol(self):
        """Read the next token and raise an exception if it isn't EOL or
        EOF.

        @raises dns.exception.SyntaxError: the token is neither EOL nor EOF
        @return: the value of the EOL or EOF token
        @rtype: string
        """

        token = self.get()
        if not token.is_eol_or_eof():
            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
        return token.value

    def get_ttl(self):
        """Read the next token and interpret it as a TTL.

        @raises dns.exception.SyntaxError: the token is not an identifier
        @return: the result of dns.ttl.from_text applied to the token text
        """

        token = self.get().unescape()
        if not token.is_identifier():
            raise dns.exception.SyntaxError('expecting an identifier')
        return dns.ttl.from_text(token.value)
540