1
2
3
4
5
6
7
8
9
10 """
11 Basic data classes for representing context free grammars. A
12 X{grammar} specifies which trees can represent the structure of a
13 given text. Each of these trees is called a X{parse tree} for the
14 text (or simply a X{parse}). In a X{context free} grammar, the set of
15 parse trees for any piece of a text can depend only on that piece, and
16 not on the rest of the text (i.e., the piece's context). Context free
17 grammars are often used to find possible syntactic structures for
18 sentences. In this context, the leaves of a parse tree are word
19 tokens; and the node values are phrasal categories, such as C{NP}
20 and C{VP}.
21
22 The L{Grammar} class is used to encode context free grammars. Each C{Grammar}
23 consists of a start symbol and a set of productions. The X{start
24 symbol} specifies the root node value for parse trees. For example,
25 the start symbol for syntactic parsing is usually C{S}. Start
26 symbols are encoded using the C{Nonterminal} class, which is discussed
27 below.
28
29 A Grammar's X{productions} specify what parent-child relationships a parse
30 tree can contain. Each production specifies that a particular
31 node can be the parent of a particular set of children. For example,
32 the production C{<S> -> <NP> <VP>} specifies that an C{S} node can
33 be the parent of an C{NP} node and a C{VP} node.
34
35 Grammar productions are implemented by the C{Production} class.
36 Each C{Production} consists of a left hand side and a right hand
37 side. The X{left hand side} is a C{Nonterminal} that specifies the
38 node type for a potential parent; and the X{right hand side} is a list
39 that specifies allowable children for that parent. This lists
40 consists of C{Nonterminals} and text types: each C{Nonterminal}
41 indicates that the corresponding child may be a C{TreeToken} with the
42 specified node type; and each text type indicates that the
43 corresponding child may be a C{Token} with the with that type.
44
45 The C{Nonterminal} class is used to distinguish node values from leaf
46 values. This prevents the grammar from accidentally using a leaf
47 value (such as the English word "A") as the node of a subtree. Within
48 a C{Grammar}, all node values are wrapped in the C{Nonterminal} class.
49 Note, however, that the trees that are specified by the grammar do
50 B{not} include these C{Nonterminal} wrappers.
51
52 Grammars can also be given a more procedural interpretation. According to
53 this interpretation, a Grammar specifies any tree structure M{tree} that
54 can be produced by the following procedure:
55
56 - Set M{tree} to the start symbol
57 - Repeat until M{tree} contains no more nonterminal leaves:
58 - Choose a production M{prod} with whose left hand side
59 M{lhs} is a nonterminal leaf of M{tree}.
60 - Replace the nonterminal leaf with a subtree, whose node
61 value is the value wrapped by the nonterminal M{lhs}, and
62 whose children are the right hand side of M{prod}.
63
64 The operation of replacing the left hand side (M{lhs}) of a production
65 with the right hand side (M{rhs}) in a tree (M{tree}) is known as
66 X{expanding} M{lhs} to M{rhs} in M{tree}.
67 """
68
69 import re
70
71
72
73
74
75
77 """
78 A non-terminal symbol for a context free grammar. C{Nonterminal}
79 is a wrapper class for node values; it is used by
80 C{Production}s to distinguish node values from leaf values.
81 The node value that is wrapped by a C{Nonterminal} is known as its
82 X{symbol}. Symbols are typically strings representing phrasal
83 categories (such as C{"NP"} or C{"VP"}). However, more complex
84 symbol types are sometimes used (e.g., for lexicalized grammars).
85 Since symbols are node values, they must be immutable and
86 hashable. Two C{Nonterminal}s are considered equal if their
87 symbols are equal.
88
89 @see: L{Grammar}
90 @see: L{Production}
91 @type _symbol: (any)
92 @ivar _symbol: The node value corresponding to this
93 C{Nonterminal}. This value must be immutable and hashable.
94 """
96 """
97 Construct a new non-terminal from the given symbol.
98
99 @type symbol: (any)
100 @param symbol: The node value corresponding to this
101 C{Nonterminal}. This value must be immutable and
102 hashable.
103 """
104 self._symbol = symbol
105 self._hash = hash(symbol)
106
108 """
109 @return: The node value corresponding to this C{Nonterminal}.
110 @rtype: (any)
111 """
112 return self._symbol
113
115 """
116 @return: True if this non-terminal is equal to C{other}. In
117 particular, return true iff C{other} is a C{Nonterminal}
118 and this non-terminal's symbol is equal to C{other}'s
119 symbol.
120 @rtype: C{boolean}
121 """
122 try:
123 return ((self._symbol == other._symbol) \
124 and isinstance(other, self.__class__))
125 except AttributeError:
126 return False
127
129 """
130 @return: True if this non-terminal is not equal to C{other}. In
131 particular, return true iff C{other} is not a C{Nonterminal}
132 or this non-terminal's symbol is not equal to C{other}'s
133 symbol.
134 @rtype: C{boolean}
135 """
136 return not (self==other)
137
139 if self == other: return 0
140 else: return -1
141
144
146 """
147 @return: A string representation for this C{Nonterminal}.
148 The string representation for a C{Nonterminal} whose
149 symbol is C{M{s}} is C{<M{s}>}.
150 @rtype: C{string}
151 """
152
153 return '<%s>' % (self._symbol,)
154
156 """
157 @return: A string representation for this C{Nonterminal}.
158 The string representation for a C{Nonterminal} whose
159 symbol is C{M{s}} is C{M{s}}.
160 @rtype: C{string}
161 """
162 return '%s' % (self._symbol,)
163
165 """
166 @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where
167 C{M{A}} is the symbol for this nonterminal, and C{M{B}}
168 is the symbol for rhs.
169 @rtype: L{Nonterminal}
170 @param rhs: The nonterminal used to form the right hand side
171 of the new nonterminal.
172 @type rhs: L{Nonterminal}
173 """
174 return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
175
177 """
178 Given a string containing a list of symbol names, return a list of
179 C{Nonterminals} constructed from those symbols.
180
181 @param symbols: The symbol name string. This string can be
182 delimited by either spaces or commas.
183 @type symbols: C{string}
184 @return: A list of C{Nonterminals} constructed from the symbol
185 names given in C{symbols}. The C{Nonterminals} are sorted
186 in the same order as the symbols names.
187 @rtype: C{list} of L{Nonterminal}
188 """
189 if ',' in symbols: symbol_list = symbols.split(',')
190 else: symbol_list = symbols.split()
191 return [Nonterminal(s.strip()) for s in symbol_list]
192
193
194
195
196
198 """
199 A context-free grammar production. Each production
200 expands a single C{Nonterminal} (the X{left-hand side}) to a
201 sequence of terminals and C{Nonterminals} (the X{right-hand
202 side}). X{terminals} can be any immutable hashable object that is
203 not a C{Nonterminal}. Typically, terminals are strings
204 representing word types, such as C{"dog"} or C{"under"}.
205
206 Abstractly, a Grammar production indicates that the right-hand side is
207 a possible X{instantiation} of the left-hand side. Grammar
208 productions are X{context-free}, in the sense that this
209 instantiation should not depend on the context of the left-hand
210 side or of the right-hand side.
211
212 @see: L{Grammar}
213 @see: L{Nonterminal}
214 @type _lhs: L{Nonterminal}
215 @ivar _lhs: The left-hand side of the production.
216 @type _rhs: C{tuple} of (C{Nonterminal} and (terminal))
217 @ivar _rhs: The right-hand side of the production.
218 """
219
221 """
222 Construct a new C{Production}.
223
224 @param lhs: The left-hand side of the new C{Production}.
225 @type lhs: L{Nonterminal}
226 @param rhs: The right-hand side of the new C{Production}.
227 @type rhs: sequence of (C{Nonterminal} and (terminal))
228 """
229 if isinstance(rhs, (str, unicode)):
230 raise TypeError, 'production right hand side should be a list, not a string'
231 self._lhs = lhs
232 self._rhs = tuple(rhs)
233 self._hash = hash((self._lhs, self._rhs))
234
236 """
237 @return: the left-hand side of this C{Production}.
238 @rtype: L{Nonterminal}
239 """
240 return self._lhs
241
243 """
244 @return: the right-hand side of this C{Production}.
245 @rtype: sequence of (C{Nonterminal} and (terminal))
246 """
247 return self._rhs
248
250 """
251 @return: A verbose string representation of the
252 C{Production}.
253 @rtype: C{string}
254 """
255 str = '%s ->' % (self._lhs.symbol(),)
256 for elt in self._rhs:
257 if isinstance(elt, Nonterminal):
258 str += ' %s' % (elt.symbol(),)
259 else:
260 str += ' %r' % (elt,)
261 return str
262
264 """
265 @return: A concise string representation of the
266 C{Production}.
267 @rtype: C{string}
268 """
269 return '%s' % self
270
272 """
273 @return: true if this C{Production} is equal to C{other}.
274 @rtype: C{boolean}
275 """
276 return (isinstance(other, self.__class__) and
277 self._lhs == other._lhs and
278 self._rhs == other._rhs)
279
281 return not (self == other)
282
284 if not isinstance(other, self.__class__): return -1
285 return cmp((self._lhs, self._rhs), (other._lhs, other._rhs))
286
288 """
289 @return: A hash value for the C{Production}.
290 @rtype: C{int}
291 """
292 return self._hash
293
294
296 """
297 A context-free grammar. A Grammar consists of a start state and a set
298 of productions. The set of terminals and nonterminals is
299 implicitly specified by the productions.
300
301 If you need efficient key-based access to productions, you
302 can use a subclass to implement it.
303 """
304 - def __init__(self, start, productions):
305 """
306 Create a new context-free grammar, from the given start state
307 and set of C{Production}s.
308
309 @param start: The start symbol
310 @type start: L{Nonterminal}
311 @param productions: The list of productions that defines the grammar
312 @type productions: C{list} of L{Production}
313 """
314 self._start = start
315 self._productions = productions
316 self._lhs_index = {}
317 self._rhs_index = {}
318 for prod in self._productions:
319 if prod._lhs not in self._lhs_index:
320 self._lhs_index[prod._lhs] = []
321 if prod._rhs and prod._rhs[0] not in self._rhs_index:
322 self._rhs_index[prod._rhs[0]] = []
323 self._lhs_index[prod._lhs].append(prod)
324 if prod._rhs:
325 self._rhs_index[prod._rhs[0]].append(prod)
326
329
330
331
333
334 if not lhs and not rhs:
335 return self._productions
336
337
338 elif lhs and not rhs:
339 if lhs in self._lhs_index:
340 return self._lhs_index[lhs]
341 else:
342 return []
343
344
345 elif rhs and not lhs:
346 if rhs in self._rhs_index:
347 return self._rhs_index[rhs]
348 else:
349 return []
350
351
352 else:
353 if lhs in self._lhs_index:
354 return [prod for prod in self._lhs_index[lhs]
355 if prod in self._rhs_index[rhs]]
356 else:
357 return []
358
360 return '<Grammar with %d productions>' % len(self._productions)
361
363 str = 'Grammar with %d productions' % len(self._productions)
364 str += ' (start state = %s)' % self._start
365 for production in self._productions:
366 str += '\n %s' % production
367 return str
368
369
370
371
372
373 _PARSE_RE = re.compile(r'''^\s* # leading whitespace
374 (\w+(?:/\w+)?)\s* # lhs
375 (?:[-=]+>)\s* # arrow
376 (?:( # rhs:
377 "[^"]+" # doubled-quoted terminal
378 | '[^']+' # single-quoted terminal
379 | \w+(?:/\w+)? # non-terminal
380 | \| # disjunction
381 )
382 \s*) # trailing space
383 *$''',
384 re.VERBOSE)
385 _SPLIT_RE = re.compile(r'''(\w+(?:/\w+)?|[-=]+>|"[^"]+"|'[^']+'|\|)''')
386
388 """
389 Returns a list of productions
390 """
391
392 if not _PARSE_RE.match(s):
393 raise ValueError, 'Bad production string'
394
395 pieces = _SPLIT_RE.split(s)
396 pieces = [p for i,p in enumerate(pieces) if i%2==1]
397 lhside = Nonterminal(pieces[0])
398 rhsides = [[]]
399 found_terminal = found_non_terminal = False
400 for piece in pieces[2:]:
401 if piece == '|':
402 rhsides.append([])
403 found_terminal = found_non_terminal = False
404 elif piece[0] in ('"', "'"):
405 rhsides[-1].append(piece[1:-1])
406 found_terminal = True
407 else:
408 rhsides[-1].append(Nonterminal(piece))
409 found_non_terminal = True
410 if found_terminal and found_non_terminal:
411 raise ValueError, 'Bad right-hand-side: do not mix terminals and non-terminals'
412 return [Production(lhside, rhside) for rhside in rhsides]
413
426
427
428
429
430
432 """
433 A demonstration showing how C{Grammar}s can be created and used.
434 """
435
436 from nltk_lite.parse import cfg
437
438
439 S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
440 N, V, P, Det = cfg.nonterminals('N, V, P, Det')
441 VP_slash_NP = VP/NP
442
443 print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP]
444 print ' S.symbol() =>', `S.symbol()`
445 print
446
447 print cfg.Production(S, [NP])
448
449
450 grammar = cfg.parse_cfg("""
451 S -> NP VP
452 PP -> P NP
453 NP -> Det N | NP PP
454 VP -> V NP | VP PP
455 Det -> 'a' | 'the'
456 N -> 'dog' | 'cat'
457 V -> 'chased' | 'sat'
458 P -> 'on' | 'in'
459 """)
460
461 print 'A Grammar:', `grammar`
462 print ' grammar.start() =>', `grammar.start()`
463 print ' grammar.productions() =>',
464
465 print `grammar.productions()`.replace(',', ',\n'+' '*25)
466 print
467
468 if __name__ == '__main__': demo()
469