Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package semantics :: Module cfg
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.semantics.cfg

  1  # Natural Language Toolkit: Context Free Grammars 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
  5  #         Edward Loper <edloper@ldc.upenn.edu> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8  # 
  9   
 10  """ 
 11  Basic data classes for representing context free grammars.  A 
 12  X{grammar} specifies which trees can represent the structure of a 
 13  given text.  Each of these trees is called a X{parse tree} for the 
 14  text (or simply a X{parse}).  In a X{context free} grammar, the set of 
 15  parse trees for any piece of a text can depend only on that piece, and 
 16  not on the rest of the text (i.e., the piece's context).  Context free 
 17  grammars are often used to find possible syntactic structures for 
 18  sentences.  In this context, the leaves of a parse tree are word 
 19  tokens; and the node values are phrasal categories, such as C{NP} 
 20  and C{VP}. 
 21   
 22  The L{Grammar} class is used to encode context free grammars.  Each C{Grammar} 
 23  consists of a start symbol and a set of productions.  The X{start 
 24  symbol} specifies the root node value for parse trees.  For example, 
 25  the start symbol for syntactic parsing is usually C{S}.  Start 
 26  symbols are encoded using the C{Nonterminal} class, which is discussed 
 27  below. 
 28   
 29  A Grammar's X{productions} specify what parent-child relationships a parse 
 30  tree can contain.  Each production specifies that a particular 
 31  node can be the parent of a particular set of children.  For example, 
 32  the production C{<S> -> <NP> <VP>} specifies that an C{S} node can 
 33  be the parent of an C{NP} node and a C{VP} node. 
 34   
 35  Grammar productions are implemented by the C{Production} class. 
 36  Each C{Production} consists of a left hand side and a right hand 
 37  side.  The X{left hand side} is a C{Nonterminal} that specifies the 
 38  node type for a potential parent; and the X{right hand side} is a list 
 39  that specifies allowable children for that parent.  This lists 
 40  consists of C{Nonterminals} and text types: each C{Nonterminal} 
 41  indicates that the corresponding child may be a C{TreeToken} with the 
 42  specified node type; and each text type indicates that the 
 43  corresponding child may be a C{Token} with the with that type. 
 44   
 45  The C{Nonterminal} class is used to distinguish node values from leaf 
 46  values.  This prevents the grammar from accidentally using a leaf 
 47  value (such as the English word "A") as the node of a subtree.  Within 
 48  a C{Grammar}, all node values are wrapped in the C{Nonterminal} class. 
 49  Note, however, that the trees that are specified by the grammar do 
 50  B{not} include these C{Nonterminal} wrappers. 
 51   
 52  Grammars can also be given a more procedural interpretation.  According to 
 53  this interpretation, a Grammar specifies any tree structure M{tree} that 
 54  can be produced by the following procedure: 
 55   
 56      - Set M{tree} to the start symbol 
 57      - Repeat until M{tree} contains no more nonterminal leaves: 
 58        - Choose a production M{prod} with whose left hand side 
 59          M{lhs} is a nonterminal leaf of M{tree}. 
 60        - Replace the nonterminal leaf with a subtree, whose node 
 61          value is the value wrapped by the nonterminal M{lhs}, and 
 62          whose children are the right hand side of M{prod}. 
 63   
 64  The operation of replacing the left hand side (M{lhs}) of a production 
 65  with the right hand side (M{rhs}) in a tree (M{tree}) is known as 
 66  X{expanding} M{lhs} to M{rhs} in M{tree}. 
 67  """ 
 68   
 69  import re 
 70   
 71   
 72  ################################################################# 
 73  # Nonterminal 
 74  ################################################################# 
 75   
76 -class Nonterminal(object):
77 """ 78 A non-terminal symbol for a context free grammar. C{Nonterminal} 79 is a wrapper class for node values; it is used by 80 C{Production}s to distinguish node values from leaf values. 81 The node value that is wrapped by a C{Nonterminal} is known as its 82 X{symbol}. Symbols are typically strings representing phrasal 83 categories (such as C{"NP"} or C{"VP"}). However, more complex 84 symbol types are sometimes used (e.g., for lexicalized grammars). 85 Since symbols are node values, they must be immutable and 86 hashable. Two C{Nonterminal}s are considered equal if their 87 symbols are equal. 88 89 @see: L{Grammar} 90 @see: L{Production} 91 @type _symbol: (any) 92 @ivar _symbol: The node value corresponding to this 93 C{Nonterminal}. This value must be immutable and hashable. 94 """
95 - def __init__(self, symbol):
96 """ 97 Construct a new non-terminal from the given symbol. 98 99 @type symbol: (any) 100 @param symbol: The node value corresponding to this 101 C{Nonterminal}. This value must be immutable and 102 hashable. 103 """ 104 self._symbol = symbol 105 self._hash = hash(symbol)
106
107 - def symbol(self):
108 """ 109 @return: The node value corresponding to this C{Nonterminal}. 110 @rtype: (any) 111 """ 112 return self._symbol
113
114 - def __eq__(self, other):
115 """ 116 @return: True if this non-terminal is equal to C{other}. In 117 particular, return true iff C{other} is a C{Nonterminal} 118 and this non-terminal's symbol is equal to C{other}'s 119 symbol. 120 @rtype: C{boolean} 121 """ 122 try: 123 return ((self._symbol == other._symbol) \ 124 and isinstance(other, self.__class__)) 125 except AttributeError: 126 return False
127
128 - def __ne__(self, other):
129 """ 130 @return: True if this non-terminal is not equal to C{other}. In 131 particular, return true iff C{other} is not a C{Nonterminal} 132 or this non-terminal's symbol is not equal to C{other}'s 133 symbol. 134 @rtype: C{boolean} 135 """ 136 return not (self==other)
137
138 - def __cmp__(self, other):
139 if self == other: return 0 140 else: return -1
141
142 - def __hash__(self):
143 return self._hash
144
145 - def __repr__(self):
146 """ 147 @return: A string representation for this C{Nonterminal}. 148 The string representation for a C{Nonterminal} whose 149 symbol is C{M{s}} is C{<M{s}>}. 150 @rtype: C{string} 151 """ 152 # [XX] not a good repr! Token uses this now!! 153 return '<%s>' % (self._symbol,)
154
155 - def __str__(self):
156 """ 157 @return: A string representation for this C{Nonterminal}. 158 The string representation for a C{Nonterminal} whose 159 symbol is C{M{s}} is C{M{s}}. 160 @rtype: C{string} 161 """ 162 return '%s' % (self._symbol,)
163
164 - def __div__(self, rhs):
165 """ 166 @return: A new nonterminal whose symbol is C{M{A}/M{B}}, where 167 C{M{A}} is the symbol for this nonterminal, and C{M{B}} 168 is the symbol for rhs. 169 @rtype: L{Nonterminal} 170 @param rhs: The nonterminal used to form the right hand side 171 of the new nonterminal. 172 @type rhs: L{Nonterminal} 173 """ 174 return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
175
176 -def nonterminals(symbols):
177 """ 178 Given a string containing a list of symbol names, return a list of 179 C{Nonterminals} constructed from those symbols. 180 181 @param symbols: The symbol name string. This string can be 182 delimited by either spaces or commas. 183 @type symbols: C{string} 184 @return: A list of C{Nonterminals} constructed from the symbol 185 names given in C{symbols}. The C{Nonterminals} are sorted 186 in the same order as the symbols names. 187 @rtype: C{list} of L{Nonterminal} 188 """ 189 if ',' in symbols: symbol_list = symbols.split(',') 190 else: symbol_list = symbols.split() 191 return [Nonterminal(s.strip()) for s in symbol_list]
192 193 ################################################################# 194 # Production and Grammar 195 ################################################################# 196
197 -class Production(object):
198 """ 199 A context-free grammar production. Each production 200 expands a single C{Nonterminal} (the X{left-hand side}) to a 201 sequence of terminals and C{Nonterminals} (the X{right-hand 202 side}). X{terminals} can be any immutable hashable object that is 203 not a C{Nonterminal}. Typically, terminals are strings 204 representing word types, such as C{"dog"} or C{"under"}. 205 206 Abstractly, a Grammar production indicates that the right-hand side is 207 a possible X{instantiation} of the left-hand side. Grammar 208 productions are X{context-free}, in the sense that this 209 instantiation should not depend on the context of the left-hand 210 side or of the right-hand side. 211 212 @see: L{Grammar} 213 @see: L{Nonterminal} 214 @type _lhs: L{Nonterminal} 215 @ivar _lhs: The left-hand side of the production. 216 @type _rhs: C{tuple} of (C{Nonterminal} and (terminal)) 217 @ivar _rhs: The right-hand side of the production. 218 """ 219
220 - def __init__(self, lhs, rhs):
221 """ 222 Construct a new C{Production}. 223 224 @param lhs: The left-hand side of the new C{Production}. 225 @type lhs: L{Nonterminal} 226 @param rhs: The right-hand side of the new C{Production}. 227 @type rhs: sequence of (C{Nonterminal} and (terminal)) 228 """ 229 if isinstance(rhs, (str, unicode)): 230 raise TypeError, 'production right hand side should be a list, not a string' 231 self._lhs = lhs 232 self._rhs = tuple(rhs) 233 self._hash = hash((self._lhs, self._rhs))
234
235 - def lhs(self):
236 """ 237 @return: the left-hand side of this C{Production}. 238 @rtype: L{Nonterminal} 239 """ 240 return self._lhs
241
242 - def rhs(self):
243 """ 244 @return: the right-hand side of this C{Production}. 245 @rtype: sequence of (C{Nonterminal} and (terminal)) 246 """ 247 return self._rhs
248
249 - def __str__(self):
250 """ 251 @return: A verbose string representation of the 252 C{Production}. 253 @rtype: C{string} 254 """ 255 str = '%s ->' % (self._lhs.symbol(),) 256 for elt in self._rhs: 257 if isinstance(elt, Nonterminal): 258 str += ' %s' % (elt.symbol(),) 259 else: 260 str += ' %r' % (elt,) 261 return str
262
263 - def __repr__(self):
264 """ 265 @return: A concise string representation of the 266 C{Production}. 267 @rtype: C{string} 268 """ 269 return '%s' % self
270
271 - def __eq__(self, other):
272 """ 273 @return: true if this C{Production} is equal to C{other}. 274 @rtype: C{boolean} 275 """ 276 return (isinstance(other, self.__class__) and 277 self._lhs == other._lhs and 278 self._rhs == other._rhs)
279
280 - def __ne__(self, other):
281 return not (self == other)
282
283 - def __cmp__(self, other):
284 if not isinstance(other, self.__class__): return -1 285 return cmp((self._lhs, self._rhs), (other._lhs, other._rhs))
286
287 - def __hash__(self):
288 """ 289 @return: A hash value for the C{Production}. 290 @rtype: C{int} 291 """ 292 return self._hash
293 294
295 -class Grammar(object):
296 """ 297 A context-free grammar. A Grammar consists of a start state and a set 298 of productions. The set of terminals and nonterminals is 299 implicitly specified by the productions. 300 301 If you need efficient key-based access to productions, you 302 can use a subclass to implement it. 303 """
304 - def __init__(self, start, productions):
305 """ 306 Create a new context-free grammar, from the given start state 307 and set of C{Production}s. 308 309 @param start: The start symbol 310 @type start: L{Nonterminal} 311 @param productions: The list of productions that defines the grammar 312 @type productions: C{list} of L{Production} 313 """ 314 self._start = start 315 self._productions = productions 316 self._lhs_index = {} 317 self._rhs_index = {} 318 for prod in self._productions: 319 if prod._lhs not in self._lhs_index: 320 self._lhs_index[prod._lhs] = [] 321 if prod._rhs and prod._rhs[0] not in self._rhs_index: 322 self._rhs_index[prod._rhs[0]] = [] 323 self._lhs_index[prod._lhs].append(prod) 324 if prod._rhs: 325 self._rhs_index[prod._rhs[0]].append(prod)
326
327 - def start(self):
328 return self._start
329 330 # tricky to balance readability and efficiency here! 331 # can't use set operations as they don't preserve ordering
332 - def productions(self, lhs=None, rhs=None):
333 # no constraints so return everything 334 if not lhs and not rhs: 335 return self._productions 336 337 # only lhs specified so look up its index 338 elif lhs and not rhs: 339 if lhs in self._lhs_index: 340 return self._lhs_index[lhs] 341 else: 342 return [] 343 344 # only rhs specified so look up its index 345 elif rhs and not lhs: 346 if rhs in self._rhs_index: 347 return self._rhs_index[rhs] 348 else: 349 return [] 350 351 # intersect 352 else: 353 if lhs in self._lhs_index: 354 return [prod for prod in self._lhs_index[lhs] 355 if prod in self._rhs_index[rhs]] 356 else: 357 return []
358
359 - def __repr__(self):
360 return '<Grammar with %d productions>' % len(self._productions)
361
362 - def __str__(self):
363 str = 'Grammar with %d productions' % len(self._productions) 364 str += ' (start state = %s)' % self._start 365 for production in self._productions: 366 str += '\n %s' % production 367 return str
368 369 _PARSE_RE = re.compile(r'''^(\w+)\s* # lhs 370 (?:-+>|=+>)\s* # arrow 371 (?:( # rhs: 372 "[^"]+" # doubled-quoted terminal 373 |'[^']+' # single-quoted terminal 374 |\w+| # non-terminal 375 \| # disjunction 376 ) 377 \s*) # trailing space 378 *$''', 379 re.VERBOSE) 380 _SPLIT_RE = re.compile(r'''(\w+|-+>|=+>|"[^"]+"|'[^']+'|\|)''') 381
382 -def parse_production(s):
383 """ 384 Returns a list of productions 385 """ 386 # Use _PARSE_RE to check that it's valid. 387 if not _PARSE_RE.match(s): 388 raise ValueError, 'Bad production string' 389 # Use _SPLIT_RE to process it. 390 pieces = _SPLIT_RE.split(s) 391 pieces = [p for i,p in enumerate(pieces) if i%2==1] 392 lhside = Nonterminal(pieces[0]) 393 rhsides = [[]] 394 for piece in pieces[2:]: 395 if piece == '|': 396 rhsides.append([]) # Vertical bar 397 elif piece[0] in ('"', "'"): 398 rhsides[-1].append(piece[1:-1]) # Terminal 399 else: 400 rhsides[-1].append(Nonterminal(piece)) # Nonterminal 401 return [Production(lhside, rhside) for rhside in rhsides]
402
403 -def parse_grammar(s):
404 productions = [] 405 for linenum, line in enumerate(s.split('\n')): 406 line = line.strip() 407 if line.startswith('#') or line=='': continue 408 try: productions += parse_production(line) 409 except ValueError: 410 raise ValueError, 'Unable to parse line %s' % linenum 411 if len(productions) == 0: 412 raise ValueError, 'No productions found!' 413 start = productions[0].lhs() 414 return Grammar(start, productions)
415 416 ################################################################# 417 # Demonstration 418 ################################################################# 419
420 -def demo():
421 """ 422 A demonstration showing how C{Grammar}s can be created and used. 423 """ 424 425 from nltk_lite.parse import cfg 426 427 # Create some nonterminals 428 S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP') 429 N, V, P, Det = cfg.nonterminals('N, V, P, Det') 430 VP_slash_NP = VP/NP 431 432 print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP] 433 print ' S.symbol() =>', `S.symbol()` 434 print 435 436 print cfg.Production(S, [NP]) 437 438 # Create some Grammar Productions 439 grammar = cfg.parse_grammar(""" 440 S -> NP VP 441 PP -> P NP 442 NP -> Det N 443 NP -> NP PP 444 VP -> V NP 445 VP -> VP PP 446 Det -> 'a' 447 Det -> 'the' 448 N -> 'dog' 449 N -> 'cat' 450 V -> 'chased' 451 V -> 'sat' 452 P -> 'on' 453 P -> 'in' 454 """) 455 456 print 'A Grammar:', `grammar` 457 print ' grammar.start() =>', `grammar.start()` 458 print ' grammar.productions() =>', 459 # Use string.replace(...) is to line-wrap the output. 460 print `grammar.productions()`.replace(',', ',\n'+' '*25) 461 print
462 463 if __name__ == '__main__': demo() 464