Package nltk_lite :: Package parse :: Module tree
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.tree

  1  # Natural Language Toolkit: Text Trees 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> 
  6  # URL: <http://nltk.sf.net> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Class for representing hierarchical language structures, such as 
 11  syntax trees and morphological trees. 
 12  """ 
 13   
 14  import re, types, string 
 15  from nltk_lite import tokenize 
 16  from nltk_lite.probability import ProbabilisticMixIn 
 17  from nltk_lite.parse import * 
 18   
 19  ###################################################################### 
 20  ## Trees 
 21  ###################################################################### 
 22   
23 -class Tree(list):
24 """ 25 A hierarchical structure. 26 27 Each C{Tree} represents a single hierarchical grouping of 28 leaves and subtrees. For example, each constituent in a syntax 29 tree is represented by a single C{Tree}. 30 31 A tree's children are encoded as a C{list} of leaves and subtrees, 32 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a 33 nested C{Tree}. 34 35 Any other properties that a C{Tree} defines are known as 36 X{node properties}, and are used to add information about 37 individual hierarchical groupings. For example, syntax trees use a 38 NODE property to label syntactic constituents with phrase tags, 39 such as \"NP\" and\"VP\". 40 41 Several C{Tree} methods use X{tree positions} to specify 42 children or descendants of a tree. Tree positions are defined as 43 follows: 44 45 - The tree position M{i} specifies a C{Tree}'s M{i}th child. 46 - The tree position C{()} specifies the C{Tree} itself. 47 - If C{M{p}} is the tree position of descendant M{d}, then 48 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}. 49 50 I.e., every tree position is either a single index C{M{i}}, 51 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ..., 52 M{iN})}, specifying 53 C{self[M{i1}][M{i2}]...[M{iN}]}. 54 """
55 - def __init__(self, node, children):
56 """ 57 Construct a new tree. 58 """ 59 if isinstance(children, (str, unicode)): 60 raise TypeError, 'children should be a list, not a string' 61 list.__init__(self, children) 62 self.node = node
63 64 #//////////////////////////////////////////////////////////// 65 # Comparison operators 66 #//////////////////////////////////////////////////////////// 67
68 - def __cmp__(self, other):
69 c = cmp(self.node, other.node) 70 if c != 0: return c 71 else: return list.__cmp__(self, other)
72 - def __eq__(self, other):
73 if not isinstance(other, Tree): return False 74 return self.node == other.node and list.__eq__(self, other)
75 - def __ne__(self, other):
76 return not (self == other)
77 - def __lt__(self, other):
78 return cmp(self, other) < 0
79 - def __le__(self, other):
80 return cmp(self, other) <= 0
81 - def __gt__(self, other):
82 return cmp(self, other) > 0
83 - def __ge__(self, other):
84 return cmp(self, other) >= 0
85 86 #//////////////////////////////////////////////////////////// 87 # Disabled list operations 88 #//////////////////////////////////////////////////////////// 89
90 - def __mul__(self, v):
91 raise TypeError('Tree does not support multiplication')
92 - def __rmul__(self, v):
93 raise TypeError('Tree does not support multiplication')
94 - def __add__(self, v):
95 raise TypeError('Tree does not support addition')
96 - def __radd__(self, v):
97 raise TypeError('Tree does not support addition')
98 99 #//////////////////////////////////////////////////////////// 100 # Indexing (with support for tree positions) 101 #//////////////////////////////////////////////////////////// 102
103 - def __getitem__(self, index):
104 if isinstance(index, int): 105 return list.__getitem__(self, index) 106 else: 107 if len(index) == 0: 108 return self 109 elif len(index) == 1: 110 return self[int(index[0])] 111 else: 112 return self[int(index[0])][index[1:]]
113
114 - def __setitem__(self, index, value):
115 if isinstance(index, int): 116 return list.__setitem__(self, index, value) 117 else: 118 if len(index) == 0: 119 raise IndexError('The tree position () may not be ' 120 'assigned to.') 121 elif len(index) == 1: 122 self[index[0]] = value 123 else: 124 self[index[0]][index[1:]] = value
125
126 - def __delitem__(self, index):
127 if isinstance(index, int): 128 return list.__delitem__(self, index) 129 else: 130 if len(index) == 0: 131 raise IndexError('The tree position () may not be deleted.') 132 elif len(index) == 1: 133 del self[index[0]] 134 else: 135 del self[index[0]][index[1:]]
136 137 #//////////////////////////////////////////////////////////// 138 # Basic tree operations 139 #//////////////////////////////////////////////////////////// 140
141 - def leaves(self):
142 """ 143 @return: a list containing this tree's leaves. The 144 order of leaves in the tuple reflects the order of the 145 leaves in the tree's hierarchical structure. 146 @rtype: C{list} 147 """ 148 leaves = [] 149 for child in self: 150 if isinstance(child, Tree): 151 leaves.extend(child.leaves()) 152 else: 153 leaves.append(child) 154 return leaves
155
156 - def flatten(self):
157 """ 158 @return: a tree consisting of this tree's root connected directly to 159 its leaves, omitting all intervening non-terminal nodes. 160 @rtype: C{Tree} 161 """ 162 return Tree(self.node, self.leaves())
163
164 - def height(self):
165 """ 166 @return: The height of this tree. The height of a tree 167 containing no children is 1; the height of a tree 168 containing only leaves is 2; and the height of any other 169 tree is one plus the maximum of its children's 170 heights. 171 @rtype: C{int} 172 """ 173 max_child_height = 0 174 for child in self: 175 if isinstance(child, Tree): 176 max_child_height = max(max_child_height, child.height()) 177 else: 178 max_child_height = max(max_child_height, 1) 179 return 1 + max_child_height
180
181 - def treepositions(self, order='preorder'):
182 """ 183 @param order: One of: C{preorder}, C{postorder}, C{bothorder}, 184 C{leaves}. 185 """ 186 positions = [] 187 if order in ('preorder', 'bothorder'): positions.append( () ) 188 for i, child in enumerate(self): 189 if isinstance(child, Tree): 190 childpos = child.treepositions(order) 191 positions.extend((i,)+p for p in childpos) 192 else: 193 positions.append( (i,) ) 194 if order in ('postorder', 'bothorder'): positions.append( () ) 195 return positions
196
197 - def subtrees(self, filter=None):
198 """ 199 Generate all the subtrees of this tree, optionally restricted 200 to trees matching the filter function. 201 @type: filter: C{function} 202 @param: filter: the function to filter all local trees 203 """ 204 if not filter or filter(self): 205 yield self 206 for child in self: 207 if isinstance(child, Tree): 208 for subtree in child.subtrees(filter): 209 yield subtree
210
211 - def productions(self):
212 """ 213 Generate the productions that correspond to the non-terminal nodes of the tree. 214 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the 215 form P -> C1 C2 ... Cn. 216 217 @rtype: list of C{Production}s 218 """ 219 220 if not isinstance(self.node, str): 221 raise TypeError, 'Productions can only be generated from trees having node labels that are strings' 222 223 prods = [Production(Nonterminal(self.node), _child_names(self))] 224 for child in self: 225 if isinstance(child, Tree): 226 prods += child.productions() 227 return prods
228 229 #//////////////////////////////////////////////////////////// 230 # Convert, copy 231 #//////////////////////////////////////////////////////////// 232 233 # [classmethod]
234 - def convert(cls, val):
235 """ 236 Convert a tree between different subtypes of Tree. C{cls} determines 237 which class will be used to encode the new tree. 238 239 @type val: L{Tree} 240 @param val: The tree that should be converted. 241 @return: The new C{Tree}. 242 """ 243 if isinstance(val, Tree): 244 children = [cls.convert(child) for child in val] 245 return cls(val.node, children) 246 else: 247 return val
248 convert = classmethod(convert) 249
250 - def copy(self, deep=False):
251 if not deep: return self.__class__(self.node, self) 252 else: return self.__class__.convert(self)
253
254 - def _frozen_class(self): return ImmutableTree
255 - def freeze(self, leaf_freezer=None):
256 frozen_class = self._frozen_class() 257 if leaf_freezer is None: 258 newcopy = frozen_class.convert(self) 259 else: 260 newcopy = self.copy(deep=True) 261 for pos in newcopy.treepositions('leaves'): 262 newcopy[pos] = leaf_freezer(newcopy[pos]) 263 newcopy = frozen_class.convert(newcopy) 264 hash(newcopy) # Make sure the leaves are hashable. 265 return newcopy
266 267 #//////////////////////////////////////////////////////////// 268 # Visualization & String Representation 269 #//////////////////////////////////////////////////////////// 270
271 - def draw(self):
272 """ 273 Open a new window containing a graphical diagram of this tree. 274 """ 275 from nltk_lite.draw.tree import draw_trees 276 draw_trees(self)
277
278 - def __repr__(self):
279 childstr = string.join(repr(c) for c in self) 280 return '(%s: %s)' % (repr(self.node), childstr)
281
282 - def __str__(self):
283 return self.pp()
284
285 - def _ppflat(self, nodesep, parens, quotes):
286 childstrs = [] 287 for child in self: 288 if isinstance(child, Tree): 289 childstrs.append(child._ppflat(nodesep, parens, quotes)) 290 elif isinstance(child, str) and not quotes: 291 childstrs.append('%s' % child) 292 else: 293 childstrs.append('%s' % child.__repr__()) 294 return '%s%s%s %s%s' % (parens[0], self.node, nodesep, 295 string.join(childstrs), parens[1])
296
297 - def pp(self, margin=70, indent=0, nodesep=':', parens='()', quotes=True):
298 """ 299 @return: A pretty-printed string representation of this tree. 300 @rtype: C{string} 301 @param margin: The right margin at which to do line-wrapping. 302 @type margin: C{int} 303 @param indent: The indentation level at which printing 304 begins. This number is used to decide how far to indent 305 subsequent lines. 306 @type indent: C{int} 307 @param nodesep: A string that is used to separate the node 308 from the children. E.g., the default value C{':'} gives 309 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}. 310 """ 311 312 # Try writing it on one line. 313 s = self._ppflat(nodesep, parens, quotes) 314 if len(s)+indent < margin: 315 return s 316 317 # If it doesn't fit on one line, then write it on multi-lines. 318 s = '%s%s%s' % (parens[0], self.node, nodesep) 319 for child in self: 320 try: 321 s += '\n'+' '*(indent+2)+child.pp(margin, indent+2, 322 nodesep, parens, quotes) 323 except AttributeError: 324 s += '\n'+' '*(indent+2)+repr(child) 325 return s+parens[1]
326
327 - def pp_treebank(self, margin=70, indent=0):
328 return self.pp(margin, indent, nodesep='', quotes=False)
329
330 - def pp_latex_qtree(self):
331 r""" 332 Returns a representation of the tree compatible with the 333 LaTeX qtree package. This consists of the string C{\Tree} 334 followed by the parse tree represented in bracketed notation. 335 336 For example, the following result was generated from a parse tree of 337 the sentence C{The announcement astounded us}:: 338 339 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] 340 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] 341 342 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX 343 style file for the qtree package. 344 345 @return: A latex qtree representation of this tree. 346 @rtype: C{string} 347 """ 348 return r'\Tree ' + self.pp(indent=6, nodesep='', parens=('[.', ' ]'))
349
350 -class ImmutableTree(Tree):
351 - def __setitem__(self):
352 raise ValueError, 'ImmutableTrees may not be modified'
353 - def __setslice__(self):
354 raise ValueError, 'ImmutableTrees may not be modified'
355 - def __delitem__(self):
356 raise ValueError, 'ImmutableTrees may not be modified'
357 - def __delslice__(self):
358 raise ValueError, 'ImmutableTrees may not be modified'
359 - def __iadd__(self):
360 raise ValueError, 'ImmutableTrees may not be modified'
361 - def __imul__(self):
362 raise ValueError, 'ImmutableTrees may not be modified'
363 - def append(self, v):
364 raise ValueError, 'ImmutableTrees may not be modified'
365 - def extend(self, v):
366 raise ValueError, 'ImmutableTrees may not be modified'
367 - def pop(self, v=None):
368 raise ValueError, 'ImmutableTrees may not be modified'
369 - def remove(self, v):
370 raise ValueError, 'ImmutableTrees may not be modified'
371 - def reverse(self):
372 raise ValueError, 'ImmutableTrees may not be modified'
373 - def sort(self):
374 raise ValueError, 'ImmutableTrees may not be modified'
375 - def __hash__(self):
376 return hash( (self.node, tuple(self)) )
377 378 379 ###################################################################### 380 ## Probabilistic trees 381 ######################################################################
382 -class ProbabilisticTree(Tree, ProbabilisticMixIn):
383 - def __init__(self, node, children, **prob_kwargs):
384 ProbabilisticMixIn.__init__(self, **prob_kwargs) 385 Tree.__init__(self, node, children)
386 387 # We have to patch up these methods to make them work right:
389 - def __repr__(self):
390 return '%s (p=%s)' % (Tree.__repr__(self), self.prob())
391 - def __str__(self):
392 return '%s (p=%s)' % (self.pp(margin=60), self.prob())
393 - def __cmp__(self, other):
394 c = Tree.__cmp__(self, other) 395 if c != 0: return c 396 return cmp(self.prob(), other.prob())
397 - def __eq__(self, other):
398 if not isinstance(other, Tree): return False 399 return Tree.__eq__(self, other) and self.prob()==other.prob()
400 - def copy(self, deep=False):
401 if not deep: return self.__class__(self.node, self, prob=self.prob()) 402 else: return self.__class__.convert(self)
403 - def convert(cls, val):
404 if isinstance(val, Tree): 405 children = [cls.convert(child) for child in val] 406 if isinstance(val, ProbabilisticMixIn): 407 return cls(val.node, children, prob=val.prob()) 408 else: 409 return cls(val.node, children, prob=1.0) 410 else: 411 return val
412 convert = classmethod(convert)
413
414 -class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
415 - def __init__(self, node, children, **prob_kwargs):
416 ProbabilisticMixIn.__init__(self, **prob_kwargs) 417 ImmutableTree.__init__(self, node, children)
418 419 # We have to patch up these methods to make them work right:
421 - def __repr__(self):
422 return '%s [%s]' % (Tree.__repr__(self), self.prob())
423 - def __str__(self):
424 return '%s [%s]' % (self.pp(margin=60), self.prob())
425 - def __cmp__(self, other):
426 c = Tree.__cmp__(self, other) 427 if c != 0: return c 428 return cmp(self.prob(), other.prob())
429 - def __eq__(self, other):
430 if not isinstance(other, Tree): return False 431 return Tree.__eq__(self, other) and self.prob()==other.prob()
432 - def copy(self, deep=False):
433 if not deep: return self.__class__(self.node, self, prob=self.prob()) 434 else: return self.__class__.convert(self)
435 - def convert(cls, val):
436 if isinstance(val, Tree): 437 children = [cls.convert(child) for child in val] 438 if isinstance(val, ProbabilisticMixIn): 439 return cls(val.node, children, prob=val.prob()) 440 else: 441 return cls(val.node, children, prob=1) 442 else: 443 return val
444 convert = classmethod(convert)
445 446
447 -def _child_names(tree):
448 names = [] 449 for child in tree: 450 if isinstance(child, Tree): 451 names.append(Nonterminal(child.node)) 452 else: 453 names.append(child) 454 return names
455 456 ###################################################################### 457 ## Parsing 458 ###################################################################### 459
460 -def bracket_parse(s):
461 """ 462 Parse a treebank string and return a tree. Trees are represented 463 as nested brackettings, e.g. (S (NP (NNP John)) (VP (V runs))). 464 465 @return: A tree corresponding to the string representation. 466 @rtype: C{tree} 467 @param s: The string to be converted 468 @type s: C{string} 469 """ 470 471 SPACE = re.compile(r'\s*') 472 WORD = re.compile(r'\s*([^\s\(\)]*)\s*') 473 474 # Skip any initial whitespace. 475 pos = SPACE.match(s).end() 476 477 stack = [] 478 while pos < len(s): 479 # Beginning of a tree/subtree. 480 if s[pos] == '(': 481 match = WORD.match(s, pos+1) 482 stack.append(Tree(match.group(1), [])) 483 pos = match.end() 484 485 # End of a tree/subtree. 486 elif s[pos] == ')': 487 pos = SPACE.match(s, pos+1).end() 488 if len(stack) == 1: 489 if pos != len(s): raise ValueError 490 tree = stack[0] 491 # If the tree has an extra level with node='', then get 492 # rid of it. (E.g., "((S (NP ...) (VP ...)))") 493 if tree.node == '': 494 tree = tree[0] 495 return tree 496 stack[-2].append(stack[-1]) 497 stack.pop() 498 499 # Leaf token. 500 else: 501 match = WORD.match(s, pos) 502 leaf = match.group(1) 503 stack[-1].append(leaf) 504 pos = match.end() 505 506 raise ValueError, 'mismatched parens'
507 508
509 -def sinica_parse(s):
510 """ 511 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, 512 as shown in the following example (X represents a Chinese character): 513 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) 514 515 @return: A tree corresponding to the string representation. 516 @rtype: C{tree} 517 @param s: The string to be converted 518 @type s: C{string} 519 """ 520 tokens = re.split(r'([()| ])', s) 521 for i in range(len(tokens)): 522 if tokens[i] == '(': 523 tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens 524 elif ':' in tokens[i]: 525 fields = tokens[i].split(':') 526 if len(fields) == 2: # non-terminal 527 tokens[i] = fields[1] 528 else: 529 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" 530 elif tokens[i] == '|': 531 tokens[i] = '' 532 533 treebank_string = string.join(tokens) 534 return bracket_parse(treebank_string)
535 536 # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier 537 # s = re.sub(r'\w+:', '', s) # remove role tags 538 539 # return s 540 541 ###################################################################### 542 ## Demonstration 543 ###################################################################### 544
545 -def demo():
546 """ 547 A demonstration showing how C{Tree}s and C{Tree}s can be 548 used. This demonstration creates a C{Tree}, and loads a 549 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus, 550 and shows the results of calling several of their methods. 551 """ 552 553 from nltk_lite.parse import tree 554 555 # Demonstrate tree parsing. 556 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' 557 t = tree.bracket_parse(s) 558 print "Convert bracketed string into tree:" 559 print t 560 561 print "Display tree properties:" 562 print t.node # tree's constituent type 563 print t[0] # tree's first child 564 print t[1] # tree's second child 565 print t.height() 566 print t.leaves() 567 print t[1] 568 print t[1,1] 569 print t[1,1,0] 570 571 # Demonstrate tree modification. 572 the_cat = t[0] 573 the_cat.insert(1, tree.bracket_parse('(JJ big)')) 574 print "Tree modification:" 575 print t 576 t[1,1,1] = tree.bracket_parse('(NN cake)') 577 print t 578 print 579 580 # Demonstrate probabilistic trees. 581 582 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5) 583 print "Probabilistic Tree:" 584 print pt 585 print 586 587 # Demonstrate parsing of treebank output format. 588 t = tree.bracket_parse(t.pp_treebank())[0] 589 print "Convert tree to bracketed string and back again:" 590 print t.pp_treebank() 591 print t 592 print 593 594 # Demonstrate LaTeX output 595 print "LaTeX output:" 596 print t.pp_latex_qtree() 597 print 598 599 # Demonstrate Productions 600 print "Production output:" 601 print t.productions() 602 print 603 604 # Demonstrate tree nodes containing objects other than strings 605 t.node = ('test', 3) 606 print t
607 608 if __name__ == '__main__': 609 demo() 610