1
2
3
4
5
6
7
8
9 """
10 Class for representing hierarchical language structures, such as
11 syntax trees and morphological trees.
12 """
13
14 import re, types, string
15 from nltk_lite import tokenize
16 from nltk_lite.probability import ProbabilisticMixIn
17 from nltk_lite.parse import *
18
19
20
21
22
24 """
25 A hierarchical structure.
26
27 Each C{Tree} represents a single hierarchical grouping of
28 leaves and subtrees. For example, each constituent in a syntax
29 tree is represented by a single C{Tree}.
30
31 A tree's children are encoded as a C{list} of leaves and subtrees,
32 where a X{leaf} is a basic (non-tree) value; and a X{subtree} is a
33 nested C{Tree}.
34
35 Any other properties that a C{Tree} defines are known as
36 X{node properties}, and are used to add information about
37 individual hierarchical groupings. For example, syntax trees use a
38 NODE property to label syntactic constituents with phrase tags,
39 such as \"NP\" and\"VP\".
40
41 Several C{Tree} methods use X{tree positions} to specify
42 children or descendants of a tree. Tree positions are defined as
43 follows:
44
45 - The tree position M{i} specifies a C{Tree}'s M{i}th child.
46 - The tree position C{()} specifies the C{Tree} itself.
47 - If C{M{p}} is the tree position of descendant M{d}, then
48 C{M{p}+(M{i})} specifies the C{M{i}}th child of M{d}.
49
50 I.e., every tree position is either a single index C{M{i}},
51 specifying C{self[M{i}]}; or a sequence C{(M{i1}, M{i2}, ...,
52 M{iN})}, specifying
53 C{self[M{i1}][M{i2}]...[M{iN}]}.
54 """
63
64
65
66
67
69 c = cmp(self.node, other.node)
70 if c != 0: return c
71 else: return list.__cmp__(self, other)
73 if not isinstance(other, Tree): return False
74 return self.node == other.node and list.__eq__(self, other)
76 return not (self == other)
78 return cmp(self, other) < 0
80 return cmp(self, other) <= 0
82 return cmp(self, other) > 0
84 return cmp(self, other) >= 0
85
86
87
88
89
91 raise TypeError('Tree does not support multiplication')
93 raise TypeError('Tree does not support multiplication')
95 raise TypeError('Tree does not support addition')
97 raise TypeError('Tree does not support addition')
98
99
100
101
102
113
125
136
137
138
139
140
142 """
143 @return: a list containing this tree's leaves. The
144 order of leaves in the tuple reflects the order of the
145 leaves in the tree's hierarchical structure.
146 @rtype: C{list}
147 """
148 leaves = []
149 for child in self:
150 if isinstance(child, Tree):
151 leaves.extend(child.leaves())
152 else:
153 leaves.append(child)
154 return leaves
155
157 """
158 @return: a tree consisting of this tree's root connected directly to
159 its leaves, omitting all intervening non-terminal nodes.
160 @rtype: C{Tree}
161 """
162 return Tree(self.node, self.leaves())
163
165 """
166 @return: The height of this tree. The height of a tree
167 containing no children is 1; the height of a tree
168 containing only leaves is 2; and the height of any other
169 tree is one plus the maximum of its children's
170 heights.
171 @rtype: C{int}
172 """
173 max_child_height = 0
174 for child in self:
175 if isinstance(child, Tree):
176 max_child_height = max(max_child_height, child.height())
177 else:
178 max_child_height = max(max_child_height, 1)
179 return 1 + max_child_height
180
182 """
183 @param order: One of: C{preorder}, C{postorder}, C{bothorder},
184 C{leaves}.
185 """
186 positions = []
187 if order in ('preorder', 'bothorder'): positions.append( () )
188 for i, child in enumerate(self):
189 if isinstance(child, Tree):
190 childpos = child.treepositions(order)
191 positions.extend((i,)+p for p in childpos)
192 else:
193 positions.append( (i,) )
194 if order in ('postorder', 'bothorder'): positions.append( () )
195 return positions
196
198 """
199 Generate all the subtrees of this tree, optionally restricted
200 to trees matching the filter function.
201 @type: filter: C{function}
202 @param: filter: the function to filter all local trees
203 """
204 if not filter or filter(self):
205 yield self
206 for child in self:
207 if isinstance(child, Tree):
208 for subtree in child.subtrees(filter):
209 yield subtree
210
212 """
213 Generate the productions that correspond to the non-terminal nodes of the tree.
214 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
215 form P -> C1 C2 ... Cn.
216
217 @rtype: list of C{Production}s
218 """
219
220 if not isinstance(self.node, str):
221 raise TypeError, 'Productions can only be generated from trees having node labels that are strings'
222
223 prods = [Production(Nonterminal(self.node), _child_names(self))]
224 for child in self:
225 if isinstance(child, Tree):
226 prods += child.productions()
227 return prods
228
229
230
231
232
233
235 """
236 Convert a tree between different subtypes of Tree. C{cls} determines
237 which class will be used to encode the new tree.
238
239 @type val: L{Tree}
240 @param val: The tree that should be converted.
241 @return: The new C{Tree}.
242 """
243 if isinstance(val, Tree):
244 children = [cls.convert(child) for child in val]
245 return cls(val.node, children)
246 else:
247 return val
248 convert = classmethod(convert)
249
250 - def copy(self, deep=False):
251 if not deep: return self.__class__(self.node, self)
252 else: return self.__class__.convert(self)
253
255 - def freeze(self, leaf_freezer=None):
256 frozen_class = self._frozen_class()
257 if leaf_freezer is None:
258 newcopy = frozen_class.convert(self)
259 else:
260 newcopy = self.copy(deep=True)
261 for pos in newcopy.treepositions('leaves'):
262 newcopy[pos] = leaf_freezer(newcopy[pos])
263 newcopy = frozen_class.convert(newcopy)
264 hash(newcopy)
265 return newcopy
266
267
268
269
270
277
279 childstr = string.join(repr(c) for c in self)
280 return '(%s: %s)' % (repr(self.node), childstr)
281
284
285 - def _ppflat(self, nodesep, parens, quotes):
296
297 - def pp(self, margin=70, indent=0, nodesep=':', parens='()', quotes=True):
298 """
299 @return: A pretty-printed string representation of this tree.
300 @rtype: C{string}
301 @param margin: The right margin at which to do line-wrapping.
302 @type margin: C{int}
303 @param indent: The indentation level at which printing
304 begins. This number is used to decide how far to indent
305 subsequent lines.
306 @type indent: C{int}
307 @param nodesep: A string that is used to separate the node
308 from the children. E.g., the default value C{':'} gives
309 trees like C{(S: (NP: I) (VP: (V: saw) (NP: it)))}.
310 """
311
312
313 s = self._ppflat(nodesep, parens, quotes)
314 if len(s)+indent < margin:
315 return s
316
317
318 s = '%s%s%s' % (parens[0], self.node, nodesep)
319 for child in self:
320 try:
321 s += '\n'+' '*(indent+2)+child.pp(margin, indent+2,
322 nodesep, parens, quotes)
323 except AttributeError:
324 s += '\n'+' '*(indent+2)+repr(child)
325 return s+parens[1]
326
328 return self.pp(margin, indent, nodesep='', quotes=False)
329
331 r"""
332 Returns a representation of the tree compatible with the
333 LaTeX qtree package. This consists of the string C{\Tree}
334 followed by the parse tree represented in bracketed notation.
335
336 For example, the following result was generated from a parse tree of
337 the sentence C{The announcement astounded us}::
338
339 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
340 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
341
342 See U{http://www.ling.upenn.edu/advice/latex.html} for the LaTeX
343 style file for the qtree package.
344
345 @return: A latex qtree representation of this tree.
346 @rtype: C{string}
347 """
348 return r'\Tree ' + self.pp(indent=6, nodesep='', parens=('[.', ' ]'))
349
352 raise ValueError, 'ImmutableTrees may not be modified'
354 raise ValueError, 'ImmutableTrees may not be modified'
356 raise ValueError, 'ImmutableTrees may not be modified'
358 raise ValueError, 'ImmutableTrees may not be modified'
360 raise ValueError, 'ImmutableTrees may not be modified'
362 raise ValueError, 'ImmutableTrees may not be modified'
364 raise ValueError, 'ImmutableTrees may not be modified'
366 raise ValueError, 'ImmutableTrees may not be modified'
367 - def pop(self, v=None):
368 raise ValueError, 'ImmutableTrees may not be modified'
370 raise ValueError, 'ImmutableTrees may not be modified'
372 raise ValueError, 'ImmutableTrees may not be modified'
374 raise ValueError, 'ImmutableTrees may not be modified'
377
378
379
380
381
383 - def __init__(self, node, children, **prob_kwargs):
386
387
392 return '%s (p=%s)' % (self.pp(margin=60), self.prob())
394 c = Tree.__cmp__(self, other)
395 if c != 0: return c
396 return cmp(self.prob(), other.prob())
398 if not isinstance(other, Tree): return False
399 return Tree.__eq__(self, other) and self.prob()==other.prob()
400 - def copy(self, deep=False):
401 if not deep: return self.__class__(self.node, self, prob=self.prob())
402 else: return self.__class__.convert(self)
412 convert = classmethod(convert)
413
415 - def __init__(self, node, children, **prob_kwargs):
418
419
424 return '%s [%s]' % (self.pp(margin=60), self.prob())
426 c = Tree.__cmp__(self, other)
427 if c != 0: return c
428 return cmp(self.prob(), other.prob())
430 if not isinstance(other, Tree): return False
431 return Tree.__eq__(self, other) and self.prob()==other.prob()
432 - def copy(self, deep=False):
433 if not deep: return self.__class__(self.node, self, prob=self.prob())
434 else: return self.__class__.convert(self)
444 convert = classmethod(convert)
445
446
455
456
457
458
459
507
508
510 """
511 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings,
512 as shown in the following example (X represents a Chinese character):
513 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
514
515 @return: A tree corresponding to the string representation.
516 @rtype: C{tree}
517 @param s: The string to be converted
518 @type s: C{string}
519 """
520 tokens = re.split(r'([()| ])', s)
521 for i in range(len(tokens)):
522 if tokens[i] == '(':
523 tokens[i-1], tokens[i] = tokens[i], tokens[i-1]
524 elif ':' in tokens[i]:
525 fields = tokens[i].split(':')
526 if len(fields) == 2:
527 tokens[i] = fields[1]
528 else:
529 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
530 elif tokens[i] == '|':
531 tokens[i] = ''
532
533 treebank_string = string.join(tokens)
534 return bracket_parse(treebank_string)
535
536
537
538
539
540
541
542
543
544
546 """
547 A demonstration showing how C{Tree}s and C{Tree}s can be
548 used. This demonstration creates a C{Tree}, and loads a
549 C{Tree} from the L{treebank<nltk.corpus.treebank>} corpus,
550 and shows the results of calling several of their methods.
551 """
552
553 from nltk_lite.parse import tree
554
555
556 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
557 t = tree.bracket_parse(s)
558 print "Convert bracketed string into tree:"
559 print t
560
561 print "Display tree properties:"
562 print t.node
563 print t[0]
564 print t[1]
565 print t.height()
566 print t.leaves()
567 print t[1]
568 print t[1,1]
569 print t[1,1,0]
570
571
572 the_cat = t[0]
573 the_cat.insert(1, tree.bracket_parse('(JJ big)'))
574 print "Tree modification:"
575 print t
576 t[1,1,1] = tree.bracket_parse('(NN cake)')
577 print t
578 print
579
580
581
582 pt = tree.ProbabilisticTree('x', ['y', 'z'], prob=0.5)
583 print "Probabilistic Tree:"
584 print pt
585 print
586
587
588 t = tree.bracket_parse(t.pp_treebank())[0]
589 print "Convert tree to bracketed string and back again:"
590 print t.pp_treebank()
591 print t
592 print
593
594
595 print "LaTeX output:"
596 print t.pp_latex_qtree()
597 print
598
599
600 print "Production output:"
601 print t.productions()
602 print
603
604
605 t.node = ('test', 3)
606 print t
607
608 if __name__ == '__main__':
609 demo()
610