1
2
3
4
5
6
7
8
9
10
11
12 from __init__ import *
13 from tree import Tree
14 import cfg
15
16 """
17 Data classes and parser implementations for \"chart parsers\", which
18 use dynamic programming to efficiently parse a text. A X{chart
19 parser} derives parse trees for a text by iteratively adding \"edges\"
20 to a \"chart.\" Each X{edge} represents a hypothesis about the tree
21 structure for a subsequence of the text. The X{chart} is a
22 \"blackboard\" for composing and combining these hypotheses.
23
24 When a chart parser begins parsing a text, it creates a new (empty)
25 chart, spanning the text. It then incrementally adds new edges to the
26 chart. A set of X{chart rules} specifies the conditions under which
27 new edges should be added to the chart. Once the chart reaches a
28 stage where none of the chart rules adds any new edges, parsing is
29 complete.
30
31 Charts are encoded with the L{Chart} class, and edges are encoded with
32 the L{TreeEdge} and L{LeafEdge} classes. The chart parser module
33 defines three chart parsers:
34
35 - C{ChartParse} is a simple and flexible chart parser. Given a
36 set of chart rules, it will apply those rules to the chart until
37 no more edges are added.
38
39 - C{SteppingChartParse} is a subclass of C{ChartParse} that can
40 be used to step through the parsing process.
41
42 - C{EarleyChartParse} is an implementation of the Earley chart parsing
43 algorithm. It makes a single left-to-right pass through the
44 chart, and applies one of three rules (predictor, scanner, and
45 completer) to each edge it encounters.
46 """
47
48 import re
49
50
51
52
53
55 """
56 A hypothesis about the structure of part of a sentence.
57 Each edge records the fact that a structure is (partially)
58 consistent with the sentence. An edge contains:
59
60 - A X{span}, indicating what part of the sentence is
61 consistent with the hypothesized structure.
62
63 - A X{left-hand side}, specifying what kind of structure is
64 hypothesized.
65
66 - A X{right-hand side}, specifying the contents of the
67 hypothesized structure.
68
69 - A X{dot position}, indicating how much of the hypothesized
70 structure is consistent with the sentence.
71
72 Every edge is either X{complete} or X{incomplete}:
73
74 - An edge is X{complete} if its structure is fully consistent
75 with the sentence.
76
77 - An edge is X{incomplete} if its structure is partially
78 consistent with the sentence. For every incomplete edge, the
79 span specifies a possible prefix for the edge's structure.
80
81 There are two kinds of edge:
82
83 - C{TreeEdges<TreeEdge>} record which trees have been found to
84 be (partially) consistent with the text.
85
86 - C{LeafEdges<leafEdge>} record the tokens occur in the text.
87
88 The C{EdgeI} interface provides a common interface to both types
89 of edge, allowing chart parsers to treat them in a uniform manner.
90 """
92 if self.__class__ == EdgeI:
93 raise TypeError('Edge is an abstract interface')
94
95
96
97
98
100 """
101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
102 portion of the sentence that is consistent with this
103 edge's structure.
104 @rtype: C{(int, int)}
105 """
106 raise AssertionError('EdgeI is an abstract interface')
107
109 """
110 @return: The start index of this edge's span.
111 @rtype: C{int}
112 """
113 raise AssertionError('EdgeI is an abstract interface')
114
116 """
117 @return: The end index of this edge's span.
118 @rtype: C{int}
119 """
120 raise AssertionError('EdgeI is an abstract interface')
121
123 """
124 @return: The length of this edge's span.
125 @rtype: C{int}
126 """
127 raise AssertionError('EdgeI is an abstract interface')
128
129
130
131
132
134 """
135 @return: This edge's left-hand side, which specifies what kind
136 of structure is hypothesized by this edge.
137 @see: L{TreeEdge} and L{LeafEdge} for a description of
138 the left-hand side values for each edge type.
139 """
140 raise AssertionError('EdgeI is an abstract interface')
141
142
143
144
145
147 """
148 @return: This edge's right-hand side, which specifies
149 the content of the structure hypothesized by this
150 edge.
151 @see: L{TreeEdge} and L{LeafEdge} for a description of
152 the right-hand side values for each edge type.
153 """
154 raise AssertionError('EdgeI is an abstract interface')
155
157 """
158 @return: This edge's dot position, which indicates how much of
159 the hypothesized structure is consistent with the
160 sentence. In particular, C{self.rhs[:dot]} is consistent
161 with C{subtoks[self.start():self.end()]}.
162 @rtype: C{int}
163 """
164 raise AssertionError('EdgeI is an abstract interface')
165
167 """
168 @return: The element of this edge's right-hand side that
169 immediately follows its dot.
170 @rtype: C{Nonterminal} or X{terminal} or C{None}
171 """
172 raise AssertionError('EdgeI is an abstract interface')
173
175 """
176 @return: True if this edge's structure is fully consistent
177 with the text.
178 @rtype: C{boolean}
179 """
180 raise AssertionError('EdgeI is an abstract interface')
181
183 """
184 @return: True if this edge's structure is partially consistent
185 with the text.
186 @rtype: C{boolean}
187 """
188 raise AssertionError('EdgeI is an abstract interface')
189
190
191
192
195
198
200 """
201 An edge that records the fact that a tree is (partially)
202 consistent with the sentence. A tree edge consists of:
203
204 - A X{span}, indicating what part of the sentence is
205 consistent with the hypothesized tree.
206
207 - A X{left-hand side}, specifying the hypothesized tree's node
208 value.
209
210 - A X{right-hand side}, specifying the hypothesized tree's
211 children. Each element of the right-hand side is either a
212 terminal, specifying a token with that terminal as its leaf
213 value; or a nonterminal, specifying a subtree with that
214 nonterminal's symbol as its node value.
215
216 - A X{dot position}, indicating which children are consistent
217 with part of the sentence. In particular, if C{dot} is the
218 dot position, C{rhs} is the right-hand size, C{(start,end)}
219 is the span, and C{sentence} is the list of subtokens in the
220 sentence, then C{subtokens[start:end]} can be spanned by the
221 children specified by C{rhs[:dot]}.
222
223 For more information about edges, see the L{EdgeI} interface.
224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """
227 Construct a new C{TreeEdge}.
228
229 @type span: C{(int, int)}
230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
231 portion of the sentence that is consistent with the new
232 edge's structure.
233 @type lhs: L{Nonterminal}
234 @param lhs: The new edge's left-hand side, specifying the
235 hypothesized tree's node value.
236 @type rhs: C{list} of (L{Nonterminal} and C{string})
237 @param rhs: The new edge's right-hand side, specifying the
238 hypothesized tree's children.
239 @type dot: C{int}
240 @param dot: The position of the new edge's dot. This position
241 specifies what prefix of the production's right hand side
242 is consistent with the text. In particular, if
243 C{sentence} is the list of subtokens in the sentence, then
244 C{subtokens[span[0]:span[1]]} can be spanned by the
245 children specified by C{rhs[:dot]}.
246 """
247 self._lhs = lhs
248 self._rhs = tuple(rhs)
249 self._span = span
250 self._dot = dot
251
252
254 """
255 @return: A new C{TreeEdge} formed from the given production.
256 The new edge's left-hand side and right-hand side will
257 be taken from C{production}; its span will be C{(index,
258 index)}; and its dot position will be C{0}.
259 @rtype: L{TreeEdge}
260 """
261 return TreeEdge(span=(index, index), lhs=production.lhs(),
262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production)
264
265
266 - def lhs(self): return self._lhs
267 - def span(self): return self._span
268 - def start(self): return self._span[0]
269 - def end(self): return self._span[1]
270 - def length(self): return self._span[1] - self._span[0]
271 - def rhs(self): return self._rhs
272 - def dot(self): return self._dot
273 - def is_complete(self): return self._dot == len(self._rhs)
276 if self._dot >= len(self._rhs): return None
277 else: return self._rhs[self._dot]
278
279
281 if self.__class__ != other.__class__: return -1
282 return cmp((self._span, self.lhs(), self.rhs(), self._dot),
283 (other._span, other.lhs(), other.rhs(), other._dot))
285 return hash((self.lhs(), self.rhs(), self._span, self._dot))
286
287
289 str = '[%s:%s] ' % (self._span[0], self._span[1])
290 str += '%-2s ->' % (self._lhs.symbol(),)
291
292 for i in range(len(self._rhs)):
293 if i == self._dot: str += ' *'
294 if isinstance(self._rhs[i], cfg.Nonterminal):
295 str += ' %s' % (self._rhs[i].symbol(),)
296 else:
297 str += ' %r' % (self._rhs[i],)
298 if len(self._rhs) == self._dot: str += ' *'
299 return str
300
302 return '[Edge: %s]' % self
303
305 """
306 An edge that records the fact that a leaf value is consistent with
307 a word in the sentence. A leaf edge consists of:
308
309 - An X{index}, indicating the position of the word.
310 - A X{leaf}, specifying the word's content.
311
312 A leaf edge's left-hand side is its leaf value, and its right hand
313 side is C{()}. Its span is C{[index, index+1]}, and its dot
314 position is C{0}.
315 """
317 """
318 Construct a new C{LeafEdge}.
319
320 @param leaf: The new edge's leaf value, specifying the word
321 that is recorded by this edge.
322 @param index: The new edge's index, specifying the position of
323 the word that is recorded by this edge.
324 """
325 self._leaf = leaf
326 self._index = index
327
328
329 - def lhs(self): return self._leaf
334 - def rhs(self): return ()
335 - def dot(self): return 0
338 - def next(self): return None
339
340
342 if not isinstance(other, LeafEdge): return -1
343 return cmp((self._index, self._leaf), (other._index, other._leaf))
345 return hash((self._index, self._leaf))
346
347
350 return '[Edge: %s]' % (self)
351
352
353
354
355
357 """
358 A blackboard for hypotheses about the syntactic constituents of a
359 sentence. A chart contains a set of edges, and each edge encodes
360 a single hypothesis about the structure of some portion of the
361 sentence.
362
363 The L{select} method can be used to select a specific collection
364 of edges. For example C{chart.select(is_complete=True, start=0)}
365 yields all complete edges whose start indices are 0. To ensure
366 the efficiency of these selection operations, C{Chart} dynamically
367 creates and maintains an index for each set of attributes that
368 have been selected on.
369
370 In order to reconstruct the trees that are represented by an edge,
371 the chart associates each edge with a set of child pointer lists.
372 A X{child pointer list} is a list of the edges that license an
373 edge's right-hand side.
374
375 @ivar _tokens: The sentence that the chart covers.
376 @ivar _num_leaves: The number of tokens.
377 @ivar _edges: A list of the edges in the chart
378 @ivar _edge_to_cpls: A dictionary mapping each edge to a set
379 of child pointer lists that are associated with that edge.
380 @ivar _indexes: A dictionary mapping tuples of edge attributes
381 to indices, where each index maps the corresponding edge
382 attribute values to lists of edges.
383 """
385 """
386 Construct a new empty chart.
387
388 @type tokens: L{list}
389 @param tokens: The sentence that this chart will be used to parse.
390 """
391
392 self._tokens = list(tokens)
393 self._num_leaves = len(self._tokens)
394
395
396 self._edges = []
397
398
399 self._edge_to_cpls = {}
400
401
402
403 self._indexes = {}
404
405
406
407
408
410 """
411 @return: The number of words in this chart's sentence.
412 @rtype: C{int}
413 """
414 return self._num_leaves
415
416 - def leaf(self, index):
417 """
418 @return: The leaf value of the word at the given index.
419 @rtype: C{string}
420 """
421 return self._tokens[index]
422
424 """
425 @return: A list of the leaf values of each word in the
426 chart's sentence.
427 @rtype: C{list} of C{string}
428 """
429 return self._tokens
430
431
432
433
434
436 """
437 @return: A list of all edges in this chart. New edges
438 that are added to the chart after the call to edges()
439 will I{not} be contained in this list.
440 @rtype: C{list} of L{EdgeI}
441 @see: L{iteredges}, L{select}
442 """
443 return self._edges[:]
444
446 """
447 @return: An iterator over the edges in this chart. Any
448 new edges that are added to the chart before the iterator
449 is exahusted will also be generated.
450 @rtype: C{iter} of L{EdgeI}
451 @see: L{edges}, L{select}
452 """
453 return iter(self._edges)
454
455
456 __iter__ = iteredges
457
459 """
460 @return: The number of edges contained in this chart.
461 @rtype: C{int}
462 """
463 return len(self._edge_to_cpls)
464
465 - def select(self, **restrictions):
466 """
467 @return: An iterator over the edges in this chart. Any
468 new edges that are added to the chart before the iterator
469 is exahusted will also be generated. C{restrictions}
470 can be used to restrict the set of edges that will be
471 generated.
472 @rtype: C{iter} of L{EdgeI}
473
474 @kwarg span: Only generate edges C{e} where C{e.span()==span}
475 @kwarg start: Only generate edges C{e} where C{e.start()==start}
476 @kwarg end: Only generate edges C{e} where C{e.end()==end}
477 @kwarg length: Only generate edges C{e} where C{e.length()==length}
478 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs}
479 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs}
480 @kwarg next: Only generate edges C{e} where C{e.next()==next}
481 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot}
482 @kwarg is_complete: Only generate edges C{e} where
483 C{e.is_complete()==is_complete}
484 @kwarg is_incomplete: Only generate edges C{e} where
485 C{e.is_incomplete()==is_incomplete}
486 """
487
488 if restrictions=={}: return iter(self._edges)
489
490
491 restr_keys = restrictions.keys()
492 restr_keys.sort()
493 restr_keys = tuple(restr_keys)
494
495
496 if not self._indexes.has_key(restr_keys):
497 self._add_index(restr_keys)
498 vals = [restrictions[k] for k in restr_keys]
499 return iter(self._indexes[restr_keys].get(tuple(vals), []))
500
502 """
503 A helper function for L{select}, which creates a new index for
504 a given set of attributes (aka restriction keys).
505 """
506
507 for k in restr_keys:
508 if not hasattr(EdgeI, k):
509 raise ValueError, 'Bad restriction: %s' % k
510
511
512 self._indexes[restr_keys] = {}
513
514
515 for edge in self._edges:
516 vals = [getattr(edge, k)() for k in restr_keys]
517 index = self._indexes[restr_keys]
518 index.setdefault(tuple(vals),[]).append(edge)
519
520
521
522
523
524 - def insert(self, edge, child_pointer_list):
525 """
526 Add a new edge to the chart.
527
528 @type edge: L{Edge}
529 @param edge: The new edge
530 @type child_pointer_list: C{tuple} of L{Edge}
531 @param child_pointer_list: A list of the edges that were used to
532 form this edge. This list is used to reconstruct the trees
533 (or partial trees) that are associated with C{edge}.
534 @rtype: C{bool}
535 @return: True if this operation modified the chart. In
536 particular, return true iff the chart did not already
537 contain C{edge}, or if it did not already associate
538 C{child_pointer_list} with C{edge}.
539 """
540
541 if not self._edge_to_cpls.has_key(edge):
542
543 self._edges.append(edge)
544
545
546 for (restr_keys, index) in self._indexes.items():
547 vals = [getattr(edge, k)() for k in restr_keys]
548 index = self._indexes[restr_keys]
549 index.setdefault(tuple(vals),[]).append(edge)
550
551
552 cpls = self._edge_to_cpls.setdefault(edge,{})
553 child_pointer_list = tuple(child_pointer_list)
554
555 if cpls.has_key(child_pointer_list):
556
557 return False
558 else:
559
560 cpls[child_pointer_list] = True
561 return True
562
563
564
565
566
568 """
569 @return: A list of the complete tree structures that span
570 the entire chart, and whose root node is C{root}.
571 """
572 trees = []
573
574
575
576 for edge in self.select(span=(0,self._num_leaves), lhs=root):
577 trees += self.trees(edge, tree_class=tree_class, complete=True)
578 return trees
579
580 - def trees(self, edge, tree_class=Tree, complete=False):
581 """
582 @return: A list of the tree structures that are associated
583 with C{edge}.
584
585 If C{edge} is incomplete, then the unexpanded children will be
586 encoded as childless subtrees, whose node value is the
587 corresponding terminal or nonterminal.
588
589 @rtype: C{list} of L{Tree}
590 @note: If two trees share a common subtree, then the same
591 C{Tree} may be used to encode that subtree in
592 both trees. If you need to eliminate this subtree
593 sharing, then create a deep copy of each tree.
594 """
595 return self._trees(edge, complete, memo={}, tree_class=tree_class)
596
597 - def _trees(self, edge, complete, memo, tree_class):
598 """
599 A helper function for L{trees}.
600 @param memo: A dictionary used to record the trees that we've
601 generated for each edge, so that when we see an edge more
602 than once, we can reuse the same trees.
603 """
604
605 if memo.has_key(edge): return memo[edge]
606
607 trees = []
608
609
610 if complete and edge.is_incomplete():
611 return trees
612
613
614
615
616
617
618
619 memo[edge] = []
620
621
622 if isinstance(edge, LeafEdge):
623 leaf = self._tokens[edge.start()]
624 memo[edge] = leaf
625 return [leaf]
626
627
628 for cpl in self.child_pointer_lists(edge):
629
630
631
632 child_choices = [self._trees(cp, complete, memo, tree_class)
633 for cp in cpl]
634
635
636 if len(child_choices) > 0 and type(child_choices[0]) == type(""):
637 child_choices = [child_choices]
638
639
640 for children in self._choose_children(child_choices):
641 lhs = edge.lhs().symbol()
642 trees.append(tree_class(lhs, children))
643
644
645 if edge.is_incomplete():
646 unexpanded = [tree_class(elt,[])
647 for elt in edge.rhs()[edge.dot():]]
648 for tree in trees:
649 tree.extend(unexpanded)
650
651
652 memo[edge] = trees
653
654
655 return trees
656
658 """
659 A helper function for L{_trees} that finds the possible sets
660 of subtrees for a new tree.
661
662 @param child_choices: A list that specifies the options for
663 each child. In particular, C{child_choices[i]} is a list of
664 tokens and subtrees that can be used as the C{i}th child.
665 """
666 children_lists = [[]]
667 for child_choice in child_choices:
668 children_lists = [child_list+[child]
669 for child in child_choice
670 for child_list in children_lists]
671 return children_lists
672
674 """
675 @rtype: C{list} of C{list} of C{Edge}
676 @return: The set of child pointer lists for the given edge.
677 Each child pointer list is a list of edges that have
678 been used to form this edge.
679 """
680
681 return self._edge_to_cpls.get(edge, {}).keys()
682
683
684
685
686 - def pp_edge(self, edge, width=None):
715
717 """
718 @return: A pretty-printed string representation of this
719 chart's leaves. This string can be used as a header
720 for calls to L{pp_edge}.
721 """
722 if width is None: width = 50/(self.num_leaves()+1)
723
724 if self._tokens is not None and width>1:
725 header = '|.'
726 for tok in self._tokens:
727 header += tok[:width-1].center(width-1)+'.'
728 header += '|'
729 else:
730 header = ''
731
732 return header
733
734 - def pp(self, width=None):
735 """
736 @return: A pretty-printed string representation of this chart.
737 @rtype: C{string}
738 @param width: The number of characters allotted to each
739 index in the sentence.
740 """
741 if width is None: width = 50/(self.num_leaves()+1)
742
743
744 edges = [(e.length(), e.start(), e) for e in self]
745 edges.sort()
746 edges = [e for (_,_,e) in edges]
747
748 return (self.pp_leaves(width) + '\n' +
749 '\n'.join(self.pp_edge(edge, width) for edge in edges))
750
751
752
753
754
756
757 s = 'digraph nltk_chart {\n'
758
759 s += ' rankdir=LR;\n'
760 s += ' node [height=0.1,width=0.1];\n'
761 s += ' node [style=filled, color="lightgray"];\n'
762
763
764 for y in range(self.num_edges(), -1, -1):
765 if y == 0:
766 s += ' node [style=filled, color="black"];\n'
767 for x in range(self.num_leaves()+1):
768 if y == 0 or (x <= self._edges[y-1].start() or
769 x >= self._edges[y-1].end()):
770 s += ' %04d.%04d [label=""];\n' % (x,y)
771
772
773 s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
774
775
776 for x in range(self.num_leaves()+1):
777 s += ' {rank=same;'
778 for y in range(self.num_edges()+1):
779 if y == 0 or (x <= self._edges[y-1].start() or
780 x >= self._edges[y-1].end()):
781 s += ' %04d.%04d' % (x,y)
782 s += '}\n'
783
784
785 s += ' edge [style=invis, weight=100];\n'
786 s += ' node [shape=plaintext]\n'
787 s += ' 0000.0000'
788 for x in range(self.num_leaves()):
789 s += '->%s->%04d.0000' % (self.leaf(x), x+1)
790 s += ';\n\n'
791
792
793 s += ' edge [style=solid, weight=1];\n'
794 for y, edge in enumerate(self):
795 for x in range(edge.start()):
796 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
797 (x, y+1, x+1, y+1))
798 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' %
799 (edge.start(), y+1, edge.end(), y+1, edge))
800 for x in range(edge.end(), self.num_leaves()):
801 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' %
802 (x, y+1, x+1, y+1))
803 s += '}\n'
804 return s
805
806
807
808
809
811 """
812 A rule that specifies what new edges are licensed by any given set
813 of existing edges. Each chart rule expects a fixed number of
814 edges, as indicated by the class variable L{NUM_EDGES}. In
815 particular:
816
817 - A chart rule with C{NUM_EDGES=0} specifies what new edges are
818 licensed, regardless of existing edges.
819
820 - A chart rule with C{NUM_EDGES=1} specifies what new edges are
821 licensed by a single existing edge.
822
823 - A chart rule with C{NUM_EDGES=2} specifies what new edges are
824 licensed by a pair of existing edges.
825
826 @type NUM_EDGES: C{int}
827 @cvar NUM_EDGES: The number of existing edges that this rule uses
828 to license new edges. Typically, this number ranges from zero
829 to two.
830 """
831 - def apply(self, chart, grammar, *edges):
832 """
833 Add the edges licensed by this rule and the given edges to the
834 chart.
835
836 @type edges: C{list} of L{EdgeI}
837 @param edges: A set of existing edges. The number of edges
838 that should be passed to C{apply} is specified by the
839 L{NUM_EDGES} class variable.
840 @rtype: C{list} of L{EdgeI}
841 @return: A list of the edges that were added.
842 """
843 raise AssertionError, 'ChartRuleI is an abstract interface'
844
846 """
847 @return: A generator that will add edges licensed by this rule
848 and the given edges to the chart, one at a time. Each
849 time the generator is resumed, it will either add a new
850 edge and yield that edge; or return.
851 @rtype: C{iter} of L{EdgeI}
852
853 @type edges: C{list} of L{EdgeI}
854 @param edges: A set of existing edges. The number of edges
855 that should be passed to C{apply} is specified by the
856 L{NUM_EDGES} class variable.
857 """
858 raise AssertionError, 'ChartRuleI is an abstract interface'
859
861 """
862 Add all the edges licensed by this rule and the edges in the
863 chart to the chart.
864
865 @rtype: C{list} of L{EdgeI}
866 @return: A list of the edges that were added.
867 """
868 raise AssertionError, 'ChartRuleI is an abstract interface'
869
871 """
872 @return: A generator that will add all edges licensed by
873 this rule, given the edges that are currently in the
874 chart, one at a time. Each time the generator is resumed,
875 it will either add a new edge and yield that edge; or
876 return.
877 @rtype: C{iter} of L{EdgeI}
878 """
879 raise AssertionError, 'ChartRuleI is an abstract interface'
880
882 """
883 An abstract base class for chart rules. C{AbstractChartRule}
884 provides:
885 - A default implementation for C{apply}, based on C{apply_iter}.
886 - A default implementation for C{apply_everywhere_iter},
887 based on C{apply_iter}.
888 - A default implementation for C{apply_everywhere}, based on
889 C{apply_everywhere_iter}. Currently, this implementation
890 assumes that C{NUM_EDGES}<=3.
891 - A default implementation for C{__str__}, which returns a
892 name basd on the rule's class name.
893 """
894
895
898
899
900
902 if self.NUM_EDGES == 0:
903 for new_edge in self.apply_iter(chart, grammar):
904 yield new_edge
905
906 elif self.NUM_EDGES == 1:
907 for e1 in chart:
908 for new_edge in self.apply_iter(chart, grammar, e1):
909 yield new_edge
910
911 elif self.NUM_EDGES == 2:
912 for e1 in chart:
913 for e2 in chart:
914 for new_edge in self.apply_iter(chart, grammar, e1, e2):
915 yield new_edge
916
917 elif self.NUM_EDGES == 3:
918 for e1 in chart:
919 for e2 in chart:
920 for e3 in chart:
921 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3):
922 yield new_edge
923
924 else:
925 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
926
927
928 - def apply(self, chart, grammar, *edges):
930
931
934
935
937
938 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
939
940
941
942
944 """
945 A rule that joins two adjacent edges to form a single combined
946 edge. In particular, this rule specifies that any pair of edges:
947
948 - [AS{->}S{alpha}*BS{beta}][i:j]
949 - [BS{->}S{gamma}*][j:k]
950 licenses the edge:
951 - [AS{->}S{alpha}B*S{beta}][i:j]
952 """
953 NUM_EDGES = 2
954 - def apply_iter(self, chart, grammar, left_edge, right_edge):
974
976 """
977 A rule that joins a given edge with adjacent edges in the chart,
978 to form combined edges. In particular, this rule specifies that
979 either of the edges:
980 - [AS{->}S{alpha}*BS{beta}][i:j]
981 - [BS{->}S{gamma}*][j:k]
982 licenses the edge:
983 - [AS{->}S{alpha}B*S{beta}][i:j]
984 if the other edge is already in the chart.
985 @note: This is basically L{FundamentalRule}, with one edge is left
986 unspecified.
987 """
988 NUM_EDGES = 1
989
990 _fundamental_rule = FundamentalRule()
991
993 fr = self._fundamental_rule
994 if edge1.is_incomplete():
995
996 for edge2 in chart.select(start=edge1.end(), is_complete=True,
997 lhs=edge1.next()):
998 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
999 yield new_edge
1000 else:
1001
1002 for edge2 in chart.select(end=edge1.start(), is_complete=False,
1003 next=edge1.lhs()):
1004 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
1005 yield new_edge
1006
1007 - def __str__(self): return 'Fundamental Rule'
1008
1010 """
1011 A rule licensing any edges corresponding to terminals in the
1012 text. In particular, this rule licenses the leaf edge:
1013 - [wS{->}*][i:i+1]
1014 for C{w} is a word in the text, where C{i} is C{w}'s index.
1015 """
1016 NUM_EDGES = 0
1022
1023
1024
1025
1026
1027
1029 """
1030 A rule licensing edges corresponding to the grammar productions for
1031 the grammar's start symbol. In particular, this rule specifies that:
1032 - [SS{->}*S{alpha}][0:i]
1033 is licensed for each grammar production C{SS{->}S{alpha}}, where
1034 C{S} is the grammar's start symbol.
1035 """
1036 NUM_EDGES = 0
1042
1044 """
1045 A rule licensing edges corresponding to the grammar productions
1046 for the nonterminal following an incomplete edge's dot. In
1047 particular, this rule specifies that:
1048 - [AS{->}S{alpha}*BS{beta}][i:j]
1049 licenses the edge:
1050 - [BS{->}*S{gamma}][j:j]
1051 for each grammar production C{BS{->}S{gamma}}.
1052 """
1053 NUM_EDGES = 1
1060
1062 """
1063 A rule licensing an edge corresponding to a terminal following an
1064 incomplete edge's dot. In particular, this rule specifies that:
1065 - [AS{->}S{alpha}*w{beta}][i:j]
1066 licenses the leaf edge:
1067 - [wS{->}*][j:j+1]
1068 if the C{j}th word in the text is C{w}.
1069 """
1070 NUM_EDGES = 1
1079
1080
1082 """
1083 A cached version of L{TopDownInitRule}. After the first time this
1084 rule is applied, it will not generate any more edges.
1085
1086 If C{chart} or C{grammar} are changed, then the cache is flushed.
1087 """
1091
1103
1104 - def __str__(self): return 'Top Down Init Rule'
1105
1107 """
1108 A cached version of L{TopDownExpandRule}. After the first time
1109 this rule is applied to an edge with a given C{end} and C{next},
1110 it will not generate any more edges for edges with that C{end} and
1111 C{next}.
1112
1113 If C{chart} or C{grammar} are changed, then the cache is flushed.
1114 """
1118
1132
1133 - def __str__(self): return 'Top Down Expand Rule'
1134
1135
1136
1137
1138
1140 """
1141 A rule licensing any edges corresponding to terminals in the
1142 text. In particular, this rule licenses the leaf edge:
1143 - [wS{->}*][i:i+1]
1144 for C{w} is a word in the text, where C{i} is C{w}'s index.
1145 """
1146 NUM_EDGES = 0
1152
1154 """
1155 A rule licensing any edge corresponding to a production whose
1156 right-hand side begins with a complete edge's left-hand side. In
1157 particular, this rule specifies that:
1158 - [AS{->}S{alpha}*]
1159 licenses the edge:
1160 - [BS{->}*AS{beta}]
1161 for each grammar production C{BS{->}AS{beta}}
1162 """
1163 NUM_EDGES = 1
1170
1171
1172
1173
1174
1176 """
1177 A rule that joins a given complete edge with adjacent incomplete
1178 edges in the chart, to form combined edges. In particular, this
1179 rule specifies that:
1180 - [BS{->}S{gamma}*][j:k]
1181 licenses the edge:
1182 - [AS{->}S{alpha}B*S{beta}][i:j]
1183 given that the chart contains:
1184 - [AS{->}S{alpha}*BS{beta}][i:j]
1185 @note: This is basically L{FundamentalRule}, with the left edge
1186 left unspecified.
1187 """
1188 NUM_EDGES = 1
1189
1190 _fundamental_rule = FundamentalRule()
1191
1192 - def apply_iter(self, chart, grammar, right_edge):
1200
1201 - def __str__(self): return 'Completer Rule'
1202
1204 """
1205 A rule licensing a leaf edge corresponding to a part-of-speech
1206 terminal following an incomplete edge's dot. In particular, this
1207 rule specifies that:
1208 - [AS{->}S{alpha}*PS{beta}][i:j]
1209 licenses the edges:
1210 - [PS{->}w*][j:j+1]
1211 - [wS{->}*][j:j+1]
1212 if the C{j}th word in the text is C{w}; and C{P} is a valid part
1213 of speech for C{w}.
1214 """
1215 NUM_EDGES = 1
1216 - def __init__(self, word_to_pos_lexicon):
1217 self._word_to_pos = word_to_pos_lexicon
1218
1231
1232
1234
1235
1236
1237
1238
1240 """
1241 A chart parser implementing the Earley parsing algorithm:
1242
1243 - For each index I{end} in [0, 1, ..., N]:
1244 - For each I{edge} s.t. I{edge}.end = I{end}:
1245 - If I{edge} is incomplete, and I{edge}.next is not a part
1246 of speech:
1247 - Apply PredictorRule to I{edge}
1248 - If I{edge} is incomplete, and I{edge}.next is a part of
1249 speech:
1250 - Apply ScannerRule to I{edge}
1251 - If I{edge} is complete:
1252 - Apply CompleterRule to I{edge}
1253 - Return any complete parses in the chart
1254
1255 C{EarleyChartParse} uses a X{lexicon} to decide whether a leaf
1256 has a given part of speech. This lexicon is encoded as a
1257 dictionary that maps each word to a list of parts of speech that
1258 word can have.
1259 """
1260 - def __init__(self, grammar, lexicon, trace=0):
1261 """
1262 Create a new Earley chart parser, that uses C{grammar} to
1263 parse texts.
1264
1265 @type grammar: C{cfg.Grammar}
1266 @param grammar: The grammar used to parse texts.
1267 @type lexicon: C{dict} from C{string} to (C{list} of C{string})
1268 @param lexicon: A lexicon of words that records the parts of
1269 speech that each word can have. Each key is a word, and
1270 the corresponding value is a list of parts of speech.
1271 @type trace: C{int}
1272 @param trace: The level of tracing that should be used when
1273 parsing a text. C{0} will generate no tracing output;
1274 and higher numbers will produce more verbose tracing
1275 output.
1276 """
1277 self._grammar = grammar
1278 self._lexicon = lexicon
1279 self._trace = trace
1280 AbstractParse.__init__(self)
1281
1283 chart = Chart(list(tokens))
1284 grammar = self._grammar
1285
1286
1287 w = 50/(chart.num_leaves()+1)
1288 if self._trace > 0: print ' ', chart.pp_leaves(w)
1289
1290
1291 root = cfg.Nonterminal('[INIT]')
1292 edge = TreeEdge((0,0), root, (grammar.start(),))
1293 chart.insert(edge, ())
1294
1295
1296 predictor = PredictorRule()
1297 completer = CompleterRule()
1298 scanner = ScannerRule(self._lexicon)
1299
1300 for end in range(chart.num_leaves()+1):
1301 if self._trace > 1: print 'Processing queue %d' % end
1302 for edge in chart.select(end=end):
1303 if edge.is_incomplete():
1304 for e in predictor.apply(chart, grammar, edge):
1305 if self._trace > 0:
1306 print 'Predictor', chart.pp_edge(e,w)
1307 if edge.is_incomplete():
1308 for e in scanner.apply(chart, grammar, edge):
1309 if self._trace > 0:
1310 print 'Scanner ', chart.pp_edge(e,w)
1311 if edge.is_complete():
1312 for e in completer.apply(chart, grammar, edge):
1313 if self._trace > 0:
1314 print 'Completer', chart.pp_edge(e,w)
1315
1316
1317 return chart.parses(grammar.start(), tree_class=tree_class)
1318
1319
1320
1321
1322
1323 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(),
1324 TopDownMatchRule(), SingleEdgeFundamentalRule()]
1325 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(),
1326 SingleEdgeFundamentalRule()]
1327
1329 """
1330 A generic chart parser. A X{strategy}, or list of
1331 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to
1332 the chart. In particular, C{ChartParse} uses the following
1333 algorithm to parse texts:
1334
1335 - Until no new edges are added:
1336 - For each I{rule} in I{strategy}:
1337 - Apply I{rule} to any applicable edges in the chart.
1338 - Return any complete parses in the chart
1339 """
1340 - def __init__(self, grammar, strategy, trace=0):
1341 """
1342 Create a new chart parser, that uses C{grammar} to parse
1343 texts.
1344
1345 @type grammar: L{cfg.Grammar}
1346 @param grammar: The grammar used to parse texts.
1347 @type strategy: C{list} of L{ChartRuleI}
1348 @param strategy: A list of rules that should be used to decide
1349 what edges to add to the chart.
1350 @type trace: C{int}
1351 @param trace: The level of tracing that should be used when
1352 parsing a text. C{0} will generate no tracing output;
1353 and higher numbers will produce more verbose tracing
1354 output.
1355 """
1356 self._grammar = grammar
1357 self._strategy = strategy
1358 self._trace = trace
1359 AbstractParse.__init__(self)
1360
1362 chart = Chart(list(tokens))
1363 grammar = self._grammar
1364
1365
1366 w = 50/(chart.num_leaves()+1)
1367 if self._trace > 0: print chart.pp_leaves(w)
1368
1369 edges_added = 1
1370 while edges_added > 0:
1371 edges_added = 0
1372 for rule in self._strategy:
1373 edges_added_by_rule = 0
1374 for e in rule.apply_everywhere(chart, grammar):
1375 if self._trace > 0 and edges_added_by_rule == 0:
1376 print '%s:' % rule
1377 edges_added_by_rule += 1
1378 if self._trace > 1: print chart.pp_edge(e,w)
1379 if self._trace == 1 and edges_added_by_rule > 0:
1380 print ' - Added %d edges' % edges_added_by_rule
1381 edges_added += edges_added_by_rule
1382
1383
1384 return chart.parses(grammar.start(), tree_class=tree_class)
1385
1386
1387
1388
1389
1391 """
1392 A C{ChartParse} that allows you to step through the parsing
1393 process, adding a single edge at a time. It also allows you to
1394 change the parser's strategy or grammar midway through parsing a
1395 text.
1396
1397 The C{initialize} method is used to start parsing a text. C{step}
1398 adds a single edge to the chart. C{set_strategy} changes the
1399 strategy used by the chart parser. C{parses} returns the set of
1400 parses that has been found by the chart parser.
1401
1402 @ivar _restart: Records whether the parser's strategy, grammar,
1403 or chart has been changed. If so, then L{step} must restart
1404 the parsing algorithm.
1405 """
1406 - def __init__(self, grammar, strategy=None, trace=0):
1411
1412
1413
1414
1415
1417 "Begin parsing the given tokens."
1418 self._chart = Chart(list(tokens))
1419 self._restart = True
1420
1421
1422
1423
1424
1426 """
1427 @return: A generator that adds edges to the chart, one at a
1428 time. Each time the generator is resumed, it adds a single
1429 edge and yields that edge. If no more edges can be added,
1430 then it yields C{None}.
1431
1432 If the parser's strategy, grammar, or chart is changed, then
1433 the generator will continue adding edges using the new
1434 strategy, grammar, or chart.
1435
1436 Note that this generator never terminates, since the grammar
1437 or strategy might be changed to values that would add new
1438 edges. Instead, it yields C{None} when no more edges can be
1439 added with the current strategy and grammar.
1440 """
1441 if self._chart is None:
1442 raise ValueError, 'Parser must be initialized first'
1443 while 1:
1444 self._restart = False
1445 w = 50/(self._chart.num_leaves()+1)
1446
1447 for e in self._parse():
1448 if self._trace > 1: print self._current_chartrule
1449 if self._trace > 0: print self._chart.pp_edge(e,w)
1450 yield e
1451 if self._restart: break
1452 else:
1453 yield None
1454
1456 """
1457 A generator that implements the actual parsing algorithm.
1458 L{step} iterates through this generator, and restarts it
1459 whenever the parser's strategy, grammar, or chart is modified.
1460 """
1461 chart = self._chart
1462 grammar = self._grammar
1463 edges_added = 1
1464 while edges_added > 0:
1465 edges_added = 0
1466 for rule in self._strategy:
1467 self._current_chartrule = rule
1468 for e in rule.apply_everywhere_iter(chart, grammar):
1469 edges_added += 1
1470 yield e
1471
1472
1473
1474
1475
1477 "@return: The strategy used by this parser."
1478 return self._strategy
1479
1481 "@return: The grammar used by this parser."
1482 return self._grammar
1483
1485 "@return: The chart that is used by this parser."
1486 return self._chart
1487
1489 "@return: The chart rule used to generate the most recent edge."
1490 return self._current_chartrule
1491
1493 "@return: The parse trees currently contained in the chart."
1494 return self._chart.parses(self._grammar.start(), tree_class)
1495
1496
1497
1498
1499
1501 """
1502 Change the startegy that the parser uses to decide which edges
1503 to add to the chart.
1504 @type strategy: C{list} of L{ChartRuleI}
1505 @param strategy: A list of rules that should be used to decide
1506 what edges to add to the chart.
1507 """
1508 if strategy == self._strategy: return
1509 self._strategy = strategy[:]
1510 self._restart = True
1511
1513 "Change the grammar used by the parser."
1514 if grammar is self._grammar: return
1515 self._grammar = grammar
1516 self._restart = True
1517
1519 "Load a given chart into the chart parser."
1520 if chart is self._chart: return
1521 self._chart = chart
1522 self._restart = True
1523
1524
1525
1526
1527
1529
1530 self.initialize(token)
1531
1532
1533 for e in self.step():
1534 if e is None: break
1535
1536
1537 return self.parses(tree_class=tree_class)
1538
1539
1540
1541
1542
1544 """
1545 A demonstration of the chart parsers.
1546 """
1547 import sys, time
1548
1549
1550 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
1551 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')
1552
1553
1554 grammatical_productions = [
1555 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]),
1556 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]),
1557 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]),
1558 cfg.Production(VP, [V]),]
1559
1560
1561 lexical_productions = [
1562 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']),
1563 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']),
1564 cfg.Production(Det, ['a']),
1565 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']),
1566 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']),
1567 cfg.Production(P, ['with']), cfg.Production(P, ['under']),
1568 ]
1569
1570
1571 earley_lexicon = {}
1572 for prod in lexical_productions:
1573 earley_lexicon.setdefault(prod.rhs()[0], []).append(prod.lhs())
1574
1575
1576 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions)
1577
1578
1579 earley_grammar = cfg.Grammar(S, grammatical_productions)
1580
1581
1582 sent = 'I saw John with a dog with my cookie'
1583 print "Sentence:\n", sent
1584 from nltk_lite import tokenize
1585 tokens = list(tokenize.whitespace(sent))
1586
1587 print tokens
1588
1589
1590 print ' 1: Top-down chart parser'
1591 print ' 2: Bottom-up chart parser'
1592 print ' 3: Earley parser'
1593 print ' 4: Stepping chart parser (alternating top-down & bottom-up)'
1594 print ' 5: All parsers'
1595 print '\nWhich parser (1-5)? ',
1596 choice = sys.stdin.readline().strip()
1597 print
1598 if choice not in '12345':
1599 print 'Bad parser number'
1600 return
1601
1602
1603 times = {}
1604
1605
1606 if choice in ('1', '5'):
1607 cp = ChartParse(grammar, TD_STRATEGY, trace=2)
1608 t = time.time()
1609 parses = cp.get_parse_list(tokens)
1610 times['top down'] = time.time()-t
1611 assert len(parses)==5, 'Not all parses found'
1612 for tree in parses: print tree
1613
1614
1615 if choice in ('2', '5'):
1616 cp = ChartParse(grammar, BU_STRATEGY, trace=2)
1617 t = time.time()
1618 parses = cp.get_parse_list(tokens)
1619 times['bottom up'] = time.time()-t
1620 assert len(parses)==5, 'Not all parses found'
1621 for tree in parses: print tree
1622
1623
1624 if choice in ('3', '5'):
1625 cp = EarleyChartParse(earley_grammar, earley_lexicon, trace=1)
1626 t = time.time()
1627 parses = cp.get_parse_list(tokens)
1628 times['Earley parser'] = time.time()-t
1629 assert len(parses)==5, 'Not all parses found'
1630 for tree in parses: print tree
1631
1632
1633 if choice in ('4', '5'):
1634 t = time.time()
1635 cp = SteppingChartParse(grammar, trace=1)
1636 cp.initialize(tokens)
1637 for i in range(5):
1638 print '*** SWITCH TO TOP DOWN'
1639 cp.set_strategy(TD_STRATEGY)
1640 for j, e in enumerate(cp.step()):
1641 if j>20 or e is None: break
1642 print '*** SWITCH TO BOTTOM UP'
1643 cp.set_strategy(BU_STRATEGY)
1644 for j, e in enumerate(cp.step()):
1645 if j>20 or e is None: break
1646 times['stepping'] = time.time()-t
1647 assert len(cp.parses())==5, 'Not all parses found'
1648 for parse in cp.parses(): print parse
1649
1650
1651 maxlen = max(len(key) for key in times.keys())
1652 format = '%' + `maxlen` + 's parser: %6.3fsec'
1653 times_items = times.items()
1654 times_items.sort(lambda a,b:cmp(a[1], b[1]))
1655 for (parser, t) in times_items:
1656 print format % (parser, t)
1657
1658 if __name__ == '__main__': demo()
1659