1
2
3
4
5
6
7
8
9
10 """
11 Extension of chart parsing implementation to handle grammars with
12 feature structures as nodes.
13 """
14
15
16 from nltk_lite.parse import *
17
19 """A helper function to determine the value of an object when variables
20 are set. It uses apply_bindings if the object is a Category, and simply
21 returns the object otherwise."""
22 if isinstance(obj, Category): return obj.apply_bindings(vars)
23 else: return obj
24
26 """
27 A modification of L{TreeEdge} to handle nonterminals with features
28 (known as L{Categories<Category>}.
29
30 In addition to the span, left-hand side, right-hand side, and dot position
31 (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
32 set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
33 values.
34
35 These values are applied when examining the C{lhs} or C{rhs} of a
36 C{FeatureTreeEdge}.
37
38 For more information about edges, see the L{EdgeI} interface.
39 """
40 - def __init__(self, span, lhs, rhs, dot=0, vars=None):
41 """
42 Construct a new C{FeatureTreeEdge}.
43
44 @type span: C{(int, int)}
45 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
46 portion of the sentence that is consistent with the new
47 edge's structure.
48 @type lhs: L{Category}
49 @param lhs: The new edge's left-hand side, specifying the
50 hypothesized tree's node value.
51 @type rhs: C{list} of (L{Category} and C{string})
52 @param rhs: The new edge's right-hand side, specifying the
53 hypothesized tree's children.
54 @type dot: C{int}
55 @param dot: The position of the new edge's dot. This position
56 specifies what prefix of the production's right hand side
57 is consistent with the text. In particular, if
58 C{sentence} is the list of subtokens in the sentence, then
59 C{subtokens[span[0]:span[1]]} can be spanned by the
60 children specified by C{rhs[:dot]}.
61 @type vars: L{FeatureBindings}
62 @param vars: The bindings specifying what values certain variables in
63 this edge must have.
64 """
65 TreeEdge.__init__(self, span, lhs, rhs, dot)
66 if vars is None: vars = FeatureBindings()
67 self._vars = vars
68
69
71 """
72 @return: A new C{FeatureTreeEdge} formed from the given production.
73 The new edge's left-hand side and right-hand side will
74 be taken from C{production}; its span will be C{(index,
75 index)}; its dot position will be C{0}, and it may have specified
76 variables set.
77 @rtype: L{FeatureTreeEdge}
78 """
79 return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
80 rhs=production.rhs(), dot=0, vars=bindings)
81 from_production = staticmethod(from_production)
82
83
85 """
86 @return: the L{VariableBindings} mapping L{FeatureVariable}s to values.
87 @rtype: L{VariableBindings}
88 """
89 return self._vars.copy()
91 """
92 @return: the value of the left-hand side with variables set.
93 @rtype: C{Category}
94 """
95 return TreeEdge.lhs(self).apply_bindings(self._vars)
97 """
98 @return: the value of the left-hand side with no variables set.
99 @rtype: C{Category}
100 """
101 return TreeEdge.lhs(self)
103 """
104 @return: the value of the right-hand side with variables set.
105 @rtype: C{Category}
106 """
107 return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))
109 """
110 @return: the value of the right-hand side with no variables set.
111 @rtype: C{Category}
112 """
113 return TreeEdge.rhs(self)
114
115
117 str = '%r ->' % self.lhs()
118
119 for i in range(len(self._rhs)):
120 if i == self._dot: str += ' *'
121 str += ' %r' % (self.rhs()[i],)
122 if len(self._rhs) == self._dot: str += ' *'
123 return str
124
126 - def apply_iter(self, chart, grammar, left_edge, right_edge):
127
128 if not (left_edge.end() == right_edge.start() and
129 left_edge.is_incomplete() and right_edge.is_complete() and
130 isinstance(left_edge, FeatureTreeEdge) and
131 isinstance(right_edge, FeatureTreeEdge)
132 ):
133 return
134 bindings = left_edge.vars()
135 unify = left_edge.next().unify(right_edge.lhs().remove_unbound_vars(), bindings)
136 if unify is None: return
137
138
139 new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
140 lhs=left_edge.lhs(), rhs=left_edge.rhs(),
141 dot=left_edge.dot()+1, vars=bindings)
142
143
144 changed_chart = False
145 for cpl1 in chart.child_pointer_lists(left_edge):
146 if chart.insert(new_edge, cpl1+(right_edge,)):
147 changed_chart = True
148
149
150 if changed_chart: yield new_edge
151
167
169 """
170 The @C{TopDownExpandRule} specialised for feature-based grammars.
171 """
182
184 """
185 A chart parser implementing the Earley parsing algorithm, allowing
186 nonterminals that have features (known as L{Categories<Category>}).
187
188 - For each index I{end} in [0, 1, ..., N]:
189 - For each I{edge} s.t. I{edge}.end = I{end}:
190 - If I{edge} is incomplete, and I{edge}.next is not a part
191 of speech:
192 - Apply PredictorRule to I{edge}
193 - If I{edge} is incomplete, and I{edge}.next is a part of
194 speech:
195 - Apply ScannerRule to I{edge}
196 - If I{edge} is complete:
197 - Apply CompleterRule to I{edge}
198 - Return any complete parses in the chart
199
200 C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
201 has a given part of speech. This lexicon is encoded as a
202 dictionary that maps each word to a list of parts of speech that
203 word can have. Unlike in the L{EarleyChartParse}, this lexicon is
204 case-insensitive.
205 """
206 - def __init__(self, grammar, lexicon, trace=0):
211
213 chart = Chart(tokens)
214 grammar = self._grammar
215
216
217
218 w = 2
219 if self._trace > 0: print ' '*9, chart.pp_leaves(w)
220
221
222 root = GrammarCategory(pos='[INIT]')
223 edge = FeatureTreeEdge((0,0), root, (grammar.start(),), 0,
224 FeatureBindings())
225 chart.insert(edge, ())
226
227
228 predictor = FeatureTopDownExpandRule()
229 completer = SingleEdgeFeatureFundamentalRule()
230
231
232 for end in range(chart.num_leaves()+1):
233 if self._trace > 1: print 'Processing queue %d' % end
234
235
236
237 if end > 0 and end-1 < chart.num_leaves():
238 leaf = chart.leaf(end-1)
239 for pos in self._lexicon.get(leaf.upper(), []):
240 new_leaf_edge = LeafEdge(leaf, end-1)
241 chart.insert(new_leaf_edge, ())
242 new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf], 1,
243 FeatureBindings())
244 chart.insert(new_pos_edge, (new_leaf_edge,))
245 if self._trace > 0:
246 print 'Scanner ', chart.pp_edge(new_leaf_edge,w)
247
248 for edge in chart.select(end=end):
249 if edge.is_incomplete():
250 for e in predictor.apply(chart, grammar, edge):
251 if self._trace > 0:
252 print 'Predictor', chart.pp_edge(e,w)
253
254
255
256
257 if edge.is_complete():
258 for e in completer.apply(chart, grammar, edge):
259 if self._trace > 0:
260 print 'Completer', chart.pp_edge(e,w)
261
262
263 return chart.parses(root)
264
266 import sys, time
267
268 S = GrammarCategory.parse('S')
269 VP = GrammarCategory.parse('VP')
270 NP = GrammarCategory.parse('NP')
271 PP = GrammarCategory.parse('PP')
272 V = GrammarCategory.parse('V')
273 N = GrammarCategory.parse('N')
274 P = GrammarCategory.parse('P')
275 Name = GrammarCategory.parse('Name')
276 Det = GrammarCategory.parse('Det')
277 DetSg = GrammarCategory.parse('Det[-pl]')
278 DetPl = GrammarCategory.parse('Det[+pl]')
279 NSg = GrammarCategory.parse('N[-pl]')
280 NPl = GrammarCategory.parse('N[+pl]')
281
282
283 grammatical_productions = [
284 cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)),
285 cfg.Production(NP, (NP, PP)),
286 cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
287 cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)),
288 cfg.Production(NP, (DetSg, NSg))]
289
290
291 lexical_productions = [
292 cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
293 cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
294 cfg.Production(Det, ('a',)),
295 cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
296 cfg.Production(V, ('ate',)), cfg.Production(V, ('saw',)),
297 cfg.Production(P, ('with',)), cfg.Production(P, ('under',)),
298 ]
299
300 earley_grammar = cfg.Grammar(S, grammatical_productions)
301 earley_lexicon = {}
302 for prod in lexical_productions:
303 earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())
304
305 sent = 'I saw John with a dog with my cookie'
306 print "Sentence:\n", sent
307 from nltk_lite import tokenize
308 tokens = list(tokenize.whitespace(sent))
309 t = time.time()
310 cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
311 trees = cp.get_parse_list(tokens)
312 print "Time: %s" % (time.time() - t)
313 for tree in trees: print tree
314
316 import profile
317 profile.run('for i in range(10): demo()', '/tmp/profile.out')
318 import pstats
319 p = pstats.Stats('/tmp/profile.out')
320 p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
321 p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
322
323 if __name__ == '__main__':
324 demo()
325