1
2
3
4
5
6
7
8
9
10 import re
11 from nltk_lite.parse import *
12 from nltk_lite.probability import ImmutableProbabilisticMixIn
13
15 """
16 A probabilistic context free grammar production.
17 PCFG C{WeightedProduction}s are essentially just C{Production}s that
18 have probabilities associated with them. These probabilities are
19 used to record how likely it is that a given production will
20 be used. In particular, the probability of a C{WeightedProduction}
21 records the likelihood that its right-hand side is the correct
22 instantiation for any given occurance of its left-hand side.
23
24 @see: L{Production}
25 """
27 """
28 Construct a new C{WeightedProduction}.
29
30 @param lhs: The left-hand side of the new C{WeightedProduction}.
31 @type lhs: L{Nonterminal}
32 @param rhs: The right-hand side of the new C{WeightedProduction}.
33 @type rhs: sequence of (C{Nonterminal} and (terminal))
34 @param **prob: Probability parameters of the new C{WeightedProduction}.
35 """
36 ImmutableProbabilisticMixIn.__init__(self, **prob)
37 Production.__init__(self, lhs, rhs)
38
41
43 return (isinstance(other, self.__class__) and
44 self._lhs == other._lhs and
45 self._rhs == other._rhs and
46 self.prob() == other.prob())
47
49 return hash((self._lhs, self._rhs, self.prob()))
50
52 """
53 A probabilistic context-free grammar. A Weighted Grammar consists of a start
54 state and a set of weighted productions. The set of terminals and
55 nonterminals is implicitly specified by the productions.
56
57 PCFG productions should be C{WeightedProduction}s. C{WeightedGrammar}s impose
58 the constraint that the set of productions with any given
59 left-hand-side must have probabilities that sum to 1.
60
61 If you need efficient key-based access to productions, you can use
62 a subclass to implement it.
63
64 @type EPSILON: C{float}
65 @cvar EPSILON: The acceptable margin of error for checking that
66 productions with a given left-hand side have probabilities
67 that sum to 1.
68 """
69 EPSILON = 0.01
70
72 """
73 Create a new context-free grammar, from the given start state
74 and set of C{WeightedProduction}s.
75
76 @param start: The start symbol
77 @type start: L{Nonterminal}
78 @param productions: The list of productions that defines the grammar
79 @type productions: C{list} of C{Production}
80 @raise ValueError: if the set of productions with any left-hand-side
81 do not have probabilities that sum to a value within
82 EPSILON of 1.
83 """
84 Grammar.__init__(self, start, productions)
85
86
87 probs = {}
88 for production in productions:
89 probs[production.lhs()] = (probs.get(production.lhs(), 0) +
90 production.prob())
91 for (lhs, p) in probs.items():
92 if not ((1-WeightedGrammar.EPSILON) < p < (1+WeightedGrammar.EPSILON)):
93 raise ValueError("Productions for %r do not sum to 1" % lhs)
94
95 -def induce(start, productions):
96 """
97 Induce a PCFG grammar from a list of productions.
98
99 The probability of a production A -> B C in a PCFG is:
100
101 | count(A -> B C)
102 | P(B, C | A) = --------------- where * is any right hand side
103 | count(A -> *)
104
105 @param start: The start symbol
106 @type start: L{Nonterminal}
107 @param productions: The list of productions that defines the grammar
108 @type productions: C{list} of L{Production}
109 """
110
111 pcount = {}
112 lcount = {}
113
114 for prod in productions:
115 lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
116 pcount[prod] = pcount.get(prod, 0) + 1
117
118 prods = [WeightedProduction(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()])\
119 for p in pcount]
120 return WeightedGrammar(start, prods)
121
122
123
124
125
126 _PARSE_RE = re.compile(r'''^\s* # leading whitespace
127 (\w+(?:/\w+)?)\s* # lhs
128 (?:[-=]+>)\s* # arrow
129 (?:( # rhs:
130 "[^"]+" # doubled-quoted terminal
131 | '[^']+' # single-quoted terminal
132 | \w+(?:/\w+)? # non-terminal
133 | \[[01]?\.\d+\] # probability
134 | \| # disjunction
135 )
136 \s*) # trailing space
137 *$''',
138 re.VERBOSE)
139 _SPLIT_RE = re.compile(r'''(\w+(?:/\w+)?|\[[01]?\.\d+\]|[-=]+>|"[^"]+"|'[^']+'|\|)''')
140
142 """
143 Returns a list of PCFG productions
144 """
145
146 if not _PARSE_RE.match(s):
147 raise ValueError, 'Bad production string'
148
149 pieces = _SPLIT_RE.split(s)
150 pieces = [p for i,p in enumerate(pieces) if i%2==1]
151 lhside = Nonterminal(pieces[0])
152 rhsides = [[]]
153 probabilities = [0.0]
154 found_terminal = found_non_terminal = False
155 for piece in pieces[2:]:
156 if piece == '|':
157 rhsides.append([])
158 probabilities.append(0.0)
159 found_terminal = found_non_terminal = False
160 elif piece[0] in ('"', "'"):
161 rhsides[-1].append(piece[1:-1])
162 found_terminal = True
163 elif piece[0] in "[":
164 probabilities[-1] = float(piece[1:-1])
165 else:
166 rhsides[-1].append(Nonterminal(piece))
167 found_non_terminal = True
168 if found_terminal and found_non_terminal:
169 raise ValueError, 'Bad right-hand-side: do not mix terminals and non-terminals'
170 return [WeightedProduction(lhside, rhside, prob=probability)
171 for (rhside, probability) in zip(rhsides, probabilities)]
172
185
186
187 toy1 = parse_pcfg("""
188 S -> NP VP [1.0]
189 NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
190 Det -> 'the' [0.8] | 'my' [0.2]
191 N -> 'man' [0.5] | 'telescope' [0.5]
192 VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
193 V -> 'ate' [0.35] | 'saw' [0.65]
194 PP -> P NP [1.0]
195 P -> 'with' [0.61] | 'under' [0.39]
196 """)
197
198 toy2 = parse_pcfg("""
199 S -> NP VP [1.0]
200 VP -> V NP [.59]
201 VP -> V [.40]
202 VP -> VP PP [.01]
203 NP -> Det N [.41]
204 NP -> Name [.28]
205 NP -> NP PP [.31]
206 PP -> P NP [1.0]
207 V -> 'saw' [.21]
208 V -> 'ate' [.51]
209 V -> 'ran' [.28]
210 N -> 'boy' [.11]
211 N -> 'cookie' [.12]
212 N -> 'table' [.13]
213 N -> 'telescope' [.14]
214 N -> 'hill' [.5]
215 Name -> 'Jack' [.52]
216 Name -> 'Bob' [.48]
217 P -> 'with' [.61]
218 P -> 'under' [.39]
219 Det -> 'the' [.41]
220 Det -> 'a' [.31]
221 Det -> 'my' [.28]
222 """)
223
224
225
226
227
229 """
230 A demonstration showing how PCFG C{Grammar}s can be created and used.
231 """
232
233 from nltk_lite.corpora import treebank, extract
234 from nltk_lite.parse import cfg, pcfg, pchart, treetransforms
235 from itertools import islice
236
237 pcfg_prods = pcfg.toy1.productions()
238
239 pcfg_prod = pcfg_prods[2]
240 print 'A PCFG production:', `pcfg_prod`
241 print ' pcfg_prod.lhs() =>', `pcfg_prod.lhs()`
242 print ' pcfg_prod.rhs() =>', `pcfg_prod.rhs()`
243 print ' pcfg_prod.prob() =>', `pcfg_prod.prob()`
244 print
245
246 grammar = pcfg.toy2
247 print 'A PCFG grammar:', `grammar`
248 print ' grammar.start() =>', `grammar.start()`
249 print ' grammar.productions() =>',
250
251 print `grammar.productions()`.replace(',', ',\n'+' '*26)
252 print
253
254
255 print "Induce PCFG grammar from treebank data:"
256
257 productions = []
258 for tree in islice(treebank.parsed(),3):
259
260
261
262
263 productions += tree.productions()
264
265 S = Nonterminal('S')
266 grammar = pcfg.induce(S, productions)
267 print grammar
268 print
269
270 print "Parse sentence using induced grammar:"
271
272 parser = pchart.InsideParse(grammar)
273 parser.trace(3)
274
275 sent = extract(0, treebank.raw())
276 print sent
277 for parse in parser.get_parse_list(sent):
278 print parse
279
280 if __name__ == '__main__':
281 demo()
282