1
2
3
4
5
6
7
8
9
10
11
12 _NINF = float('-1e300')
13
14 """
15 Classes for representing and processing probabilistic information.
16
17 The L{FreqDist} class is used to encode X{frequency distributions},
18 which count the number of times that each outcome of an experiment
19 occurs.
20
21 The L{ProbDistI} class defines a standard interface for X{probability
22 distributions}, which encode the probability of each outcome for an
23 experiment. There are two types of probability distribution:
24
25 - X{derived probability distributions} are created from frequency
26 distributions. They attempt to model the probability distribution
27 that generated the frequency distribution.
28 - X{analytic probability distributions} are created directly from
29 parameters (such as variance).
30
31 The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface
32 are used to encode conditional distributions. Conditional probability
33 distributions can be derived or analytic; but currently the only
34 implementation of the C{ConditionalProbDistI} interface is
35 L{ConditionalProbDist}, a derived distribution.
36
37 """
38
39 import types, math
40
41
42
43
44
46 """
47 A frequency distribution for the outcomes of an experiment. A
48 frequency distribution records the number of times each outcome of
49 an experiment has occured. For example, a frequency distribution
50 could be used to record the frequency of each word type in a
51 document. Formally, a frequency distribution can be defined as a
52 function mapping from each sample to the number of times that
53 sample occured as an outcome.
54
55 Frequency distributions are generally constructed by running a
56 number of experiments, and incrementing the count for a sample
57 every time it is an outcome of an experiment. For example, the
58 following code will produce a frequency distribution that encodes
59 how often each word occurs in a text:
60
61 >>> fdist = FreqDist()
62 >>> for word in tokenize.whitespace(sent):
63 ... fdist.inc(word)
64 """
66 """
67 Construct a new empty, C{FreqDist}. In particular, the count
68 for every sample is zero.
69 """
70 self._count = {}
71 self._N = 0
72 self._Nr_cache = None
73 self._max_cache = None
74
75 - def inc(self, sample, count=1):
76 """
77 Increment this C{FreqDist}'s count for the given
78 sample.
79
80 @param sample: The sample whose count should be incremented.
81 @type sample: any
82 @param count: The amount to increment the sample's count by.
83 @type count: C{int}
84 @rtype: None
85 @raise NotImplementedError: If C{sample} is not a
86 supported sample type.
87 """
88 if count == 0: return
89
90 self._N += count
91 self._count[sample] = self._count.get(sample,0) + count
92
93
94 self._Nr_cache = None
95 self._max_cache = None
96
98 """
99 @return: The total number of sample outcomes that have been
100 recorded by this C{FreqDist}. For the number of unique
101 sample values (or bins) with counts greater than zero, use
102 C{FreqDist.B()}.
103 @rtype: C{int}
104 """
105 return self._N
106
108 """
109 @return: The total number of sample values (or X{bins}) that
110 have counts greater than zero. For the total
111 number of sample outcomes recorded, use C{FreqDist.N()}.
112 @rtype: C{int}
113 """
114 return len(self._count)
115
117 """
118 @return: A list of all samples that have been recorded as
119 outcomes by this frequency distribution. Use C{count()}
120 to determine the count for each sample.
121 @rtype: C{list}
122 """
123 return self._count.keys()
124
125 - def Nr(self, r, bins=None):
126 """
127 @return: The number of samples with count r.
128 @rtype: C{int}
129 @type r: C{int}
130 @param r: A sample count.
131 @type bins: C{int}
132 @param bins: The number of possible sample outcomes. C{bins}
133 is used to calculate Nr(0). In particular, Nr(0) is
134 C{bins-self.B()}. If C{bins} is not specified, it
135 defaults to C{self.B()} (so Nr(0) will be 0).
136 """
137 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative'
138
139
140 if r == 0:
141 if bins is None: return 0
142 else: return bins-self.B()
143
144
145
146
147 if self._Nr_cache is None:
148 self._cache_Nr_values()
149
150 if r >= len(self._Nr_cache): return 0
151 return self._Nr_cache[r]
152
161
162 - def count(self, sample):
163 """
164 Return the count of a given sample. The count of a sample is
165 defined as the number of times that sample outcome was
166 recorded by this C{FreqDist}. Counts are non-negative
167 integers.
168
169 @return: The count of a given sample.
170 @rtype: C{int}
171 @param sample: the sample whose count
172 should be returned.
173 @type sample: any.
174 """
175 return self._count.get(sample, 0)
176
177 - def freq(self, sample):
178 """
179 Return the frequency of a given sample. The frequency of a
180 sample is defined as the count of that sample divided by the
181 total number of sample outcomes that have been recorded by
182 this C{FreqDist}. The count of a sample is defined as the
183 number of times that sample outcome was recorded by this
184 C{FreqDist}. Frequencies are always real numbers in the range
185 [0, 1].
186
187 @return: The frequency of a given sample.
188 @rtype: float
189 @param sample: the sample whose frequency
190 should be returned.
191 @type sample: any
192 """
193 if self._N is 0: return 0
194 return float(self._count.get(sample, 0)) / self._N
195
197 """
198 Return the sample with the greatest number of outcomes in this
199 frequency distribution. If two or more samples have the same
200 number of outcomes, return one of them; which sample is
201 returned is undefined. If no outcomes have occured in this
202 frequency distribution, return C{None}.
203
204 @return: The sample with the maximum number of outcomes in this
205 frequency distribution.
206 @rtype: any or C{None}
207 """
208 if self._max_cache is None:
209 best_sample = None
210 best_count = -1
211 for sample in self._count.keys():
212 if self._count[sample] > best_count:
213 best_sample = sample
214 best_count = self._count[sample]
215 self._max_cache = best_sample
216 return self._max_cache
217
219 """
220 Return the samples sorted in decreasing order of frequency. Instances
221 with the same count will be arbitrarily ordered. Instances with a
222 count of zero will be omitted. This method is C{O(N^2)}, where C{N} is
223 the number of samples, but will complete in a shorter time on average.
224
225 @return: The set of samples in sorted order.
226 @rtype: sequence of any
227 """
228 items = [(-count,sample) for (sample,count) in self._count.items()]
229 items.sort()
230 return [sample for (neg_count,sample) in items]
231
233 """
234 @return: A string representation of this C{FreqDist}.
235 @rtype: string
236 """
237 return '<FreqDist with %d samples>' % self.N()
238
240 """
241 @return: A string representation of this C{FreqDist}.
242 @rtype: string
243 """
244 samples = self.sorted_samples()
245 items = ['%r: %r' % (s, self._count[s]) for s in samples]
246 return '<FreqDist: %s>' % ', '.join(items)
247
249 """
250 @return: True if the given sample occurs one or more times in
251 this frequency distribution.
252 @rtype: C{boolean}
253 @param sample: The sample to search for.
254 @type sample: any
255 """
256 return self._count.has_key(sample)
257
258
259
260
261
263 """
264 A probability distribution for the outcomes of an experiment. A
265 probability distribution specifies how likely it is that an
266 experiment will have any given outcome. For example, a
267 probability distribution could be used to predict the probability
268 that a token in a document will have a given type. Formally, a
269 probability distribution can be defined as a function mapping from
270 samples to nonnegative real numbers, such that the sum of every
271 number in the function's range is 1.0. C{ProbDist}s are often
272 used to model the probability distribution of the experiment used
273 to generate a frequency distribution.
274 """
278
279 - def prob(self, sample):
280 """
281 @return: the probability for a given sample. Probabilities
282 are always real numbers in the range [0, 1].
283 @rtype: float
284 @param sample: The sample whose probability
285 should be returned.
286 @type sample: any
287 """
288 raise AssertionError()
289
291 """
292 @return: the natural logarithm of the probability for a given
293 sample. Log probabilities range from negitive infinity to
294 zero.
295 @rtype: float
296 @param sample: The sample whose probability
297 should be returned.
298 @type sample: any
299 """
300
301 p = self.prob(sample)
302 if p == 0:
303
304
305 return _NINF
306 else:
307 return math.log(p)
308
310 """
311 @return: the sample with the greatest probability. If two or
312 more samples have the same probability, return one of them;
313 which sample is returned is undefined.
314 @rtype: any
315 """
316 raise AssertionError()
317
319 """
320 @return: A list of all samples that have nonzero
321 probabilities. Use C{prob} to find the probability of
322 each sample.
323 @rtype: C{list}
324 """
325 raise AssertionError()
326
357
359 """
360 A probability distribution whose probabilities are directly
361 specified by a given dictionary. The given dictionary maps
362 samples to probabilities.
363 """
364 - def __init__(self, prob_dict=None, log=False, normalize=False):
365 """
366 Construct a new probability distribution from the given
367 dictionary, which maps values to probabilities (or to log
368 probabilities, if C{log} is true). If C{normalize} is
369 true, then the probability values are scaled by a constant
370 factor such that they sum to 1.
371 """
372 self._prob_dict = prob_dict.copy()
373 self._log = log
374
375
376 if normalize:
377 if log:
378 value_sum = sum_logs(self._prob_dict.values())
379 if value_sum <= _NINF:
380 logp = math.log(1.0/len(prob_dict.keys()))
381 for x in prob_dict.keys():
382 self._prob_dict[x] = logp
383 else:
384 for (x, p) in self._prob_dict.items():
385 self._prob_dict[x] -= value_sum
386 else:
387 value_sum = sum(self._prob_dict.values())
388 if value_sum == 0:
389 p = 1.0/len(prob_dict.keys())
390 for x in prob_dict.keys():
391 self._prob_dict[x] = p
392 else:
393 norm_factor = 1.0/value_sum
394 for (x, p) in self._prob_dict.items():
395 self._prob_dict[x] *= norm_factor
396
397 - def prob(self, sample):
398 if self._log:
399 if sample not in self._prob_dict: return 0
400 else: return math.exp(self._prob_dict[sample])
401 else:
402 return self._prob_dict.get(sample, 0)
403
410
412 if not hasattr(self, '_max'):
413 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1]
414 return self._max
416 return self._prob_dict.keys()
418 return '<ProbDist with %d samples>' % len(self._prob_dict)
419
421 """
422 The maximum likelihood estimate for the probability distribution
423 of the experiment used to generate a frequency distribution. The
424 X{maximum likelihood estimate} approximates the probability of
425 each sample as the frequency of that sample in the frequency
426 distribution.
427 """
429 """
430 Use the maximum likelihood estimate to create a probability
431 distribution for the experiment used to generate C{freqdist}.
432
433 @type freqdist: C{FreqDist}
434 @param freqdist: The frequency distribution that the
435 probability estimates should be based on.
436 """
437 if freqdist.N() == 0:
438 raise ValueError('An MLE probability distribution must '+
439 'have at least one sample.')
440
441 self._freqdist = freqdist
442
444 """
445 @return: The frequency distribution that this probability
446 distribution is based on.
447 @rtype: C{FreqDist}
448 """
449 return self._freqdist
450
451 - def prob(self, sample):
453
455 return self._freqdist.max()
456
458 return self._freqdist.samples()
459
461 """
462 @rtype: C{string}
463 @return: A string representation of this C{ProbDist}.
464 """
465 return '<MLEProbDist based on %d samples>' % self._freqdist.N()
466
468 """
469 The Lidstone estimate for the probability distribution of the
470 experiment used to generate a frequency distribution. The
471 C{Lidstone estimate} is paramaterized by a real number M{gamma},
472 which typically ranges from 0 to 1. The X{Lidstone estimate}
473 approximates the probability of a sample with count M{c} from an
474 experiment with M{N} outcomes and M{B} bins as
475 M{(c+gamma)/(N+B*gamma)}. This is equivalant to adding
476 M{gamma} to the count for each bin, and taking the maximum
477 likelihood estimate of the resulting frequency distribution.
478 """
479 - def __init__(self, freqdist, gamma, bins=None):
480 """
481 Use the Lidstone estimate to create a probability distribution
482 for the experiment used to generate C{freqdist}.
483
484 @type freqdist: C{FreqDist}
485 @param freqdist: The frequency distribution that the
486 probability estimates should be based on.
487 @type gamma: C{float}
488 @param gamma: A real number used to paramaterize the
489 estimate. The Lidstone estimate is equivalant to adding
490 M{gamma} to the count for each bin, and taking the
491 maximum likelihood estimate of the resulting frequency
492 distribution.
493 @type bins: C{int}
494 @param bins: The number of sample values that can be generated
495 by the experiment that is described by the probability
496 distribution. This value must be correctly set for the
497 probabilities of the sample values to sum to one. If
498 C{bins} is not specified, it defaults to C{freqdist.B()}.
499 """
500 if (bins == 0) or (bins is None and freqdist.N() == 0):
501 name = self.__class__.__name__[:-8]
502 raise ValueError('A %s probability distribution ' % name +
503 'must have at least one bin.')
504 if (bins is not None) and (bins < freqdist.B()):
505 name = self.__class__.__name__[:-8]
506 raise ValueError('\nThe number of bins in a %s must be ' % name +
507 'greater than or equal to\nthe number of '+
508 'bins in the FreqDist used to create it.')
509
510 self._freqdist = freqdist
511 self._gamma = float(gamma)
512 self._N = self._freqdist.N()
513
514 if bins is None: bins = freqdist.B()
515 self._bins = bins
516
518 """
519 @return: The frequency distribution that this probability
520 distribution is based on.
521 @rtype: C{FreqDist}
522 """
523 return self._freqdist
524
525 - def prob(self, sample):
526 c = self._freqdist.count(sample)
527 return (c + self._gamma) / (self._N + self._bins * self._gamma)
528
530
531
532
533 return self._freqdist.max()
534
536 return self._freqdist.samples()
537
539 """
540 @rtype: C{string}
541 @return: A string representation of this C{ProbDist}.
542 """
543 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
544
546 """
547 The Laplace estimate for the probability distribution of the
548 experiment used to generate a frequency distribution. The
549 X{Lidstone estimate} approximates the probability of a sample with
550 count M{c} from an experiment with M{N} outcomes and M{B} bins as
551 M{(c+1)/(N+B)}. This is equivalant to adding one to the count for
552 each bin, and taking the maximum likelihood estimate of the
553 resulting frequency distribution.
554 """
555 - def __init__(self, freqdist, bins=None):
556 """
557 Use the Laplace estimate to create a probability distribution
558 for the experiment used to generate C{freqdist}.
559
560 @type freqdist: C{FreqDist}
561 @param freqdist: The frequency distribution that the
562 probability estimates should be based on.
563 @type bins: C{int}
564 @param bins: The number of sample values that can be generated
565 by the experiment that is described by the probability
566 distribution. This value must be correctly set for the
567 probabilities of the sample values to sum to one. If
568 C{bins} is not specified, it defaults to C{freqdist.B()}.
569 """
570 LidstoneProbDist.__init__(self, freqdist, 1, bins)
571
573 """
574 @rtype: C{string}
575 @return: A string representation of this C{ProbDist}.
576 """
577 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
578
580 """
581 The expected likelihood estimate for the probability distribution
582 of the experiment used to generate a frequency distribution. The
583 X{expected likelihood estimate} approximates the probability of a
584 sample with count M{c} from an experiment with M{N} outcomes and
585 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalant to adding 0.5
586 to the count for each bin, and taking the maximum likelihood
587 estimate of the resulting frequency distribution.
588 """
589 - def __init__(self, freqdist, bins=None):
590 """
591 Use the expected likelihood estimate to create a probability
592 distribution for the experiment used to generate C{freqdist}.
593
594 @type freqdist: C{FreqDist}
595 @param freqdist: The frequency distribution that the
596 probability estimates should be based on.
597 @type bins: C{int}
598 @param bins: The number of sample values that can be generated
599 by the experiment that is described by the probability
600 distribution. This value must be correctly set for the
601 probabilities of the sample values to sum to one. If
602 C{bins} is not specified, it defaults to C{freqdist.B()}.
603 """
604 LidstoneProbDist.__init__(self, freqdist, 0.5, bins)
605
607 """
608 @rtype: C{string}
609 @return: A string representation of this C{ProbDist}.
610 """
611 return '<ELEProbDist based on %d samples>' % self._freqdist.N()
612
614 """
615 The heldout estimate for the probability distribution of the
616 experiment used to generate two frequency distributions. These
617 two frequency distributions are called the "heldout frequency
618 distribution" and the "base frequency distribution." The
619 X{heldout estimate} uses uses the X{heldout frequency
620 distribution} to predict the probability of each sample, given its
621 frequency in the X{base frequency distribution}.
622
623 In particular, the heldout estimate approximates the probability
624 for a sample that occurs M{r} times in the base distribution as
625 the average frequency in the heldout distribution of all samples
626 that occur M{r} times in the base distribution.
627
628 This average frequency is M{Tr[r]/(Nr[r]*N)}, where:
629 - M{Tr[r]} is the total count in the heldout distribution for
630 all samples that occur M{r} times in the base
631 distribution.
632 - M{Nr[r]} is the number of samples that occur M{r} times in
633 the base distribution.
634 - M{N} is the number of outcomes recorded by the heldout
635 frequency distribution.
636
637 In order to increase the efficiency of the C{prob} member
638 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r}
639 when the C{HeldoutProbDist} is created.
640
641 @type _estimate: C{list} of C{float}
642 @ivar _estimate: A list mapping from M{r}, the number of
643 times that a sample occurs in the base distribution, to the
644 probability estimate for that sample. C{_estimate[M{r}]} is
645 calculated by finding the average frequency in the heldout
646 distribution of all samples that occur M{r} times in the base
647 distribution. In particular, C{_estimate[M{r}]} =
648 M{Tr[r]/(Nr[r]*N)}.
649 @type _max_r: C{int}
650 @ivar _max_r: The maximum number of times that any sample occurs
651 in the base distribution. C{_max_r} is used to decide how
652 large C{_estimate} must be.
653 """
654 - def __init__(self, base_fdist, heldout_fdist, bins=None):
655 """
656 Use the heldout estimate to create a probability distribution
657 for the experiment used to generate C{base_fdist} and
658 C{heldout_fdist}.
659
660 @type base_fdist: C{FreqDist}
661 @param base_fdist: The base frequency distribution.
662 @type heldout_fdist: C{FreqDist}
663 @param heldout_fdist: The heldout frequency distribution.
664 @type bins: C{int}
665 @param bins: The number of sample values that can be generated
666 by the experiment that is described by the probability
667 distribution. This value must be correctly set for the
668 probabilities of the sample values to sum to one. If
669 C{bins} is not specified, it defaults to C{freqdist.B()}.
670 """
671
672 self._base_fdist = base_fdist
673 self._heldout_fdist = heldout_fdist
674
675
676 self._max_r = base_fdist.count(base_fdist.max())
677
678
679 Tr = self._calculate_Tr()
680 Nr = [base_fdist.Nr(r, bins) for r in range(self._max_r+1)]
681 N = heldout_fdist.N()
682
683
684
685 self._estimate = self._calculate_estimate(Tr, Nr, N)
686
688 """
689 @return: the list M{Tr}, where M{Tr[r]} is the total count in
690 C{heldout_fdist} for all samples that occur M{r}
691 times in C{base_fdist}.
692 @rtype: C{list} of C{float}
693 """
694 Tr = [0.0] * (self._max_r+1)
695 for sample in self._heldout_fdist.samples():
696 r = self._base_fdist.count(sample)
697 Tr[r] += self._heldout_fdist.count(sample)
698 return Tr
699
701 """
702 @return: the list M{estimate}, where M{estimate[r]} is the
703 probability estimate for any sample that occurs M{r} times
704 in the base frequency distribution. In particular,
705 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case
706 that M{N[r]=0}, M{estimate[r]} will never be used; so we
707 define M{estimate[r]=None} for those cases.
708 @rtype: C{list} of C{float}
709 @type Tr: C{list} of C{float}
710 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in
711 the heldout distribution for all samples that occur M{r}
712 times in base distribution.
713 @type Nr: C{list} of C{float}
714 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of
715 samples that occur M{r} times in the base distribution.
716 @type N: C{int}
717 @param N: The total number of outcomes recorded by the heldout
718 frequency distribution.
719 """
720 estimate = []
721 for r in range(self._max_r+1):
722 if Nr[r] == 0: estimate.append(None)
723 else: estimate.append(Tr[r]/(Nr[r]*N))
724 return estimate
725
727 """
728 @return: The base frequency distribution that this probability
729 distribution is based on.
730 @rtype: C{FreqDist}
731 """
732 return self._base_fdist
733
735 """
736 @return: The heldout frequency distribution that this
737 probability distribution is based on.
738 @rtype: C{FreqDist}
739 """
740 return self._heldout_fdist
741
742 - def prob(self, sample):
743
744 r = self._base_fdist.count(sample)
745 return self._estimate[r]
746
748
749
750
751 return self._base_fdist.max()
752
754 """
755 @rtype: C{string}
756 @return: A string representation of this C{ProbDist}.
757 """
758 s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
759 return s % (self._base_fdist.N(), self._heldout_fdist.N())
760
762 """
763 The cross-validation estimate for the probability distribution of
764 the experiment used to generate a set of frequency distribution.
765 The X{cross-validation estimate} for the probability of a sample
766 is found by averaging the held-out estimates for the sample in
767 each pair of frequency distributions.
768 """
770 """
771 Use the cross-validation estimate to create a probability
772 distribution for the experiment used to generate
773 C{freqdists}.
774
775 @type freqdists: C{list} of C{FreqDist}
776 @param freqdists: A list of the frequency distributions
777 generated by the experiment.
778 @type bins: C{int}
779 @param bins: The number of sample values that can be generated
780 by the experiment that is described by the probability
781 distribution. This value must be correctly set for the
782 probabilities of the sample values to sum to one. If
783 C{bins} is not specified, it defaults to C{freqdist.B()}.
784 """
785 self._freqdists = freqdists
786
787
788
789 self._heldout_probdists = []
790 for fdist1 in freqdists:
791 for fdist2 in freqdists:
792 if fdist1 is not fdist2:
793 probdist = HeldoutProbDist(fdist1, fdist2, bins)
794 self._heldout_probdists.append(probdist)
795
797 """
798 @rtype: C{list} of C{FreqDist}
799 @return: The list of frequency distributions that this
800 C{ProbDist} is based on.
801 """
802 return self._freqdists
803
804 - def prob(self, sample):
805
806
807 prob = 0.0
808 for heldout_probdist in self._heldout_probdists:
809 prob += heldout_probdist.prob(sample)
810 return prob/len(self._heldout_probdists)
811
813 """
814 @rtype: C{string}
815 @return: A string representation of this C{ProbDist}.
816 """
817 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
818
820 """
821 The Witten-Bell estimate of a probability distribution. This distribution
822 allocates uniform probability mass to as yet unseen events by using the
823 number of events that have only been seen once. The probability mass
824 reserved for unseen events is equal to:
825
826 - M{T / (N + T)}
827
828 where M{T} is the number of observed event types and M{N} is the total
829 number of observed events. This equates to the maximum likelihood estimate
830 of a new type event occuring. The remaining probability mass is discounted
831 such that all probability estimates sum to one, yielding:
832
833 - M{p = T / Z (N + T)}, if count = 0
834 - M{p = c / (N + T)}, otherwise
835 """
836
837 - def __init__(self, freqdist, bins=None):
838 """
839 Creates a distribution of Witten-Bell probability estimates. This
840 distribution allocates uniform probability mass to as yet unseen
841 events by using the number of events that have only been seen once.
842 The probability mass reserved for unseen events is equal to:
843
844 - M{T / (N + T)}
845
846 where M{T} is the number of observed event types and M{N} is the total
847 number of observed events. This equates to the maximum likelihood
848 estimate of a new type event occuring. The remaining probability mass
849 is discounted such that all probability estimates sum to one,
850 yielding:
851
852 - M{p = T / Z (N + T)}, if count = 0
853 - M{p = c / (N + T)}, otherwise
854
855 The parameters M{T} and M{N} are taken from the C{freqdist} parameter
856 (the C{B()} and C{N()} values). The normalising factor M{Z} is
857 calculated using these values along with the C{bins} parameter.
858
859 @param freqdist: The frequency counts upon which to base the
860 estimation.
861 @type freqdist: C{FreqDist}
862 @param bins: The number of possible event types. This must be
863 at least as large as the number of bins in the
864 C{freqdist}. If C{None}, then it's assumed to be
865 equal to that of the C{freqdist}
866 @type bins: C{Int}
867 """
868 assert bins == None or bins >= freqdist.B(),\
869 'Bins parameter must not be less than freqdist.B()'
870 if bins == None:
871 bins = freqdist.B()
872 self._freqdist = freqdist
873 self._T = self._freqdist.B()
874 self._Z = bins - self._freqdist.B()
875 self._N = self._freqdist.N()
876
877 - def prob(self, sample):
878
879 c = self._freqdist.count(sample)
880 if c == 0:
881 return self._T / float(self._Z * (self._N + self._T))
882 else:
883 return c / float(self._N + self._T)
884
886 return self._freqdist.max()
887
889 return self._freqdist.samples()
890
892 return self._freqdist
893
895 """
896 @rtype: C{string}
897 @return: A string representation of this C{ProbDist}.
898 """
899 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
900
902 """
903 The Good-Turing estimate of a probability distribution. This method
904 calculates the probability mass to assign to events with zero or low
905 counts based on the number of events with higher counts. It does so by
906 using the smoothed count M{c*}:
907
908 - M{c* = (c + 1) N(c + 1) / N(c)}
909
910 where M{c} is the original count, M{N(i)} is the number of event types
911 observed with count M{i}. These smoothed counts are then normalised to
912 yield a probability distribution.
913 """
914
915
916
918 """
919 Creates a Good-Turing probability distribution estimate. This method
920 calculates the probability mass to assign to events with zero or low
921 counts based on the number of events with higher counts. It does so by
922 using the smoothed count M{c*}:
923
924 - M{c* = (c + 1) N(c + 1) / N(c)}
925
926 where M{c} is the original count, M{N(i)} is the number of event types
927 observed with count M{i}. These smoothed counts are then normalised to
928 yield a probability distribution.
929
930 The C{bins} parameter allows C{N(0)} to be estimated.
931
932 @param freqdist: The frequency counts upon which to base the
933 estimation.
934 @type freqdist: C{FreqDist}
935 @param bins: The number of possible event types. This must be
936 at least as large as the number of bins in the
937 C{freqdist}. If C{None}, then it's taken to be
938 equal to C{freqdist.B()}.
939 @type bins: C{Int}
940 """
941 assert bins == None or bins >= freqdist.B(),\
942 'Bins parameter must not be less than freqdist.B()'
943 if bins == None:
944 bins = freqdist.B()
945 self._freqdist = freqdist
946 self._bins = bins
947
948 - def prob(self, sample):
949
950 c = self._freqdist.count(sample)
951 nc = self._freqdist.Nr(c, self._bins)
952 ncn = self._freqdist.Nr(c + 1, self._bins)
953
954
955 if nc == 0 or self._freqdist.N() == 0:
956 return 0.0
957
958 return float(c + 1) * ncn / (nc * self._freqdist.N())
959
961 return self._freqdist.max()
962
964 return self._freqdist.samples()
965
967 return self._freqdist
968
970 """
971 @rtype: C{string}
972 @return: A string representation of this C{ProbDist}.
973 """
974 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()
975
977 """
978 An mutable probdist where the probabilities may be easily modified. This
979 simply copies an existing probdist, storing the probability values in a
980 mutable dictionary and providing an update method.
981 """
982
983 - def __init__(self, prob_dist, samples, store_logs=True):
984 """
985 Creates the mutable probdist based on the given prob_dist and using
986 the list of samples given. These values are stored as log
987 probabilities if the store_logs flag is set.
988
989 @param prob_dist: the distribution from which to garner the
990 probabilities
991 @type prob_dist: ProbDist
992 @param samples: the complete set of samples
993 @type samples: sequence of any
994 @param store_logs: whether to store the probabilities as logarithms
995 @type store_logs: bool
996 """
997 try:
998 import numpy
999 except ImportError:
1000 print "Error: Please install numpy; for instructions see http://nltk.sf.net/install.html"
1001 exit()
1002 self._samples = samples
1003 self._sample_dict = dict((samples[i], i) for i in range(len(samples)))
1004 self._data = numpy.zeros(len(samples), numpy.float64)
1005 for i in range(len(samples)):
1006 if store_logs:
1007 self._data[i] = prob_dist.logprob(samples[i])
1008 else:
1009 self._data[i] = prob_dist.prob(samples[i])
1010 self._logs = store_logs
1011
1013
1014 return self._samples
1015
1016 - def prob(self, sample):
1017
1018 i = self._sample_dict.get(sample)
1019 if i != None:
1020 if self._logs:
1021 return exp(self._data[i])
1022 else:
1023 return self._data[i]
1024 else:
1025 return 0.0
1026
1028
1029 i = self._sample_dict.get(sample)
1030 if i != None:
1031 if self._logs:
1032 return self._data[i]
1033 else:
1034 return log(self._data[i])
1035 else:
1036 return float('-inf')
1037
1038 - def update(self, sample, prob, log=True):
1039 """
1040 Update the probability for the given sample. This may cause the object
1041 to stop being the valid probability distribution - the user must
1042 ensure that they update the sample probabilities such that all samples
1043 have probabilities between 0 and 1 and that all probabilities sum to
1044 one.
1045
1046 @param sample: the sample for which to update the probability
1047 @type sample: C{any}
1048 @param prob: the new probability
1049 @type prob: C{float}
1050 @param log: is the probability already logged
1051 @type log: C{bool}
1052 """
1053 i = self._sample_dict.get(sample)
1054 assert i != None
1055 if self._logs:
1056 if log: self._data[i] = prob
1057 else: self._data[i] = log(prob)
1058 else:
1059 if log: self._data[i] = exp(prob)
1060 else: self._data[i] = prob
1061
1062
1063
1064
1065
1067
1068 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s))
1069 for s in actual_pdist.samples())
1070
1071
1072
1073
1074
1076 """
1077 A collection of frequency distributions for a single experiment
1078 run under different conditions. Conditional frequency
1079 distributions are used to record the number of times each sample
1080 occured, given the condition under which the experiment was run.
1081 For example, a conditional frequency distribution could be used to
1082 record the frequency of each word (type) in a document, given its
1083 length. Formally, a conditional frequency distribution can be
1084 defined as a function that maps from each condition to the
1085 C{FreqDist} for the experiment under that condition.
1086
1087 The frequency distribution for each condition is accessed using
1088 the indexing operator:
1089
1090 >>> cfdist[3]
1091 <FreqDist with 73 outcomes>
1092 >>> cfdist[3].freq('the')
1093 0.4
1094 >>> cfdist[3].count('dog')
1095 2
1096
1097 When the indexing operator is used to access the frequency
1098 distribution for a condition that has not been accessed before,
1099 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that
1100 condition.
1101
1102 Conditional frequency distributions are typically constructed by
1103 repeatedly running an experiment under a variety of conditions,
1104 and incrementing the sample outcome counts for the appropriate
1105 conditions. For example, the following code will produce a
1106 conditional frequency distribution that encodes how often each
1107 word type occurs, given the length of that word type:
1108
1109 >>> cfdist = ConditionalFreqDist()
1110 >>> for word in tokenize.whitespace(sent):
1111 ... condition = len(word)
1112 ... cfdist[condition].inc(word)
1113 """
1115 """
1116 Construct a new empty conditional frequency distribution. In
1117 particular, the count for every sample, under every condition,
1118 is zero.
1119 """
1120 self._fdists = {}
1121
1123 """
1124 Return the frequency distribution that encodes the frequency
1125 of each sample outcome, given that the experiment was run
1126 under the given condition. If the frequency distribution for
1127 the given condition has not been accessed before, then this
1128 will create a new empty C{FreqDist} for that condition.
1129
1130 @return: The frequency distribution that encodes the frequency
1131 of each sample outcome, given that the experiment was run
1132 under the given condition.
1133 @rtype: C{FreqDist}
1134
1135 @param condition: The condition under which the experiment was
1136 run.
1137 @type condition: any
1138 """
1139
1140 if not self._fdists.has_key(condition):
1141 self._fdists[condition] = FreqDist()
1142
1143 return self._fdists[condition]
1144
1146 """
1147 @return: A list of the conditions that have been accessed for
1148 this C{ConditionalFreqDist}. Use the indexing operator to
1149 access the frequency distribution for a given condition.
1150 Note that the frequency distributions for some conditions
1151 may contain zero sample outcomes.
1152 @rtype: C{list}
1153 """
1154 return self._fdists.keys()
1155
1157 """
1158 @return: A string representation of this
1159 C{ConditionalFreqDist}.
1160 @rtype: C{string}
1161 """
1162 n = len(self._fdists)
1163 return '<ConditionalFreqDist with %d conditions>' % n
1164
1166 """
1167 A collection of probability distributions for a single experiment
1168 run under different conditions. Conditional probability
1169 distributions are used to estimate the likelihood of each sample,
1170 given the condition under which the experiment was run. For
1171 example, a conditional probability distribution could be used to
1172 estimate the probability of each word type in a document, given
1173 the length of the word type. Formally, a conditional probability
1174 distribution can be defined as a function that maps from each
1175 condition to the C{ProbDist} for the experiment under that
1176 condition.
1177 """
1180
1182 """
1183 @return: The probability distribution for the experiment run
1184 under the given condition.
1185 @rtype: C{ProbDistI}
1186 @param condition: The condition whose probability distribution
1187 should be returned.
1188 @type condition: any
1189 """
1190 raise AssertionError
1191
1193 """
1194 @return: A list of the conditions that are represented by
1195 this C{ConditionalProbDist}. Use the indexing operator to
1196 access the probability distribution for a given condition.
1197 @rtype: C{list}
1198 """
1199 raise AssertionError
1200
1201
1202
1203
1204
1205
1207 """
1208 A conditional probability distribution modelling the experiments
1209 that were used to generate a conditional frequency distribution.
1210 A C{ConditoinalProbDist} is constructed from a
1211 C{ConditionalFreqDist} and a X{C{ProbDist} factory}:
1212
1213 - The B{C{ConditionalFreqDist}} specifies the frequency
1214 distribution for each condition.
1215 - The B{C{ProbDist} factory} is a function that takes a
1216 condition's frequency distribution, and returns its
1217 probability distribution. A C{ProbDist} class's name (such as
1218 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify
1219 that class's constructor.
1220
1221 The first argument to the C{ProbDist} factory is the frequency
1222 distribution that it should model; and the remaining arguments are
1223 specified by the C{factory_args} parameter to the
1224 C{ConditionalProbDist} constructor. For example, the following
1225 code constructs a C{ConditionalProbDist}, where the probability
1226 distribution for each condition is an C{ELEProbDist} with 10 bins:
1227
1228 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10)
1229 >>> print cpdist['run'].max()
1230 'NN'
1231 >>> print cpdist['run'].prob('NN')
1232 0.0813
1233 """
1234 - def __init__(self, cfdist, probdist_factory,
1235 supply_condition=False, *factory_args):
1236 """
1237 Construct a new conditional probability distribution, based on
1238 the given conditional frequency distribution and C{ProbDist}
1239 factory.
1240
1241 @type cfdist: L{ConditionalFreqDist}
1242 @param cfdist: The C{ConditionalFreqDist} specifying the
1243 frequency distribution for each condition.
1244 @type probdist_factory: C{class} or C{function}
1245 @param probdist_factory: The function or class that maps
1246 a condition's frequency distribution to its probability
1247 distribution. The function is called with the frequency
1248 distribution as its first argument, the condition as its
1249 second argument (only if C{supply_condition=True}), and
1250 C{factory_args} as its remaining arguments.
1251 @type supply_condition: C{bool}
1252 @param supply_condition: If true, then pass the condition as
1253 the second argument to C{probdist_factory}.
1254 @type factory_args: (any)
1255 @param factory_args: Extra arguments for C{probdist_factory}.
1256 These arguments are usually used to specify extra
1257 properties for the probability distributions of individual
1258 conditions, such as the number of bins they contain.
1259 """
1260 self._probdist_factory = probdist_factory
1261 self._cfdist = cfdist
1262 self._supply_condition = supply_condition
1263 self._factory_args = factory_args
1264
1265 self._pdists = {}
1266 for c in cfdist.conditions():
1267 if supply_condition:
1268 pdist = probdist_factory(cfdist[c], c, *factory_args)
1269 else:
1270 pdist = probdist_factory(cfdist[c], *factory_args)
1271 self._pdists[c] = pdist
1272
1274 if not self._pdists.has_key(condition):
1275
1276
1277
1278 pdist = self._probdist_factory(FreqDist(), *self._factory_args)
1279 self._pdists[condition] = pdist
1280
1281 return self._pdists[condition]
1282
1284 return self._pdists.keys()
1285
1287 """
1288 @return: A string representation of this
1289 C{ConditionalProbDist}.
1290 @rtype: C{string}
1291 """
1292 n = len(self._pdists)
1293 return '<ConditionalProbDist with %d conditions>' % n
1294
1296 """
1297 An alternative ConditionalProbDist that simply wraps a dictionary of
1298 ProbDists rather than creating these from FreqDists.
1299 """
1300
1302 """
1303 @param probdist_dict: a dictionary containing the probdists indexed
1304 by the conditions
1305 @type probdist_dict: dict any -> probdist
1306 """
1307 self._dict = probdist_dict
1308
1310
1311
1312 return self._dict[condition]
1313
1315
1316 return self._dict.keys()
1317
1318
1319
1320
1321
1322
1323 _ADD_LOGS_MAX_DIFF = math.log(1e-30)
1324
1326 """
1327 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return
1328 M{log(x+y)}. Conceptually, this is the same as returning
1329 M{log(exp(C{logx})+exp(C{logy}))}, but the actual implementation
1330 avoids overflow errors that could result from direct computation.
1331 """
1332 if (logx < logy + _ADD_LOGS_MAX_DIFF):
1333 return logy
1334 if (logy < logx + _ADD_LOGS_MAX_DIFF):
1335 return logx
1336 base = min(logx, logy)
1337 return base + math.log(math.exp(logx-base) + math.exp(logy-base))
1338
1340 if len(logs) == 0:
1341
1342
1343 return _NINF
1344 else:
1345 return reduce(add_logs, logs[1:], logs[0])
1346
1347
1348
1349
1350
1352 """
1353 A mix-in class to associate probabilities with other classes
1354 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class,
1355 define a new class that derives from an existing class and from
1356 ProbabilisticMixIn. You will need to define a new constructor for
1357 the new class, which explicitly calls the constructors of both its
1358 parent classes. For example:
1359
1360 >>> class A:
1361 ... def __init__(self, x, y): self.data = (x,y)
1362 ...
1363 >>> class ProbabilisticA(A, ProbabilisticMixIn):
1364 ... def __init__(self, x, y, **prob_kwarg):
1365 ... A.__init__(self, x, y)
1366 ... ProbabilisticMixIn.__init__(self, **prob_kwarg)
1367
1368 See the documentation for the ProbabilisticMixIn
1369 L{constructor<__init__>} for information about the arguments it
1370 expects.
1371
1372 You should generally also redefine the string representation
1373 methods, the comparison methods, and the hashing method.
1374 """
1376 """
1377 Initialize this object's probability. This initializer should
1378 be called by subclass constructors. C{prob} should generally be
1379 the first argument for those constructors.
1380
1381 @kwparam prob: The probability associated with the object.
1382 @type prob: C{float}
1383 @kwparam logprob: The log of the probability associated with
1384 the object.
1385 @type logprob: C{float}
1386 """
1387 if 'prob' in kwargs:
1388 if 'logprob' in kwargs:
1389 raise TypeError('Must specify either prob or logprob '
1390 '(not both)')
1391 else:
1392 ProbabilisticMixIn.set_prob(self, kwargs['prob'])
1393 elif 'logprob' in kwargs:
1394 ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
1395 else:
1396 self.__prob = self.__logprob = None
1397
1399 """
1400 Set the probability associated with this object to C{prob}.
1401 @param prob: The new probability
1402 @type prob: C{float}
1403 """
1404 self.__prob = prob
1405 self.__logprob = None
1406
1408 """
1409 Set the log probability associated with this object to
1410 C{logprob}. I.e., set the probability associated with this
1411 object to C{exp(logprob)}.
1412 @param logprob: The new log probability
1413 @type logprob: C{float}
1414 """
1415 self.__logprob = prob
1416 self.__prob = None
1417
1419 """
1420 @return: The probability associated with this object.
1421 @rtype: C{float}
1422 """
1423 if self.__prob is None:
1424 if self.__logprob is None: return None
1425 self.__prob = math.exp(self.__logprob)
1426 return self.__prob
1427
1429 """
1430 @return: C{log(p)}, where C{p} is the probability associated
1431 with this object.
1432
1433 @rtype: C{float}
1434 """
1435 if self.__logprob is None:
1436 if self.__prob is None: return None
1437 self.__logprob = math.log(self.__prob)
1438 return self.__logprob
1439
1442 raise ValueError, '%s is immutable' % self.__class__.__name__
1444 raise ValueError, '%s is immutable' % self.__class__.__name__
1445
1446
1447
1448
1449
1451 """
1452 Create a new frequency distribution, with random samples. The
1453 samples are numbers from 1 to C{numsamples}, and are generated by
1454 summing two numbers, each of which has a uniform distribution.
1455 """
1456 import random
1457 from math import sqrt
1458 fdist = FreqDist()
1459 for x in range(numoutcomes):
1460 y = (random.randint(1, (1+numsamples)/2) +
1461 random.randint(0, numsamples/2))
1462 fdist.inc(y)
1463 return fdist
1464
1466 """
1467 Return the true probability distribution for the experiment
1468 C{_create_rand_fdist(numsamples, x)}.
1469 """
1470 fdist = FreqDist()
1471 for x in range(1, (1+numsamples)/2+1):
1472 for y in range(0, numsamples/2+1):
1473 fdist.inc(x+y)
1474 return MLEProbDist(fdist)
1475
1476 -def demo(numsamples=6, numoutcomes=500):
1477 """
1478 A demonstration of frequency distributions and probability
1479 distributions. This demonstration creates three frequency
1480 distributions with, and uses them to sample a random process with
1481 C{numsamples} samples. Each frequency distribution is sampled
1482 C{numoutcomes} times. These three frequency distributions are
1483 then used to build six probability distributions. Finally, the
1484 probability estimates of these distributions are compared to the
1485 actual probability of each sample.
1486
1487 @type numsamples: C{int}
1488 @param numsamples: The number of samples to use in each demo
1489 frequency distributions.
1490 @type numoutcomes: C{int}
1491 @param numoutcomes: The total number of outcomes for each
1492 demo frequency distribution. These outcomes are divided into
1493 C{numsamples} bins.
1494 @rtype: C{None}
1495 """
1496
1497
1498 fdist1 = _create_rand_fdist(numsamples, numoutcomes)
1499 fdist2 = _create_rand_fdist(numsamples, numoutcomes)
1500 fdist3 = _create_rand_fdist(numsamples, numoutcomes)
1501
1502
1503 pdists = [
1504 MLEProbDist(fdist1),
1505 LidstoneProbDist(fdist1, 0.5, numsamples),
1506 HeldoutProbDist(fdist1, fdist2, numsamples),
1507 HeldoutProbDist(fdist2, fdist1, numsamples),
1508 CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples),
1509 _create_sum_pdist(numsamples),
1510 ]
1511
1512
1513 vals = []
1514 for n in range(1,numsamples+1):
1515 vals.append(tuple([n, fdist1.freq(n)] +
1516 [pdist.prob(n) for pdist in pdists]))
1517
1518
1519 print ('%d samples (1-%d); %d outcomes were sampled for each FreqDist' %
1520 (numsamples, numsamples, numoutcomes))
1521 print '='*9*(len(pdists)+2)
1522 FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual'
1523 print FORMATSTR % tuple(`pdist`[1:9] for pdist in pdists[:-1])
1524 print '-'*9*(len(pdists)+2)
1525 FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f'
1526 for val in vals:
1527 print FORMATSTR % val
1528
1529
1530 zvals = zip(*vals)
1531 def sum(lst): return reduce(lambda x,y:x+y, lst, 0)
1532 sums = [sum(val) for val in zvals[1:]]
1533 print '-'*9*(len(pdists)+2)
1534 FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f'
1535 print FORMATSTR % tuple(sums)
1536 print '='*9*(len(pdists)+2)
1537
1538
1539 if len(`str(fdist1)`) < 70:
1540 print ' fdist1:', str(fdist1)
1541 print ' fdist2:', str(fdist2)
1542 print ' fdist3:', str(fdist3)
1543 print
1544
1545 if __name__ == '__main__':
1546 demo(6, 10)
1547 demo(5, 5000)
1548