1
2
3
4
5
6
7
8
9 """
10 Read tokens from the NLTK Gutenberg Corpus.
11
12 Project Gutenberg -- http://gutenberg.net/
13
14 This corpus contains selected texts from Project Gutenberg:
15
16 * Jane Austen (3)
17 * William Blake (2)
18 * G. K. Chesterton (3)
19 * King James Bible
20 * John Milton
21 * William Shakespeare (3)
22 * Walt Whitman
23 """
24
25 from nltk_lite.corpora import get_basedir
26 from nltk_lite import tokenize
27 import os, re
28
# Identifiers of the texts available in this corpus, grouped by author
# prefix and kept in alphabetical order.
# NOTE(review): presumably these are also the base names of the corpus
# files on disk -- confirm against the reader function.
items = [
    # Jane Austen
    'austen-emma', 'austen-persuasion', 'austen-sense',
    # King James Bible
    'bible-kjv',
    # William Blake
    'blake-poems', 'blake-songs',
    # G. K. Chesterton
    'chesterton-ball', 'chesterton-brown', 'chesterton-thursday',
    # John Milton
    'milton-paradise',
    # William Shakespeare
    'shakespeare-caesar', 'shakespeare-hamlet', 'shakespeare-macbeth',
    # Walt Whitman
    'whitman-leaves',
]
45
# Human-readable display title for each corpus item identifier.
# Keys are the item identifiers; values are "Author: Title" strings
# (the King James Bible entry has no author part).
item_name = {
    'austen-emma': 'Jane Austen: Emma',
    'austen-persuasion': 'Jane Austen: Persuasion',
    'austen-sense': 'Jane Austen: Sense and Sensibility',
    'bible-kjv': 'King James Bible',
    'blake-poems': 'William Blake: Poems',
    'blake-songs': 'William Blake: Songs of Innocence and Experience',
    'chesterton-ball': 'G.K. Chesterton: The Ball and The Cross',
    'chesterton-brown': 'G.K. Chesterton: The Wisdom of Father Brown',
    'chesterton-thursday': 'G.K. Chesterton: The Man Who Was Thursday',
    'milton-paradise': 'John Milton: Paradise Lost',
    'shakespeare-caesar': 'William Shakespeare: Julius Caesar',
    'shakespeare-hamlet': 'William Shakespeare: Hamlet',
    'shakespeare-macbeth': 'William Shakespeare: Macbeth',
    'whitman-leaves': 'Walt Whitman: Leaves of Grass',
}
62
63
77
84
# Script entry point: call demo() only when this module is executed
# directly, not when it is imported.
if __name__ == '__main__':
    demo()
87