1
2
3
4
5
6
7
8
9
"""
Simple functions for tokenizing a text by splitting it at a
separator, such as a space, a newline, or a blank line.
"""
14
# Separators used by the tokenizer functions in this module.
SPACE = ' '         # a single space character
NEWLINE = '\n'      # a single line-feed character
BLANKLINE = '\n\n'  # two consecutive line feeds, i.e. one empty line

# NOTE: unlike the plain strings above, this one is written as a regular
# expression (a literal backslash anchored by ``^``).  It only carries
# that meaning when handed to the ``re`` module; ``str.split`` would look
# for the three literal characters instead.
SHOEBOXSEP = r'^\\'
19
def space(s):
    """
    Tokenize the text at every single space character.

    Only the space character is a separator: runs of spaces are not
    merged (they yield empty-string tokens), and tabs and newlines stay
    inside the tokens.

    @param s: the string to be tokenized
    @type s: C{string}
    @return: A list of tokens
    @rtype: C{list} of C{string}
    """
    return s.split(SPACE)
29
def line(s):
    """
    Tokenize the text into lines.

    The text is split at every newline character, so a text that ends
    with a newline yields a final empty-string token, and an empty line
    yields an empty-string token of its own.

    @param s: the string to be tokenized
    @type s: C{string}
    @return: A list of tokens
    @rtype: C{list} of C{string}
    """
    return s.split(NEWLINE)
39
# NOTE(review): the ``def`` line of this function was missing; the name
# ``blankline`` is inferred from the BLANKLINE separator -- confirm
# against callers.
def blankline(s):
    """
    Tokenize the text into paragraphs (separated by blank lines).

    The text is split at every occurrence of the BLANKLINE separator;
    single newlines inside a paragraph are left untouched.

    @param s: the string to be tokenized
    @type s: C{string}
    @return: A list of tokens
    @rtype: C{list} of C{string}
    """
    return s.split(BLANKLINE)
49
# NOTE(review): the ``def`` line of this function was missing; the name
# ``shoebox`` is inferred from the SHOEBOXSEP separator -- confirm
# against callers.
def shoebox(s):
    """
    Tokenize a Shoebox entry into its fields (separated by backslash
    markers).

    A field starts at a backslash that is the first character of a
    line.  That backslash is consumed as the separator, and whatever
    precedes the first marker (possibly the empty string) is returned
    as the first token.

    @param s: the string to be tokenized
    @type s: C{string}
    @return: A list of tokens
    @rtype: C{list} of C{string}
    """
    import re

    # SHOEBOXSEP is a regular expression, so it has to go through ``re``:
    # ``str.split`` would search for the literal characters instead and
    # never find a field boundary.  MULTILINE lets ``^`` match at the
    # start of every line rather than only at the start of the string.
    return re.compile(SHOEBOXSEP, re.MULTILINE).split(s)
59
60
61
62
63
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.

    The sample text, and the result of tokenizing it by spaces and by
    lines, are printed to standard output.
    """

    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."

    # Single-argument print(...) calls and repr() behave the same on
    # Python 2 and Python 3, unlike the print statement and backticks.
    print('Input text:')
    print(repr(s))
    print('')
    print('Tokenize using individual space characters:')
    print(list(space(s)))
    print('')
    print('Tokenize by lines:')
    print(list(line(s)))
    print('')
80
# Run the demonstration when this file is executed as a script.
if __name__ == '__main__':
    demo()
83