"""New CSS Tokenizer (a generator)
"""
__all__ = ['Tokenizer', 'CSSProductions']
__docformat__ = 'restructuredtext'
__version__ = '$Id: tokenize2.py 1420 2008-08-09 19:28:34Z cthedot $'

import re
from helper import normalize
from cssproductions import *


class Tokenizer(object):
    """
    generates a list of Token tuples:
        (Tokenname, value, startline, startcolumn)
    """
    _atkeywords = {
        u'@font-face': CSSProductions.FONT_FACE_SYM,
        u'@import': CSSProductions.IMPORT_SYM,
        u'@media': CSSProductions.MEDIA_SYM,
        u'@namespace': CSSProductions.NAMESPACE_SYM,
        u'@page': CSSProductions.PAGE_SYM
    }
    _linesep = u'\n'

    def __init__(self, macros=None, productions=None):
        """
        inits tokenizer with given macros and productions which default to
        cssutils own macros and productions
        """
        if not macros:
            macros = MACROS
        if not productions:
            productions = PRODUCTIONS
        self.tokenmatches = self._compile_productions(
            self._expand_macros(macros, productions))
        self.commentmatcher = [x[1] for x in self.tokenmatches if x[0] == 'COMMENT'][0]
        self.urimatcher = [x[1] for x in self.tokenmatches if x[0] == 'URI'][0]
        self.unicodesub = re.compile(r'\\[0-9a-fA-F]{1,6}(?:\r\n|[\t\r\n\f\x20])?').sub

    def _expand_macros(self, macros, productions):
        """returns macro expanded productions, order of productions is kept"""
        def macro_value(m):
            return '(?:%s)' % macros[m.groupdict()['macro']]
        expanded = []
        for key, value in productions:
            while re.search(r'{[a-zA-Z][a-zA-Z0-9-]*}', value):
                value = re.sub(r'{(?P<macro>[a-zA-Z][a-zA-Z0-9-]*)}',
                               macro_value, value)
            expanded.append((key, value))
        return expanded

    def _compile_productions(self, expanded_productions):
        """compile productions into callable match objects, order is kept"""
        compiled = []
        for key, value in expanded_productions:
            compiled.append((key, re.compile('^(?:%s)' % value, re.U).match))
        return compiled
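
    # Illustrative note (not part of the original code): given a hypothetical
    # macro table like {'num': r'[0-9]+|[0-9]*\.[0-9]+'} and a production
    # ('NUMBER', '{num}'), _expand_macros would rewrite the production value to
    # '(?:[0-9]+|[0-9]*\.[0-9]+)' and _compile_productions would then store
    # ('NUMBER', re.compile('^(?:...)', re.U).match), i.e. a (name, matcher)
    # pair. The MACROS and PRODUCTIONS actually used by default come from
    # cssproductions.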

    def tokenize(self, text, fullsheet=False):
        """Generator: Tokenize text and yield tokens, each token is a tuple
        of::

            (name, value, line, col)

        The token value will contain a normal string, meaning CSS unicode
        escapes have been resolved to normal characters. The serializer
        escapes needed characters back to unicode escapes depending on
        the stylesheet target encoding.

        text
            to be tokenized
        fullsheet
            if ``True`` appends an EOF token as the last one and completes
            incomplete COMMENT or INVALID (to STRING) tokens
        """
        def _repl(m):
            "used by unicodesub"
            num = int(m.group(0)[1:], 16)
            if num < 0x10000:
                return unichr(num)
            else:
                return m.group(0)

        def _normalize(value):
            "normalize and do unicodesub"
            return normalize(self.unicodesub(_repl, value))

        line = col = 1

        # a BOM may only be the very first thing in the sheet
        (BOM, matcher), productions = self.tokenmatches[0], self.tokenmatches[1:]
        match = matcher(text)
        if match:
            found = match.group(0)
            yield (BOM, found, line, col)
            text = text[len(found):]

        # a @charset rule at the very start of the style sheet
        if text.startswith('@charset '):
            found = '@charset '
            yield (CSSProductions.CHARSET_SYM, found, line, col)
            text = text[len(found):]
            col += len(found)

        while text:
            # simple delimiter chars are yielded directly as CHAR tokens
            c = text[0]
            if c in '{}:;,':
                yield ('CHAR', c, line, col)
                col += 1
                text = text[1:]

            else:
                # try all other productions in order
                for name, matcher in productions:
                    if fullsheet and name == 'CHAR' and text.startswith(u'/*'):
                        # the rest of the sheet looks like an unclosed COMMENT,
                        # try to complete it by appending '*/'
                        possiblecomment = u'%s*/' % text
                        match = self.commentmatcher(possiblecomment)
                        if match:
                            yield ('COMMENT', possiblecomment, line, col)
                            text = None
                            break

                    match = matcher(text)
                    if match:
                        found = match.group(0)
                        if fullsheet:
                            # check if the token may be completed to a full one
                            if 'INVALID' == name and text == found:
                                # complete INVALID to STRING with its start char " or '
                                name, found = 'STRING', '%s%s' % (found, found[0])

                            elif 'FUNCTION' == name and\
                                 u'url(' == _normalize(found):
                                # an incomplete URI: try to complete "url(" with
                                # a closing quote and parenthesis
                                for end in (u"')", u'")', u')'):
                                    possibleuri = '%s%s' % (text, end)
                                    match = self.urimatcher(possibleuri)
                                    if match:
                                        name, found = 'URI', match.group(0)
                                        break

                        if name in ('DIMENSION', 'IDENT', 'STRING', 'URI',
                                    'HASH', 'COMMENT', 'FUNCTION', 'INVALID'):
                            # token value may contain unicode escapes which are
                            # resolved to actual characters
                            value = self.unicodesub(_repl, found)

                        else:
                            if 'ATKEYWORD' == name:
                                if '@charset' == found and ' ' == text[len(found):len(found)+1]:
                                    # '@charset ' with a following space is a CHARSET_SYM
                                    name = CSSProductions.CHARSET_SYM
                                    found += ' '
                                else:
                                    # map other known at-keywords to their specific token name
                                    name = self._atkeywords.get(_normalize(found), 'ATKEYWORD')

                            value = found

                        yield (name, value, line, col)

                        # update line and column position for the next token
                        text = text[len(found):]
                        nls = found.count(self._linesep)
                        line += nls
                        if nls:
                            col = len(found[found.rfind(self._linesep):])
                        else:
                            col += len(found)
                        break

        if fullsheet:
            yield ('EOF', u'', line, col)
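

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): tokenize a small
    # style sheet with the default MACROS and PRODUCTIONS from cssproductions.
    # The exact token names and values depend on those definitions.
    css = u'@import "print.css";\na { color: #00f }'
    for token in Tokenizer().tokenize(css, fullsheet=True):
        print token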