1 |
wakaba |
1.1 |
package Whatpm::CSS::Tokenizer; |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
## Constructor.  Creates a tokenizer object whose queue of pending
## (already recognized but not yet returned) tokens is empty.
##
##   $class - The class name (or object) to bless the new object into.
##
## Returns the new blessed hash reference.
sub new ($) {
  my $class = shift;
  my %object = (token => []);
  return bless \%object, $class;
} # new
8 |
|
|
|
9 |
|
|
## Prepares the tokenizer for reading: puts the state machine in
## |BEFORE_TOKEN_STATE| and reads the first input character into
## |$self->{c}| by calling the |$self->{get_char}| code reference,
## which the caller must have set beforehand.
sub init ($) {
  my ($self) = @_;
  my $read_char = $self->{get_char};
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $read_char->();
} # init
14 |
|
|
|
15 |
|
|
## Returns true if the code point |$c| is an unescaped |nmstart|
## character: |A..Z|, |a..z|, |_|, or a non-ASCII character.
sub _is_nmstart {
  my $c = shift;
  return 1 if 0x0041 <= $c and $c <= 0x005A; # A..Z
  return 1 if 0x0061 <= $c and $c <= 0x007A; # a..z
  return 1 if $c == 0x005F; # _
  return $c > 0x007F; # nonascii
} # _is_nmstart

## Returns true if the code point |$c| is a decimal digit |0..9|.
sub _is_digit {
  my $c = shift;
  return (0x0030 <= $c and $c <= 0x0039);
} # _is_digit

## Returns true if the code point |$c| is an unescaped |nmchar|
## character: |nmstart|, |0..9|, or |-|.
sub _is_nmchar {
  my $c = shift;
  return (_is_nmstart ($c) or _is_digit ($c) or $c == 0x002D);
} # _is_nmchar

## Returns the numeric value (0..15) of the hexadecimal digit whose code
## point is |$c|, or |undef| if |$c| is not a hexadecimal digit.
sub _hex_value {
  my $c = shift;
  return $c - 0x0030 if 0x0030 <= $c and $c <= 0x0039; # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c and $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c and $c <= 0x0066; # a..f
  return;
} # _hex_value

## Returns the next token of the input as a hash reference with a |type|
## member and, depending on the type, |value|, |number| and |has_escape|
## members.
##
## Tokens that were recognized earlier but not yet returned (queued in
## |$self->{token}|) are returned first, one per call.  Otherwise the
## state machine is run from |$self->{state}| on the current character
## |$self->{c}|; further characters are read by calling the
## |$self->{get_char}| code reference.  Characters are handled as code
## points, and |-1| is treated as the end of the input (an |EOF_TOKEN|
## is returned for it).
##
## The token being built is kept in |$self->{current_token}| so that it
## survives a |return| that leaves the state machine in the middle of a
## token (e.g. after |<!-| or |--|).
##
## A number followed by an identifier is returned as one |NUMBER_TOKEN|
## whose |number| member is the numeric text and whose |value| member is
## the identifier; for a bare number |value| is the empty string.
##
## NOTE(review): The |*_STATE| and |*_TOKEN| constants are not defined in
## the visible part of this file; they are assumed to be defined
## elsewhere in this package.
##
## Dies if |$self->{state}| is not a known state.
sub get_next_token ($) {
  my $self = shift;
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  my $t = $self->{current_token}; # The token under construction, if any.
  my $char; # Code point accumulated from |unicode| in |escape|.
  my $i; # |$i + 1|th character in |unicode| in |escape|.
  ## |$q == 0|: in |ident| or |name|.  Otherwise |$q| is the code point
  ## of the quotation mark that opened the |string| or |invalid|.
  my $q = 0;

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $t = $self->{current_token} = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $t = $self->{current_token}
            = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $t = $self->{current_token} = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $t = $self->{current_token} = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or # "
               $self->{c} == 0x0027) { # '
        ## NOTE: |"| in |string1| or |'| in |string2| in |string| in
        ## |STRING|, or the same in |invalid1| or |invalid2| in |INVALID|.
        $t = $self->{current_token} = {type => STRING_TOKEN, value => ''};
        ## Remember the quotation mark so that |STRING_STATE| can detect
        ## the matching closing mark.
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $t = $self->{current_token} = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_digit ($self->{c})) { # 0..9
        ## NOTE: |num|.
        $t = $self->{current_token}
            = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.
        $t = $self->{current_token} = {type => NUMBER_TOKEN, value => '.'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO|
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
            } else {
              ## |<!-| not followed by |-|: emit |<| and |!| as
              ## delimiters and continue with the |-| as the possible
              ## beginning of an |IDENT|.
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|
              $t = $self->{current_token}
                  = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              # reprocess
              return {type => DELIM_TOKEN, value => '<'};
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return {type => DELIM_TOKEN, value => '<'};
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '<'};
        }
      } elsif ({
                0x003B => 1, # ;
                0x007B => 1, # {
                0x007D => 1, # }
                0x0028 => 1, # (
                0x0029 => 1, # )
                0x005B => 1, # [
                0x005D => 1, # ]
               }->{$self->{c}}) {
        ## The token type is the punctuation character itself; it has to
        ## be taken before the next character is read.
        my $type = chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $type};
      } elsif ({
                0x0020 => 1, # SP
                0x0009 => 1, # \t
                0x000D => 1, # \r
                0x000A => 1, # \n
                0x000C => 1, # \f
               }->{$self->{c}}) {
        W: {
          $self->{c} = $self->{get_char}->();
          if ({
                0x0020 => 1, # SP
                0x0009 => 1, # \t
                0x000D => 1, # \r
                0x000A => 1, # \n
                0x000C => 1, # \f
              }->{$self->{c}}) {
            redo W;
          } elsif (my $v = {
                            0x002B => PLUS_TOKEN, # +
                            0x003E => GREATER_TOKEN, # >
                            0x002C => COMMA_TOKEN, # ,
                            0x007E => TILDE_TOKEN, # ~
                           }->{$self->{c}}) {
            ## White space before |+|, |>|, |,| or |~| is part of that
            ## token and is not reported separately.
            # stay in the state
            $self->{c} = $self->{get_char}->();
            return {type => $v};
          } else {
            # stay in the state
            # reprocess
            return {type => S_TOKEN};
          }
        } # W
      } elsif (my $v = {
                        0x007C => DASHMATCH_TOKEN, # |
                        0x005E => PREFIXMATCH_TOKEN, # ^
                        0x0024 => SUFFIXMATCH_TOKEN, # $
                        0x002A => SUBSTRINGMATCH_TOKEN, # *
                       }->{$self->{c}}) {
        ## Remember the operator character: if no |=| follows, it is the
        ## value of the |DELIM| token.
        my $delim = chr $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $v};
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => $delim};
        }
      } elsif ($self->{c} == 0x002B) { # +
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => PLUS_TOKEN};
      } elsif ($self->{c} == 0x003E) { # >
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => GREATER_TOKEN};
      } elsif ($self->{c} == 0x002C) { # ,
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => COMMA_TOKEN};
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
        }
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
      } else {
        # stay in the state
        my $delim = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $delim;
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|), or in
      ## the identifier following |num|.
      if (_is_nmstart ($self->{c})) {
        $t->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $t->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
        } else {
          ## NOTE: |-|, |-|, $self->{c}
          ## The first |-| is a delimiter; the second one is kept in the
          ## current token as the possible beginning of an |IDENT|.
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
        }
      } else {
        ## The current character is not part of the token.  It is
        ## deliberately not consumed here, so that it is processed by
        ## the next call in |BEFORE_TOKEN_STATE|.
        if ($t->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |num|, not followed by |nmstart|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $t->{value} = '';
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return $t;
        } elsif ($t->{type} == ATKEYWORD_TOKEN) {
          ## NOTE: |@| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '@'};
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      ## NOTE: The character following |num|.  |$t->{number}| holds the
      ## numeric text and |$t->{value}| is the empty string here.
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $t->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident|.
        $t->{value} = chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident|.
        $t->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $t->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $t;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $t;
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar ($self->{c})) {
        $t->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## |#| not followed by |nmchar|.  The current character is left
        ## to be processed by the next call.
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '#'};
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar ($self->{c})) {
        $t->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## |$q| is always zero here, since this state is never entered
        ## from within a string.
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $t->{type} == IDENT_TOKEN) {
        ## NOTE: This version of the tokenizer does not support the |URI|
        ## token type, so |url(| is tokenized as a |FUNCTION| like any
        ## other identifier followed by |(|.  Note that browsers
        ## disagree in how to tokenize |url| function.
        $t->{type} = FUNCTION_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $t;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $t;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      ## NOTE: The character immediately following |\|.
      $t->{has_escape} = 1;
      my $digit = _hex_value ($self->{c});
      if (defined $digit) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          # reconsume
          return $t;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          $t->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          # reconsume
          return $t;
        } else {
          ## Note: In |nl| in ... in |string| or |ident|.
          $t->{value} .= "\x0D\x0A";
          ## No |unicode| escape is pending: |ESCAPE_BEFORE_LF_STATE|
          ## must not append a character for this escape.
          undef $char;
          $self->{state} = ESCAPE_BEFORE_LF_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        $t->{value} .= chr $self->{c};
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      my $digit = _hex_value ($self->{c});
      if (defined $digit) {
        $char = $char * 0x10 + $digit;
        ## After the sixth hexadecimal digit only the optional
        ## terminating white space can follow.
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        $t->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $t->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: eighth character of |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $t->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        ## Only a single |\r|, optionally followed by |\n|, terminates
        ## the escape.
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $t->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|, or in |\r\n|
      ## following |\| in a string.  In the latter case |$char| is
      ## undefined and nothing more is appended.
      $t->{value} .= chr $char if defined $char;
      $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
      if ($self->{c} == 0x000A) { # \n
        $self->{c} = $self->{get_char}->();
      } # else: reconsume
      redo A;
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      if ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $t;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        ## An unterminated string.
        $t->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $t;
      } else {
        $t->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (_is_digit ($self->{c})) {
        $t->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## The numeric text is complete.  Move it to |number| so that
        ## |value| can receive the identifier that may follow.
        $t->{number} = $t->{value};
        $t->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (_is_digit ($self->{c})) {
        ## The |.| was consumed without being recorded; add it now.
        $t->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## The |.| is not part of the number.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $t->{number} = $t->{value};
        $t->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $t;
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (_is_digit ($self->{c})) {
        $t->{value} .= chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## |.| not followed by a digit.  The current character is left
        ## to be processed by the next call.
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '.'};
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (_is_digit ($self->{c})) {
        $t->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $t->{number} = $t->{value};
        $t->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A

  ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|

} # get_next_token
606 |
|
|
|
607 |
|
|
1; |
608 |
|
|
# $Date:$ |