/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (show annotations) (download)
Sun Jan 20 06:15:20 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.17: +135 -52 lines
++ whatpm/Whatpm/CSS/ChangeLog	20 Jan 2008 06:15:14 -0000
	* Parser.pm, SelectorsParser.pm: |{href}| parameter added
	to all the onerror invocations.  The |{onerror}| function
	is no longer called with |{line}| and |{column}| parameters.

	* Tokenizer.pm: All tokens are now given |{line}| and |{column}|
	values.

2008-01-20  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Exporter;
6 push our @ISA, 'Exporter';
7
## Tokenizer states.
##
## These are the values stored in, and compared against,
## |$self->{state}| by the tokenizer.  Each one is a sub with an empty
## prototype and a constant body, which perl treats as an inlinable
## constant.  The numeric values are arbitrary but must stay distinct.
sub BEFORE_TOKEN_STATE () { 0 }
sub BEFORE_NMSTART_STATE () { 1 }
sub NAME_STATE () { 2 }
sub ESCAPE_OPEN_STATE () { 3 }
sub STRING_STATE () { 4 }
sub HASH_OPEN_STATE () { 5 }
sub NUMBER_STATE () { 6 }
sub NUMBER_FRACTION_STATE () { 7 }
sub AFTER_NUMBER_STATE () { 8 }
sub URI_BEFORE_WSP_STATE () { 9 }
sub ESCAPE_STATE () { 10 }
sub ESCAPE_BEFORE_LF_STATE () { 11 }
sub ESCAPE_BEFORE_NL_STATE () { 12 }
sub NUMBER_DOT_STATE () { 13 }
sub NUMBER_DOT_NUMBER_STATE () { 14 }
sub DELIM_STATE () { 15 }
sub URI_UNQUOTED_STATE () { 16 }
sub URI_AFTER_WSP_STATE () { 17 }
sub AFTER_AT_STATE () { 18 }
sub AFTER_AT_HYPHEN_STATE () { 19 }
28
## Token types.
##
## The |type| member of every token hash produced by the tokenizer is
## one of these values.  Each one is a sub with an empty prototype and
## a constant body, which perl treats as an inlinable constant.
##
## NOTE: The value 15 is not assigned to any token type; |@TokenName|
## carries a placeholder at that index so that the two stay aligned.
sub IDENT_TOKEN () { 1 }
sub ATKEYWORD_TOKEN () { 2 }
sub HASH_TOKEN () { 3 }
sub FUNCTION_TOKEN () { 4 }
sub URI_TOKEN () { 5 }
sub URI_INVALID_TOKEN () { 6 }
sub URI_PREFIX_TOKEN () { 7 }
sub URI_PREFIX_INVALID_TOKEN () { 8 }
sub STRING_TOKEN () { 9 }
sub INVALID_TOKEN () { 10 }
sub NUMBER_TOKEN () { 11 }
sub DIMENSION_TOKEN () { 12 }
sub PERCENTAGE_TOKEN () { 13 }
sub UNICODE_RANGE_TOKEN () { 14 }
sub DELIM_TOKEN () { 16 }
sub PLUS_TOKEN () { 17 }
sub GREATER_TOKEN () { 18 }
sub COMMA_TOKEN () { 19 }
sub TILDE_TOKEN () { 20 }
sub DASHMATCH_TOKEN () { 21 }
sub PREFIXMATCH_TOKEN () { 22 }
sub SUFFIXMATCH_TOKEN () { 23 }
sub SUBSTRINGMATCH_TOKEN () { 24 }
sub INCLUDES_TOKEN () { 25 }
sub SEMICOLON_TOKEN () { 26 }
sub LBRACE_TOKEN () { 27 }
sub RBRACE_TOKEN () { 28 }
sub LPAREN_TOKEN () { 29 }
sub RPAREN_TOKEN () { 30 }
sub LBRACKET_TOKEN () { 31 }
sub RBRACKET_TOKEN () { 32 }
sub S_TOKEN () { 33 }
sub CDO_TOKEN () { 34 }
sub CDC_TOKEN () { 35 }
sub COMMENT_TOKEN () { 36 }
sub COMMENT_INVALID_TOKEN () { 37 }
sub EOF_TOKEN () { 38 }
sub MINUS_TOKEN () { 39 }
sub STAR_TOKEN () { 40 }
sub VBAR_TOKEN () { 41 }
sub DOT_TOKEN () { 42 }
sub COLON_TOKEN () { 43 }
sub MATCH_TOKEN () { 44 }
sub EXCLAMATION_TOKEN () { 45 }
73
## Human-readable token names, indexed by token type value (i.e.
## |$TokenName[IDENT_TOKEN]| is |IDENT|).  The |0| entries at indexes
## 0 and 15 are placeholders: no token type has either value.
our @TokenName = qw(
  0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
  PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
  COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
);
82
## Names that may be imported on request: every token type constant.
## The tokenizer state constants are intentionally not listed here.
our @EXPORT_OK = qw(
  IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
  URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
  STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
  UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
  TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
  SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
  RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
  S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
  MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
  EXCLAMATION_TOKEN
);

## The |:token| tag imports all of the token type constants at once.
## It is a copy of |@EXPORT_OK| taken at this point, not an alias.
our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97
## Creates a new tokenizer object.
##
## The object starts with an empty queue of pending tokens
## (|{token}|) and a default |{get_char}| callback that always returns
## -1, the value the tokenizer treats as end of input.  Callers are
## expected to replace |{get_char}| with their own character source
## and then call |init| before asking for tokens.
sub new ($) {
  my $class = shift;
  my $self = bless {
    token => [],
    get_char => sub { -1 },
  }, $class;
  return $self;
} # new
102
## Prepares the tokenizer for reading: resets the state machine to
## |BEFORE_TOKEN_STATE| and primes |{c}| (the current character, as a
## code point number) with the first value returned by |{get_char}|.
##
## NOTE: The queue of pending tokens (|{token}|) and the |{line}| /
## |{column}| members are not touched here.
sub init ($) {
  my $self = shift;
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $self->{get_char}->();
  ## The token under construction is kept in |{t}|, shaped like:
  #$self->{t} = {type => token-type, value => value, number => number};
} # init
109
110 sub get_next_token ($) {
111 my $self = shift;
112 if (@{$self->{token}}) {
113 return shift @{$self->{token}};
114 }
115
116 my $char;
117 my $num; # |{num}|, if any.
118 my $i; # |$i + 1|th character in |unicode| in |escape|.
119 my $q;
120 ## NOTE:
121 ## 0: in |ident|.
122 ## 1: in |URI| outside of |string|.
123 ## 0x0022: in |string1| or |invalid1|.
124 ## 0x0027: in |string2| or |invalid2|.
125
126 A: {
127 if ($self->{state} == BEFORE_TOKEN_STATE) {
128 if ($self->{c} == 0x002D) { # -
129 ## NOTE: |-| in |ident| in |IDENT|
130 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
131 line => $self->{line}, column => $self->{column}};
132 $self->{state} = BEFORE_NMSTART_STATE;
133 $self->{c} = $self->{get_char}->();
134 redo A;
135 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
137 line => $self->{line}, column => $self->{column}};
138 $self->{c} = $self->{get_char}->();
139 if ($self->{c} == 0x002B) { # +
140 my ($l, $c) = ($self->{line}, $self->{column});
141 $self->{c} = $self->{get_char}->();
142 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
145 $self->{c} == 0x003F) { # ?
146 $self->{t}->{value} = chr $self->{c};
147 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
148 $self->{c} = $self->{get_char}->();
149 C: for (2..6) {
150 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
151 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
152 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
153 $self->{c} == 0x003F) { # ?
154 $self->{t}->{value} .= chr $self->{c};
155 $self->{c} = $self->{get_char}->();
156 } else {
157 last C;
158 }
159 } # C
160
161 if ($self->{c} == 0x002D) { # -
162 $self->{c} = $self->{get_char}->();
163 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
164 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
165 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
166 $self->{t}->{value} .= '-' . chr $self->{c};
167 $self->{c} = $self->{get_char}->();
168 C: for (2..6) {
169 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
170 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
171 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
172 $self->{t}->{value} .= chr $self->{c};
173 $self->{c} = $self->{get_char}->();
174 } else {
175 last C;
176 }
177 } # C
178
179 #
180 } else {
181 my $token = $self->{t};
182 $self->{t} = {type => IDENT_TOKEN, value => '-',
183 line => $self->{line},
184 column => $self->{column}};
185 $self->{state} = BEFORE_NMSTART_STATE;
186 # reprocess
187 return $token;
188 #redo A;
189 }
190 }
191
192 $self->{state} = BEFORE_TOKEN_STATE;
193 # reprocess
194 return $self->{t};
195 #redo A;
196 } else {
197 unshift @{$self->{token}},
198 {type => PLUS_TOKEN, line => $l, column => $c};
199 $self->{state} = BEFORE_TOKEN_STATE;
200 # reprocess
201 return $self->{t};
202 #redo A;
203 }
204 } else {
205 $self->{state} = NAME_STATE;
206 # reprocess
207 redo A;
208 }
209 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
210 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
211 $self->{c} == 0x005F or # _
212 $self->{c} > 0x007F) { # nonascii
213 ## NOTE: |nmstart| in |ident| in |IDENT|
214 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
215 line => $self->{line}, column => $self->{column}};
216 $self->{state} = NAME_STATE;
217 $self->{c} = $self->{get_char}->();
218 redo A;
219 } elsif ($self->{c} == 0x005C) { # \
220 ## NOTE: |nmstart| in |ident| in |IDENT|
221 $self->{t} = {type => IDENT_TOKEN, value => '',
222 line => $self->{line}, column => $self->{column}};
223 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
224 $self->{c} = $self->{get_char}->();
225 redo A;
226 } elsif ($self->{c} == 0x0040) { # @
227 ## NOTE: |@| in |ATKEYWORD|
228 $self->{t} = {type => ATKEYWORD_TOKEN, value => '',
229 line => $self->{line}, column => $self->{column}};
230 $self->{state} = AFTER_AT_STATE;
231 $self->{c} = $self->{get_char}->();
232 redo A;
233 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
234 $self->{t} = {type => STRING_TOKEN, value => '',
235 line => $self->{line}, column => $self->{column}};
236 $self->{state} = STRING_STATE; $q = $self->{c};
237 $self->{c} = $self->{get_char}->();
238 redo A;
239 } elsif ($self->{c} == 0x0023) { # #
240 ## NOTE: |#| in |HASH|.
241 $self->{t} = {type => HASH_TOKEN, value => '',
242 line => $self->{line}, column => $self->{column}};
243 $self->{state} = HASH_OPEN_STATE;
244 $self->{c} = $self->{get_char}->();
245 redo A;
246 } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
247 ## NOTE: |num|.
248 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c},
249 line => $self->{line}, column => $self->{column}};
250 $self->{state} = NUMBER_STATE;
251 $self->{c} = $self->{get_char}->();
252 redo A;
253 } elsif ($self->{c} == 0x002E) { # .
254 ## NOTE: |num|.
255 $self->{t} = {type => NUMBER_TOKEN, value => '0',
256 line => $self->{line}, column => $self->{column}};
257 $self->{state} = NUMBER_FRACTION_STATE;
258 $self->{c} = $self->{get_char}->();
259 redo A;
260 } elsif ($self->{c} == 0x002F) { # /
261 my ($l, $c) = ($self->{line}, $self->{column});
262 $self->{c} = $self->{get_char}->();
263 if ($self->{c} == 0x002A) { # *
264 C: {
265 $self->{c} = $self->{get_char}->();
266 if ($self->{c} == 0x002A) { # *
267 D: {
268 $self->{c} = $self->{get_char}->();
269 if ($self->{c} == 0x002F) { # /
270 #
271 } elsif ($self->{c} == 0x002A) { # *
272 redo D;
273 } else {
274 redo C;
275 }
276 } # D
277 } elsif ($self->{c} == -1) {
278 # stay in the state
279 # reprocess
280 return {type => COMMENT_INVALID_TOKEN};
281 #redo A;
282 } else {
283 redo C;
284 }
285 } # C
286
287 # stay in the state.
288 $self->{c} = $self->{get_char}->();
289 redo A;
290 } else {
291 # stay in the state.
292 # reprocess
293 return {type => DELIM_TOKEN, value => '/', line => $l, column => $c};
294 #redo A;
295 }
296 } elsif ($self->{c} == 0x003C) { # <
297 my ($l, $c) = ($self->{line}, $self->{column});
298 ## NOTE: |CDO|
299 $self->{c} = $self->{get_char}->();
300 if ($self->{c} == 0x0021) { # !
301 $self->{c} = $self->{get_char}->();
302 if ($self->{c} == 0x002D) { # -
303 $self->{c} = $self->{get_char}->();
304 if ($self->{c} == 0x002D) { # -
305 $self->{state} = BEFORE_TOKEN_STATE;
306 $self->{c} = $self->{get_char}->();
307 return {type => CDO_TOKEN, line => $l, column => $c};
308 #redo A;
309 } else {
310 unshift @{$self->{token}},
311 {type => EXCLAMATION_TOKEN, line => $l, column => $c + 1};
312 ## NOTE: |-| in |ident| in |IDENT|
313 $self->{t} = {type => IDENT_TOKEN, value => '-',
314 line => $l, column => $c + 2};
315 $self->{state} = BEFORE_NMSTART_STATE;
316 #reprocess
317 return {type => DELIM_TOKEN, value => '<',
318 line => $l, column => $c};
319 #redo A;
320 }
321 } else {
322 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN,
323 line => $l, column => $c + 1};
324 $self->{state} = BEFORE_TOKEN_STATE;
325 #reprocess
326 return {type => DELIM_TOKEN, value => '<',
327 line => $l, column => $c};
328 #redo A;
329 }
330 } else {
331 $self->{state} = BEFORE_TOKEN_STATE;
332 #reprocess
333 return {type => DELIM_TOKEN, value => '<',
334 line => $l, column => $c};
335 #redo A;
336 }
337 } elsif (my $t = {
338 0x0021 => EXCLAMATION_TOKEN, # !
339 0x002D => MINUS_TOKEN, # -
340 0x002E => DOT_TOKEN, # .
341 0x003A => COLON_TOKEN, # :
342 0x003B => SEMICOLON_TOKEN, # ;
343 0x003D => MATCH_TOKEN, # =
344 0x007B => LBRACE_TOKEN, # {
345 0x007D => RBRACE_TOKEN, # }
346 0x0028 => LPAREN_TOKEN, # (
347 0x0029 => RPAREN_TOKEN, # )
348 0x005B => LBRACKET_TOKEN, # [
349 0x005D => RBRACKET_TOKEN, # ]
350 }->{$self->{c}}) {
351 my ($l, $c) = ($self->{line}, $self->{column});
352 # stay in the state
353 $self->{c} = $self->{get_char}->($self);
354 return {type => $t, line => $l, column => $c};
355 # redo A;
356 } elsif ({
357 0x0020 => 1, # SP
358 0x0009 => 1, # \t
359 0x000D => 1, # \r
360 0x000A => 1, # \n
361 0x000C => 1, # \f
362 }->{$self->{c}}) {
363 my ($l, $c) = ($self->{line}, $self->{column});
364 W: {
365 $self->{c} = $self->{get_char}->();
366 if ({
367 0x0020 => 1, # SP
368 0x0009 => 1, # \t
369 0x000D => 1, # \r
370 0x000A => 1, # \n
371 0x000C => 1, # \f
372 }->{$self->{c}}) {
373 redo W;
374 } elsif (my $v = {
375 0x002B => PLUS_TOKEN, # +
376 0x003E => GREATER_TOKEN, # >
377 0x002C => COMMA_TOKEN, # ,
378 0x007E => TILDE_TOKEN, # ~
379 }->{$self->{c}}) {
380 my ($l, $c) = ($self->{line}, $self->{column});
381 # stay in the state
382 $self->{c} = $self->{get_char}->();
383 return {type => $v, line => $l, column => $c};
384 #redo A;
385 } else {
386 # stay in the state
387 # reprocess
388 return {type => S_TOKEN, line => $l, column => $c};
389 #redo A;
390 }
391 } # W
392 } elsif (my $v = {
393 0x007C => DASHMATCH_TOKEN, # |
394 0x005E => PREFIXMATCH_TOKEN, # ^
395 0x0024 => SUFFIXMATCH_TOKEN, # $
396 0x002A => SUBSTRINGMATCH_TOKEN, # *
397 }->{$self->{c}}) {
398 my ($line, $column) = ($self->{line}, $self->{column});
399 my $c = $self->{c};
400 $self->{c} = $self->{get_char}->();
401 if ($self->{c} == 0x003D) { # =
402 # stay in the state
403 $self->{c} = $self->{get_char}->();
404 return {type => $v, line => $line, column => $column};
405 #redo A;
406 } elsif ($v = {
407 0x002A => STAR_TOKEN, # *
408 0x007C => VBAR_TOKEN, # |
409 }->{$c}) {
410 # stay in the state.
411 # reprocess
412 return {type => $v, line => $line, column => $column};
413 #redo A;
414 } else {
415 # stay in the state
416 # reprocess
417 return {type => DELIM_TOKEN, value => chr $c,
418 line => $line, column => $column};
419 #redo A;
420 }
421 } elsif ($self->{c} == 0x002B) { # +
422 my ($l, $c) = ($self->{line}, $self->{column});
423 # stay in the state
424 $self->{c} = $self->{get_char}->();
425 return {type => PLUS_TOKEN, line => $l, column => $c};
426 #redo A;
427 } elsif ($self->{c} == 0x003E) { # >
428 my ($l, $c) = ($self->{line}, $self->{column});
429 # stay in the state
430 $self->{c} = $self->{get_char}->();
431 return {type => GREATER_TOKEN, line => $l, column => $c};
432 #redo A;
433 } elsif ($self->{c} == 0x002C) { # ,
434 my ($l, $c) = ($self->{line}, $self->{column});
435 # stay in the state
436 $self->{c} = $self->{get_char}->();
437 return {type => COMMA_TOKEN, line => $l, column => $c};
438 #redo A;
439 } elsif ($self->{c} == 0x007E) { # ~
440 my ($l, $c) = ($self->{line}, $self->{column});
441 $self->{c} = $self->{get_char}->();
442 if ($self->{c} == 0x003D) { # =
443 # stay in the state
444 $self->{c} = $self->{get_char}->();
445 return {type => INCLUDES_TOKEN, line => $l, column => $c};
446 #redo A;
447 } else {
448 # stay in the state
449 # reprocess
450 return {type => TILDE_TOKEN, line => $l, column => $c};
451 #redo A;
452 }
453 } elsif ($self->{c} == -1) {
454 # stay in the state
455 $self->{c} = $self->{get_char}->();
456 return {type => EOF_TOKEN,
457 line => $self->{line}, column => $self->{column}};
458 #redo A;
459 } else {
460 # stay in the state
461 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c},
462 line => $self->{line}, column => $self->{column}};
463 $self->{c} = $self->{get_char}->();
464 return $self->{t};
465 #redo A;
466 }
467 } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
468 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
469 ## |FUNCTION|)
470 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
471 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
472 $self->{c} == 0x005F or # _
473 $self->{c} > 0x007F) { # nonascii
474 $self->{t}->{value} .= chr $self->{c};
475 $self->{t}->{type} = DIMENSION_TOKEN
476 if $self->{t}->{type} == NUMBER_TOKEN;
477 $self->{state} = NAME_STATE;
478 $self->{c} = $self->{get_char}->();
479 redo A;
480 } elsif ($self->{c} == 0x005C) { # \
481 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
482 $self->{c} = $self->{get_char}->();
483 redo A;
484 } elsif ($self->{c} == 0x002D) { # -
485 if ($self->{t}->{type} == IDENT_TOKEN) {
486 $self->{c} = $self->{get_char}->();
487 if ($self->{c} == 0x003E) { # >
488 $self->{state} = BEFORE_TOKEN_STATE;
489 $self->{c} = $self->{get_char}->();
490 return {type => CDC_TOKEN,
491 line => $self->{t}->{line},
492 column => $self->{t}->{column}};
493 #redo A;
494 } else {
495 ## NOTE: |-|, |-|, $self->{c}
496 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
497 $self->{t}->{column}++;
498 # stay in the state
499 # reconsume
500 return {type => MINUS_TOKEN,
501 line => $self->{t}->{line},
502 column => $self->{t}->{column} - 1};
503 #redo A;
504 }
505 } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
506 my ($l, $c) = ($self->{line}, $self->{column}); # second '-'
507 $self->{c} = $self->{get_char}->();
508 if ($self->{c} == 0x003E) { # >
509 unshift @{$self->{token}}, {type => CDC_TOKEN};
510 $self->{t}->{type} = NUMBER_TOKEN;
511 $self->{t}->{value} = '';
512 $self->{state} = BEFORE_TOKEN_STATE;
513 $self->{c} = $self->{get_char}->();
514 return $self->{t};
515 #redo A;
516 } else {
517 ## NOTE: NUMBER, |-|, |-|, $self->{c}
518 my $t = $self->{t};
519 $t->{type} = NUMBER_TOKEN;
520 $t->{value} = '';
521 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
522 line => $l, column => $c};
523 unshift @{$self->{token}}, {type => MINUS_TOKEN,
524 line => $l, column => $c - 1};
525 # stay in the state
526 # reconsume
527 return $t;
528 #redo A;
529 }
530 } else {
531 #
532 }
533 } else {
534 #
535 }
536
537 if ($self->{t}->{type} == DIMENSION_TOKEN) {
538 ## NOTE: |-| after |NUMBER|.
539 unshift @{$self->{token}}, {type => MINUS_TOKEN,
540 line => $self->{line},
541 column => $self->{column} - 1};
542 ## BUG: column might be wrong if on the line boundary.
543 $self->{state} = BEFORE_TOKEN_STATE;
544 # reprocess
545 $self->{t}->{type} = NUMBER_TOKEN;
546 $self->{t}->{value} = '';
547 return $self->{t};
548 } else {
549 ## NOTE: |-| not followed by |nmstart|.
550 $self->{state} = BEFORE_TOKEN_STATE;
551 # reprocess
552 return {type => MINUS_TOKEN,
553 line => $self->{line}, column => $self->{column} - 1};
554 ## BUG: column might be wrong if on the line boundary.
555 }
556 } elsif ($self->{state} == AFTER_AT_STATE) {
557 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
558 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
559 $self->{c} == 0x005F or # _
560 $self->{c} > 0x007F) { # nonascii
561 $self->{t}->{value} .= chr $self->{c};
562 $self->{state} = NAME_STATE;
563 $self->{c} = $self->{get_char}->();
564 redo A;
565 } elsif ($self->{c} == 0x002D) { # -
566 $self->{t}->{value} .= '-';
567 $self->{state} = AFTER_AT_HYPHEN_STATE;
568 $self->{c} = $self->{get_char}->();
569 redo A;
570 } elsif ($self->{c} == 0x005C) { # \
571 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
572 $self->{c} = $self->{get_char}->();
573 redo A;
574 } else {
575 $self->{state} = BEFORE_TOKEN_STATE;
576 # reprocess
577 return {type => DELIM_TOKEN, value => '@',
578 line => $self->{t}->{line},
579 column => $self->{t}->{column}};
580 }
581 } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
582 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
583 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
584 $self->{c} == 0x005F or # _
585 $self->{c} > 0x007F) { # nonascii
586 $self->{t}->{value} .= chr $self->{c};
587 $self->{state} = NAME_STATE;
588 $self->{c} = $self->{get_char}->();
589 redo A;
590 } elsif ($self->{c} == 0x002D) { # -
591 $self->{c} = $self->{get_char}->();
592 if ($self->{c} == 0x003E) { # >
593 unshift @{$self->{token}}, {type => CDC_TOKEN};
594 $self->{state} = BEFORE_TOKEN_STATE;
595 $self->{c} = $self->{get_char}->();
596 return {type => DELIM_TOKEN, value => '@'};
597 #redo A;
598 } else {
599 unshift @{$self->{token}}, {type => MINUS_TOKEN};
600 $self->{t} = {type => IDENT_TOKEN, value => '-'};
601 $self->{state} = BEFORE_NMSTART_STATE;
602 # reprocess
603 return {type => DELIM_TOKEN, value => '@'};
604 #redo A;
605 }
606 } elsif ($self->{c} == 0x005C) { # \
607 ## TODO: @-\{nl}
608 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
609 $self->{c} = $self->{get_char}->();
610 redo A;
611 } else {
612 unshift @{$self->{token}}, {type => MINUS_TOKEN};
613 $self->{state} = BEFORE_TOKEN_STATE;
614 # reprocess
615 return {type => DELIM_TOKEN, value => '@'};
616 }
617 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
618 if ($self->{c} == 0x002D) { # -
619 ## NOTE: |-| in |ident|.
620 $self->{t}->{hyphen} = 1;
621 $self->{t}->{value} = '-';
622 $self->{t}->{type} = DIMENSION_TOKEN;
623 $self->{state} = BEFORE_NMSTART_STATE;
624 $self->{c} = $self->{get_char}->();
625 redo A;
626 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
627 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
628 $self->{c} == 0x005F or # _
629 $self->{c} > 0x007F) { # nonascii
630 ## NOTE: |nmstart| in |ident|.
631 $self->{t}->{value} = chr $self->{c};
632 $self->{t}->{type} = DIMENSION_TOKEN;
633 $self->{state} = NAME_STATE;
634 $self->{c} = $self->{get_char}->();
635 redo A;
636 } elsif ($self->{c} == 0x005C) { # \
637 ## NOTE: |nmstart| in |ident| in |IDENT|
638 $self->{t}->{value} = '';
639 $self->{t}->{type} = DIMENSION_TOKEN;
640 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
641 $self->{c} = $self->{get_char}->();
642 redo A;
643 } elsif ($self->{c} == 0x0025) { # %
644 $self->{t}->{type} = PERCENTAGE_TOKEN;
645 $self->{state} = BEFORE_TOKEN_STATE;
646 $self->{c} = $self->{get_char}->();
647 return $self->{t};
648 #redo A;
649 } else {
650 $self->{state} = BEFORE_TOKEN_STATE;
651 # reprocess
652 return $self->{t};
653 #redo A;
654 }
655 } elsif ($self->{state} == HASH_OPEN_STATE) {
656 ## NOTE: The first |nmchar| in |name| in |HASH|.
657 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
658 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
659 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
660 $self->{c} == 0x002D or # -
661 $self->{c} == 0x005F or # _
662 $self->{c} > 0x007F) { # nonascii
663 $self->{t}->{value} .= chr $self->{c};
664 $self->{state} = NAME_STATE;
665 $self->{c} = $self->{get_char}->();
666 redo A;
667 } elsif ($self->{c} == 0x005C) { # \
668 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
669 $self->{c} = $self->{get_char}->();
670 redo A;
671 } else {
672 $self->{state} = BEFORE_TOKEN_STATE;
673 # reprocess
674 return {type => DELIM_TOKEN, value => '#',
675 line => $self->{t}->{line},
676 column => $self->{t}->{column}};
677 #redo A;
678 }
679 } elsif ($self->{state} == NAME_STATE) {
680 ## NOTE: |nmchar| in (|ident| or |name|).
681 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
682 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
683 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
684 $self->{c} == 0x005F or # _
685 $self->{c} == 0x002D or # -
686 $self->{c} > 0x007F) { # nonascii
687 $self->{t}->{value} .= chr $self->{c};
688 # stay in the state
689 $self->{c} = $self->{get_char}->();
690 redo A;
691 } elsif ($self->{c} == 0x005C) { # \
692 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
693 $self->{c} = $self->{get_char}->();
694 redo A;
695 } elsif ($self->{c} == 0x0028 and # (
696 $self->{t}->{type} == IDENT_TOKEN) { # (
697 my $func_name = $self->{t}->{value};
698 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
699 if ($func_name eq 'url' or $func_name eq 'url-prefix') {
700 if ($self->{t}->{has_escape}) {
701 ## TODO: warn
702 }
703 $self->{t}->{type}
704 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
705 $self->{t}->{value} = '';
706 $self->{state} = URI_BEFORE_WSP_STATE;
707 $self->{c} = $self->{get_char}->();
708 redo A;
709 } else {
710 $self->{t}->{type} = FUNCTION_TOKEN;
711 $self->{state} = BEFORE_TOKEN_STATE;
712 $self->{c} = $self->{get_char}->();
713 return $self->{t};
714 #redo A;
715 }
716 } else {
717 $self->{state} = BEFORE_TOKEN_STATE;
718 # reconsume
719 return $self->{t};
720 #redo A;
721 }
722 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
723 while ({
724 0x0020 => 1, # SP
725 0x0009 => 1, # \t
726 0x000D => 1, # \r
727 0x000A => 1, # \n
728 0x000C => 1, # \f
729 }->{$self->{c}}) {
730 $self->{c} = $self->{get_char}->();
731 }
732 if ($self->{c} == -1) {
733 $self->{t}->{type} = {
734 URI_TOKEN, URI_INVALID_TOKEN,
735 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
736 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
737 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
738 }->{$self->{t}->{type}};
739 $self->{state} = BEFORE_TOKEN_STATE;
740 $self->{c} = $self->{get_char}->();
741 return $self->{t};
742 #redo A;
743 } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
744 ## TODO: Should we consider matches of "(" and ")"?
745 $self->{t}->{type} = {
746 URI_TOKEN, URI_INVALID_TOKEN,
747 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
748 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
749 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
750 }->{$self->{t}->{type}};
751 $self->{state} = URI_UNQUOTED_STATE;
752 $self->{c} = $self->{get_char}->();
753 redo A;
754 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
755 $self->{state} = STRING_STATE; $q = $self->{c};
756 $self->{c} = $self->{get_char}->();
757 redo A;
758 } elsif ($self->{c} == 0x0029) { # )
759 $self->{state} = BEFORE_TOKEN_STATE;
760 $self->{c} = $self->{get_char}->();
761 return $self->{t};
762 #redo A;
763 } elsif ($self->{c} == 0x005C) { # \
764 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
765 $self->{c} = $self->{get_char}->();
766 redo A;
767 } else {
768 $self->{t}->{value} .= chr $self->{c};
769 $self->{state} = URI_UNQUOTED_STATE;
770 $self->{c} = $self->{get_char}->();
771 redo A;
772 }
773 } elsif ($self->{state} == URI_UNQUOTED_STATE) {
774 if ({
775 0x0020 => 1, # SP
776 0x0009 => 1, # \t
777 0x000D => 1, # \r
778 0x000A => 1, # \n
779 0x000C => 1, # \f
780 }->{$self->{c}}) {
781 $self->{state} = URI_AFTER_WSP_STATE;
782 $self->{c} = $self->{get_char}->();
783 redo A;
784 } elsif ($self->{c} == -1) {
785 $self->{t}->{type} = {
786 URI_TOKEN, URI_INVALID_TOKEN,
787 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
788 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
789 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
790 }->{$self->{t}->{type}};
791 $self->{state} = BEFORE_TOKEN_STATE;
792 $self->{c} = $self->{get_char}->();
793 return $self->{t};
794 #redo A;
795 } elsif ($self->{c} < 0x0020 or {
796 0x0022 => 1, # "
797 0x0027 => 1, # '
798 0x0028 => 1, # (
799 }->{$self->{c}}) { # C0 or (
800 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
801 $self->{t}->{type} = {
802 URI_TOKEN, URI_INVALID_TOKEN,
803 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
804 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
805 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
806 }->{$self->{t}->{type}};
807 # stay in the state.
808 $self->{c} = $self->{get_char}->();
809 redo A;
810 } elsif ($self->{c} == 0x0029) { # )
811 $self->{state} = BEFORE_TOKEN_STATE;
812 $self->{c} = $self->{get_char}->();
813 return $self->{t};
814 #redo A;
815 } elsif ($self->{c} == 0x005C) { # \
816 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
817 $self->{c} = $self->{get_char}->();
818 redo A;
819 } else {
820 $self->{t}->{value} .= chr $self->{c};
821 # stay in the state.
822 $self->{c} = $self->{get_char}->();
823 redo A;
824 }
825 } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
826 if ({
827 0x0020 => 1, # SP
828 0x0009 => 1, # \t
829 0x000D => 1, # \r
830 0x000A => 1, # \n
831 0x000C => 1, # \f
832 }->{$self->{c}}) {
833 # stay in the state.
834 $self->{c} = $self->{get_char}->();
835 redo A;
836 } elsif ($self->{c} == -1) {
837 $self->{t}->{type} = {
838 URI_TOKEN, URI_INVALID_TOKEN,
839 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
840 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
841 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
842 }->{$self->{t}->{type}};
843 $self->{state} = BEFORE_TOKEN_STATE;
844 $self->{c} = $self->{get_char}->();
845 return $self->{t};
846 #redo A;
847 } elsif ($self->{c} == 0x0029) { # )
848 $self->{state} = BEFORE_TOKEN_STATE;
849 $self->{c} = $self->{get_char}->();
850 return $self->{t};
851 #redo A;
852 } elsif ($self->{c} == 0x005C) { # \
853 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
854 $self->{c} = $self->{get_char}->();
855 redo A;
856 } else {
857 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
858 $self->{t}->{type} = {
859 URI_TOKEN, URI_INVALID_TOKEN,
860 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
861 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
862 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
863 }->{$self->{t}->{type}};
864 # stay in the state.
865 $self->{c} = $self->{get_char}->();
866 redo A;
867 }
868 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
869 $self->{t}->{has_escape} = 1;
870 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
871 ## NOTE: second character of |unicode| in |escape|.
872 $char = $self->{c} - 0x0030;
873 $self->{state} = ESCAPE_STATE; $i = 2;
874 $self->{c} = $self->{get_char}->();
875 redo A;
876 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
877 ## NOTE: second character of |unicode| in |escape|.
878 $char = $self->{c} - 0x0041 + 0xA;
879 $self->{state} = ESCAPE_STATE; $i = 2;
880 $self->{c} = $self->{get_char}->();
881 redo A;
882 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
883 ## NOTE: second character of |unicode| in |escape|.
884 $char = $self->{c} - 0x0061 + 0xA;
885 $self->{state} = ESCAPE_STATE; $i = 2;
886 $self->{c} = $self->{get_char}->();
887 redo A;
888 } elsif ($self->{c} == 0x000A or # \n
889 $self->{c} == 0x000C) { # \f
890 if ($q == 0) {
891 #
892 } elsif ($q == 1) {
893 ## NOTE: In |escape| in |URI|.
894 $self->{t}->{type} = {
895 URI_TOKEN, URI_INVALID_TOKEN,
896 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
897 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
898 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
899 }->{$self->{t}->{type}};
900 $self->{t}->{value} .= chr $self->{c};
901 $self->{state} = URI_UNQUOTED_STATE;
902 $self->{c} = $self->{get_char}->();
903 redo A;
904 } else {
905 ## Note: In |nl| in ... in |string| or |ident|.
906 $self->{state} = STRING_STATE;
907 $self->{c} = $self->{get_char}->();
908 redo A;
909 }
910 } elsif ($self->{c} == 0x000D) { # \r
911 if ($q == 0) {
912 #
913 } elsif ($q == 1) {
914 ## NOTE: In |escape| in |URI|.
915 $self->{t}->{type} = {
916 URI_TOKEN, URI_INVALID_TOKEN,
917 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
918 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
919 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
920 }->{$self->{t}->{type}};
921 $self->{state} = ESCAPE_BEFORE_LF_STATE;
922 $self->{c} = $self->{get_char}->();
923 redo A;
924 } else {
925 ## Note: In |nl| in ... in |string| or |ident|.
926 $self->{state} = ESCAPE_BEFORE_LF_STATE;
927 $self->{c} = $self->{get_char}->();
928 redo A;
929 }
930 } elsif ($self->{c} == -1) {
931 #
932 } else {
933 ## NOTE: second character of |escape|.
934 $self->{t}->{value} .= chr $self->{c};
935 $self->{state} = $q == 0 ? NAME_STATE :
936 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
937 $self->{c} = $self->{get_char}->();
938 redo A;
939 }
940
941 if ($q == 0) {
942 if ($self->{t}->{type} == DIMENSION_TOKEN) {
943 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
944 $self->{state} = BEFORE_TOKEN_STATE;
945 # reprocess
946 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
947 line => $self->{line},
948 column => $self->{column} - 2};
949 unshift @{$self->{token}}, {type => MINUS_TOKEN,
950 line => $self->{line},
951 column => $self->{column} - 1};
952 ## BUG: line and column might be wrong if they are on the
953 ## line boundary.
954 $self->{t}->{type} = NUMBER_TOKEN;
955 $self->{t}->{value} = '';
956 return $self->{t};
957 #redo A;
958 } elsif (length $self->{t}->{value}) {
959 $self->{state} = BEFORE_TOKEN_STATE;
960 # reprocess
961 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
962 line => $self->{line},
963 column => $self->{column} - 1};
964 ## BUG: line and column might be wrong if they are on the
965 ## line boundary.
966 return $self->{t};
967 #redo A;
968 } else {
969 $self->{state} = BEFORE_TOKEN_STATE;
970 # reprocess
971 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
972 line => $self->{line},
973 column => $self->{column} - 1};
974 ## BUG: line and column might be wrong if they are on the
975 ## line boundary.
976 $self->{t}->{type} = NUMBER_TOKEN;
977 $self->{t}->{value} = '';
978 return $self->{t};
979 #redo A;
980 }
981 } else {
982 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
983 $self->{state} = BEFORE_TOKEN_STATE;
984 # reprocess
985 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
986 line => $self->{line},
987 column => $self->{column} - 2};
988 return {type => MINUS_TOKEN,
989 line => $self->{line},
990 column => $self->{column} - 1};
991 ## BUG: line and column might be wrong if they are on the
992 ## line boundary.
993 #redo A;
994 } elsif (length $self->{t}->{value}) {
995 $self->{state} = BEFORE_TOKEN_STATE;
996 # reprocess
997 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
998 line => $self->{line},
999 column => $self->{column} - 1};
1000 ## BUG: line and column might be wrong if they are on the
1001 ## line boundary.
1002 return $self->{t};
1003 #redo A;
1004 } else {
1005 $self->{state} = BEFORE_TOKEN_STATE;
1006 # reprocess
1007 return {type => DELIM_TOKEN, value => '\\',
1008 line => $self->{line},
1009 column => $self->{column} - 1};
1010 ## BUG: line and column might be wrong if they are on the
1011 ## line boundary.
1012 #redo A;
1013 }
1014 }
1015 } elsif ($q == 1) {
1016 $self->{state} = URI_UNQUOTED_STATE;
1017 $self->{c} = $self->{get_char}->();
1018 redo A;
1019 } else {
1020 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1021 line => $self->{line},
1022 column => $self->{column} - 1};
1023 ## BUG: line and column might be wrong if they are on the
1024 ## line boundary.
1025 $self->{t}->{type} = {
1026 STRING_TOKEN, INVALID_TOKEN,
1027 URI_TOKEN, URI_INVALID_TOKEN,
1028 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1029 }->{$self->{t}->{type}} || $self->{t}->{type};
1030 $self->{state} = BEFORE_TOKEN_STATE;
1031 # reprocess
1032 return $self->{t};
1033 #redo A;
1034 }
1035 } elsif ($self->{state} == ESCAPE_STATE) {
1036 ## NOTE: third..seventh character of |unicode| in |escape|.
1037 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
1038 $char = $char * 0x10 + $self->{c} - 0x0030;
1039 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1040 $self->{c} = $self->{get_char}->();
1041 redo A;
1042 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
1043 $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
1044 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1045 $self->{c} = $self->{get_char}->();
1046 redo A;
1047 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
1048 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
1049 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1050 $self->{c} = $self->{get_char}->();
1051 redo A;
1052 } elsif ($self->{c} == 0x0020 or # SP
1053 $self->{c} == 0x000A or # \n
1054 $self->{c} == 0x0009 or # \t
1055 $self->{c} == 0x000C) { # \f
1056 $self->{t}->{value} .= chr $char;
1057 $self->{state} = $q == 0 ? NAME_STATE :
1058 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1059 $self->{c} = $self->{get_char}->();
1060 redo A;
1061 } elsif ($self->{c} == 0x000D) { # \r
1062 $self->{state} = ESCAPE_BEFORE_LF_STATE;
1063 $self->{c} = $self->{get_char}->();
1064 redo A;
1065 } else {
1066 $self->{t}->{value} .= chr $char;
1067 $self->{state} = $q == 0 ? NAME_STATE :
1068 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1069 # reconsume
1070 redo A;
1071 }
1072 } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
1073 ## NOTE: eighth character of |unicode| in |escape|.
1074 if ($self->{c} == 0x0020 or # SP
1075 $self->{c} == 0x000A or # \n
1076 $self->{c} == 0x0009 or # \t
1077 $self->{c} == 0x000C) { # \f
1078 $self->{t}->{value} .= chr $char;
1079 $self->{state} = $q == 0 ? NAME_STATE :
1080 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1081 $self->{c} = $self->{get_char}->();
1082 redo A;
1083 } elsif ($self->{c} == 0x000D) { # \r
1084 $self->{state} = ESCAPE_BEFORE_NL_STATE;
1085 $self->{c} = $self->{get_char}->();
1086 redo A;
1087 } else {
1088 $self->{t}->{value} .= chr $char;
1089 $self->{state} = $q == 0 ? NAME_STATE :
1090 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1091 # reconsume
1092 redo A;
1093 }
1094 } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1095 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1096 if ($self->{c} == 0x000A) { # \n
1097 $self->{state} = $q == 0 ? NAME_STATE :
1098 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1099 $self->{c} = $self->{get_char}->();
1100 redo A;
1101 } else {
1102 $self->{state} = $q == 0 ? NAME_STATE :
1103 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1104 # reprocess
1105 redo A;
1106 }
1107 } elsif ($self->{state} == STRING_STATE) {
1108 ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1109 ## a character in |invalid$Q| in |invalid| in |INVALID|,
1110 ## where |$Q = $q == 0x0022 ? 1 : 2|.
1111 ## Or, in |URI|.
1112 if ($self->{c} == 0x005C) { # \
1113 $self->{state} = ESCAPE_OPEN_STATE;
1114 $self->{c} = $self->{get_char}->();
1115 redo A;
1116 } elsif ($self->{c} == $q) { # " | '
1117 if ($self->{t}->{type} == STRING_TOKEN) {
1118 $self->{state} = BEFORE_TOKEN_STATE;
1119 $self->{c} = $self->{get_char}->();
1120 return $self->{t};
1121 #redo A;
1122 } else {
1123 $self->{state} = URI_AFTER_WSP_STATE;
1124 $self->{c} = $self->{get_char}->();
1125 redo A;
1126 }
1127 } elsif ($self->{c} == 0x000A or # \n
1128 $self->{c} == 0x000D or # \r
1129 $self->{c} == 0x000C or # \f
1130 $self->{c} == -1) {
1131 $self->{t}->{type} = {
1132 STRING_TOKEN, INVALID_TOKEN,
1133 INVALID_TOKEN, INVALID_TOKEN,
1134 URI_TOKEN, URI_INVALID_TOKEN,
1135 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1136 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1137 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1138 }->{$self->{t}->{type}};
1139 $self->{state} = BEFORE_TOKEN_STATE;
1140 # reconsume
1141 return $self->{t};
1142 #redo A;
1143 } else {
1144 $self->{t}->{value} .= chr $self->{c};
1145 # stay in the state
1146 $self->{c} = $self->{get_char}->();
1147 redo A;
1148 }
1149 } elsif ($self->{state} == NUMBER_STATE) {
1150 ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1151 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1152 $self->{t}->{value} .= chr $self->{c};
1153 # stay in the state
1154 $self->{c} = $self->{get_char}->();
1155 redo A;
1156 } elsif ($self->{c} == 0x002E) { # .
1157 $self->{state} = NUMBER_DOT_STATE;
1158 $self->{c} = $self->{get_char}->();
1159 redo A;
1160 } else {
1161 $self->{t}->{number} = $self->{t}->{value};
1162 $self->{t}->{value} = '';
1163 $self->{state} = AFTER_NUMBER_STATE;
1164 # reprocess
1165 redo A;
1166 }
1167 } elsif ($self->{state} == NUMBER_DOT_STATE) {
1168 ## NOTE: The character immediately following |.| in |num|.
1169 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1170 $self->{t}->{value} .= '.' . chr $self->{c};
1171 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1172 $self->{c} = $self->{get_char}->();
1173 redo A;
1174 } else {
1175 unshift @{$self->{token}}, {type => DOT_TOKEN};
1176 $self->{t}->{number} = $self->{t}->{value};
1177 $self->{t}->{value} = '';
1178 $self->{state} = BEFORE_TOKEN_STATE;
1179 # reprocess
1180 return $self->{t};
1181 #redo A;
1182 }
1183 } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1184 ## NOTE: The character immediately following |.| at the beginning of |num|.
1185 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1186 $self->{t}->{value} .= '.' . chr $self->{c};
1187 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1188 $self->{c} = $self->{get_char}->();
1189 redo A;
1190 } else {
1191 $self->{state} = BEFORE_TOKEN_STATE;
1192 # reprocess
1193 return {type => DOT_TOKEN,
1194 line => $self->{line}, column => $self->{column} - 1};
1195 ## BUG: line and column might be wrong if they are on the
1196 ## line boundary.
1197 #redo A;
1198 }
1199 } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1200 ## NOTE: |[0-9]| in |num| after |.|.
1201 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1202 $self->{t}->{value} .= chr $self->{c};
1203 # stay in the state
1204 $self->{c} = $self->{get_char}->();
1205 redo A;
1206 } else {
1207 $self->{t}->{number} = $self->{t}->{value};
1208 $self->{t}->{value} = '';
1209 $self->{state} = AFTER_NUMBER_STATE;
1210 # reprocess
1211 redo A;
1212 }
1213 } else {
1214 die "$0: Unknown state |$self->{state}|";
1215 }
1216 } # A
1217 } # get_next_token
1218
## serialize_token ($class_or_self, $t)
##
## Returns a human-readable string form of the token hash reference |$t|.
## The first argument (the invocant) is discarded, so this can be called
## either as a class method or as an instance method.
##
## The result is meant for diagnostics, not for round-tripping:
##
##   - |value| is emitted verbatim; no CSS escaping is applied.
##   - Strings (valid or not) are always written with |"|.
##   - Comments are reduced to |/**/| (or |/*| when unterminated) and
##     white space to a single SPACE.
##   - A token whose |type| is not recognized is written as the type
##     value wrapped in braces, e.g. |{99}|.
sub serialize_token ($$) {
  shift; # invocant is not used
  my $t = shift;

  ## NOTE: This function is not intended for roundtrip-able serialization.

  ## Tokens that carry a |value| and/or |number|.
  if ($t->{type} == IDENT_TOKEN) {
    return $t->{value};
  } elsif ($t->{type} == ATKEYWORD_TOKEN) {
    return '@' . $t->{value};
  } elsif ($t->{type} == HASH_TOKEN) {
    return '#' . $t->{value};
  } elsif ($t->{type} == FUNCTION_TOKEN) {
    return $t->{value} . '(';
  } elsif ($t->{type} == URI_TOKEN) {
    return 'url(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_INVALID_TOKEN) {
    ## The closing |)| was never seen, so it is not emitted.
    return 'url(' . $t->{value};
  } elsif ($t->{type} == URI_PREFIX_TOKEN) {
    return 'url-prefix(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_PREFIX_INVALID_TOKEN) {
    return 'url-prefix(' . $t->{value};
  } elsif ($t->{type} == STRING_TOKEN) {
    return '"' . $t->{value} . '"';
  } elsif ($t->{type} == INVALID_TOKEN) {
    ## Unterminated string: no closing quotation mark.
    return '"' . $t->{value};
  } elsif ($t->{type} == NUMBER_TOKEN) {
    return $t->{number};
  } elsif ($t->{type} == DIMENSION_TOKEN) {
    ## |number| holds the numeric part, |value| the unit identifier.
    return $t->{number} . $t->{value};
  } elsif ($t->{type} == PERCENTAGE_TOKEN) {
    return $t->{number} . '%';
  } elsif ($t->{type} == UNICODE_RANGE_TOKEN) {
    return 'U+' . $t->{value};
  } elsif ($t->{type} == DELIM_TOKEN) {
    return $t->{value};

  ## Tokens with a fixed textual form.
  } elsif ($t->{type} == PLUS_TOKEN) {
    return '+';
  } elsif ($t->{type} == GREATER_TOKEN) {
    return '>';
  } elsif ($t->{type} == COMMA_TOKEN) {
    return ',';
  } elsif ($t->{type} == TILDE_TOKEN) {
    return '~';
  } elsif ($t->{type} == DASHMATCH_TOKEN) {
    return '|=';
  } elsif ($t->{type} == PREFIXMATCH_TOKEN) {
    return '^=';
  } elsif ($t->{type} == SUFFIXMATCH_TOKEN) {
    return '$=';
  } elsif ($t->{type} == SUBSTRINGMATCH_TOKEN) {
    return '*=';
  } elsif ($t->{type} == INCLUDES_TOKEN) {
    return '~=';
  } elsif ($t->{type} == SEMICOLON_TOKEN) {
    return ';';
  } elsif ($t->{type} == LBRACE_TOKEN) {
    return '{';
  } elsif ($t->{type} == RBRACE_TOKEN) {
    return '}';
  } elsif ($t->{type} == LPAREN_TOKEN) {
    return '(';
  } elsif ($t->{type} == RPAREN_TOKEN) {
    return ')';
  } elsif ($t->{type} == LBRACKET_TOKEN) {
    return '[';
  } elsif ($t->{type} == RBRACKET_TOKEN) {
    return ']';
  } elsif ($t->{type} == S_TOKEN) {
    return ' ';
  } elsif ($t->{type} == CDO_TOKEN) {
    return '<!--';
  } elsif ($t->{type} == CDC_TOKEN) {
    return '-->';
  } elsif ($t->{type} == COMMENT_TOKEN) {
    return '/**/';
  } elsif ($t->{type} == COMMENT_INVALID_TOKEN) {
    return '/*';
  } elsif ($t->{type} == EOF_TOKEN) {
    return '{EOF}';
  } elsif ($t->{type} == MINUS_TOKEN) {
    return '-';
  } elsif ($t->{type} == STAR_TOKEN) {
    return '*';
  } elsif ($t->{type} == VBAR_TOKEN) {
    return '|';
  } elsif ($t->{type} == DOT_TOKEN) {
    ## The tokenizer emits DOT_TOKEN for a |.| that does not start a
    ## number; without this branch it was serialized as |{<type>}|.
    return '.';
  } elsif ($t->{type} == COLON_TOKEN) {
    return ':';
  } elsif ($t->{type} == MATCH_TOKEN) {
    return '=';
  } elsif ($t->{type} == EXCLAMATION_TOKEN) {
    return '!';
  } else {
    ## Unknown token type: expose the raw type value for debugging.
    return '{'.$t->{type}.'}';
  }
} # serialize_token
1315
1316 =head1 LICENSE
1317
1318 Copyright 2007 Wakaba <w@suika.fam.cx>
1319
1320 This library is free software; you can redistribute it
1321 and/or modify it under the same terms as Perl itself.
1322
1323 =cut
1324
1325 1;
1326 # $Date: 2008/01/20 04:02:25 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24