/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (show annotations) (download)
Sun Jan 20 04:02:25 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.16: +103 -6 lines
++ whatpm/Whatpm/CSS/ChangeLog	20 Jan 2008 04:02:20 -0000
2008-01-20  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm (parse_char_string): Revise |$tt->{get_char}| callback
	so that it sets |$tt->{line}| and |$tt->{column}| options.  Some
	error handler calling codes are modified for the experimental
	support for more precise reporting of error location.

	* Tokenizer.pm (new): The |onerror| option has been removed, since
	it was never used.
	(get_next_token): Limited and experimental support for token
	emission with the information on the position where it occurs.
	(serialize_token): New function.

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.16 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Exporter;
6 push our @ISA, 'Exporter';
7
## Tokenizer states.  Each name is a constant sub (empty prototype)
## returning a distinct integer in the range 0..19.  |init| stores
## |BEFORE_TOKEN_STATE| in |$self->{state}|, and |get_next_token|
## compares |$self->{state}| against these values with |==| to select
## the branch that handles the current input character.
8 sub BEFORE_TOKEN_STATE () { 0 }
9 sub BEFORE_NMSTART_STATE () { 1 }
10 sub NAME_STATE () { 2 }
11 sub ESCAPE_OPEN_STATE () { 3 }
12 sub STRING_STATE () { 4 }
13 sub HASH_OPEN_STATE () { 5 }
14 sub NUMBER_STATE () { 6 }
15 sub NUMBER_FRACTION_STATE () { 7 }
16 sub AFTER_NUMBER_STATE () { 8 }
17 sub URI_BEFORE_WSP_STATE () { 9 }
18 sub ESCAPE_STATE () { 10 }
19 sub ESCAPE_BEFORE_LF_STATE () { 11 }
20 sub ESCAPE_BEFORE_NL_STATE () { 12 }
21 sub NUMBER_DOT_STATE () { 13 }
22 sub NUMBER_DOT_NUMBER_STATE () { 14 }
23 sub DELIM_STATE () { 15 }
24 sub URI_UNQUOTED_STATE () { 16 }
25 sub URI_AFTER_WSP_STATE () { 17 }
26 sub AFTER_AT_STATE () { 18 }
27 sub AFTER_AT_HYPHEN_STATE () { 19 }
28
## Token type codes.  |get_next_token| places one of these in the
## |type| field of every token hash it returns (for example
## |{type => IDENT_TOKEN, value => '-'}|).
##
## NOTE: The values run from 1 to 45, but 15 is not assigned to any
## token type; |@TokenName| carries a |0| placeholder at index 15 (and
## at index 0) so that it can still be indexed directly by these codes.
29 sub IDENT_TOKEN () { 1 }
30 sub ATKEYWORD_TOKEN () { 2 }
31 sub HASH_TOKEN () { 3 }
32 sub FUNCTION_TOKEN () { 4 }
33 sub URI_TOKEN () { 5 }
34 sub URI_INVALID_TOKEN () { 6 }
35 sub URI_PREFIX_TOKEN () { 7 }
36 sub URI_PREFIX_INVALID_TOKEN () { 8 }
37 sub STRING_TOKEN () { 9 }
38 sub INVALID_TOKEN () { 10 }
39 sub NUMBER_TOKEN () { 11 }
40 sub DIMENSION_TOKEN () { 12 }
41 sub PERCENTAGE_TOKEN () { 13 }
42 sub UNICODE_RANGE_TOKEN () { 14 }
43 sub DELIM_TOKEN () { 16 }
44 sub PLUS_TOKEN () { 17 }
45 sub GREATER_TOKEN () { 18 }
46 sub COMMA_TOKEN () { 19 }
47 sub TILDE_TOKEN () { 20 }
48 sub DASHMATCH_TOKEN () { 21 }
49 sub PREFIXMATCH_TOKEN () { 22 }
50 sub SUFFIXMATCH_TOKEN () { 23 }
51 sub SUBSTRINGMATCH_TOKEN () { 24 }
52 sub INCLUDES_TOKEN () { 25 }
53 sub SEMICOLON_TOKEN () { 26 }
54 sub LBRACE_TOKEN () { 27 }
55 sub RBRACE_TOKEN () { 28 }
56 sub LPAREN_TOKEN () { 29 }
57 sub RPAREN_TOKEN () { 30 }
58 sub LBRACKET_TOKEN () { 31 }
59 sub RBRACKET_TOKEN () { 32 }
60 sub S_TOKEN () { 33 }
61 sub CDO_TOKEN () { 34 }
62 sub CDC_TOKEN () { 35 }
63 sub COMMENT_TOKEN () { 36 }
64 sub COMMENT_INVALID_TOKEN () { 37 }
65 sub EOF_TOKEN () { 38 }
66 sub MINUS_TOKEN () { 39 }
67 sub STAR_TOKEN () { 40 }
68 sub VBAR_TOKEN () { 41 }
69 sub DOT_TOKEN () { 42 }
70 sub COLON_TOKEN () { 43 }
71 sub MATCH_TOKEN () { 44 }
72 sub EXCLAMATION_TOKEN () { 45 }
73
## Token names without the |_TOKEN| suffix, positioned so that the
## index of each name equals the value of the corresponding |*_TOKEN|
## constant (|$TokenName[IDENT_TOKEN]| is |IDENT|, and so on).  The two
## |0| entries occupy indexes 0 and 15, which no token type uses.
## NOTE: Do not put comments inside the |qw()| list; they would become
## list elements and shift every following index.
74 our @TokenName = qw(
75 0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76 STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77 0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78 PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79 LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80 COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81 );
82
## Symbols that may be imported on request: every |*_TOKEN| constant
## defined in this package.  The |*_STATE| constants are not listed and
## therefore are not exportable.  Nothing is exported by default from
## this list; the |:token| tag defined below it is a copy of the whole
## list, for importing all token type constants at once.
83 our @EXPORT_OK = qw(
84 IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85 URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86 STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87 UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88 TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89 SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90 RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91 S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92 MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93 EXCLAMATION_TOKEN
94 );
95
96 our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97
## Constructor.  Takes the class name as its only argument and returns
## a new tokenizer object (a blessed hash reference) with two fields:
##
##   token    - An empty array; |get_next_token| returns (shifts) any
##              tokens queued here before it reads further input.
##   get_char - The input callback.  The default always returns -1,
##              the value |get_next_token| reports as |EOF_TOKEN|.
##              NOTE(review): callers presumably replace this with a
##              real character source before calling |init| - confirm
##              against the users of this class.
98 sub new ($) {
99 my $self = bless {token => [], get_char => sub { -1 }}, shift;
100 return $self;
101 } # new
102
## Prepares the tokenizer for reading: sets the state to
## |BEFORE_TOKEN_STATE| and calls the |get_char| callback once (with no
## arguments) to load the first input character into |$self->{c}|.
## The last expression evaluated is that assignment; no explicit
## return value is provided.
103 sub init ($) {
104 my $self = shift;
105 $self->{state} = BEFORE_TOKEN_STATE;
106 $self->{c} = $self->{get_char}->();
## The commented-out line below sketches the shape of |$self->{t}|,
## the token under construction that |get_next_token| fills in.
107 #$self->{t} = {type => token-type, value => value, number => number};
108 } # init
109
110 sub get_next_token ($) {
111 my $self = shift;
112 if (@{$self->{token}}) {
113 return shift @{$self->{token}};
114 }
115
116 my $char;
117 my $num; # |{num}|, if any.
118 my $i; # |$i + 1|th character in |unicode| in |escape|.
119 my $q;
120 ## NOTE:
121 ## 0: in |ident|.
122 ## 1: in |URI| outside of |string|.
123 ## 0x0022: in |string1| or |invalid1|.
124 ## 0x0027: in |string2| or |invalid2|.
125
126 A: {
127 if ($self->{state} == BEFORE_TOKEN_STATE) {
128 if ($self->{c} == 0x002D) { # -
129 ## NOTE: |-| in |ident| in |IDENT|
130 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
131 $self->{state} = BEFORE_NMSTART_STATE;
132 $self->{c} = $self->{get_char}->();
133 redo A;
134 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
135 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
136 $self->{c} = $self->{get_char}->();
137 if ($self->{c} == 0x002B) { # +
138 $self->{c} = $self->{get_char}->();
139 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
140 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142 $self->{c} == 0x003F) { # ?
143 $self->{t}->{value} = chr $self->{c};
144 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145 $self->{c} = $self->{get_char}->();
146 C: for (2..6) {
147 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
148 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
149 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
150 $self->{c} == 0x003F) { # ?
151 $self->{t}->{value} .= chr $self->{c};
152 $self->{c} = $self->{get_char}->();
153 } else {
154 last C;
155 }
156 } # C
157
158 if ($self->{c} == 0x002D) { # -
159 $self->{c} = $self->{get_char}->();
160 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
161 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
162 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
163 $self->{t}->{value} .= '-' . chr $self->{c};
164 $self->{c} = $self->{get_char}->();
165 C: for (2..6) {
166 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
167 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
168 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
169 $self->{t}->{value} .= chr $self->{c};
170 $self->{c} = $self->{get_char}->();
171 } else {
172 last C;
173 }
174 } # C
175
176 #
177 } else {
178 my $token = $self->{t};
179 $self->{t} = {type => IDENT_TOKEN, value => '-'};
180 $self->{state} = BEFORE_NMSTART_STATE;
181 # reprocess
182 return $token;
183 #redo A;
184 }
185 }
186
187 $self->{state} = BEFORE_TOKEN_STATE;
188 # reprocess
189 return $self->{t};
190 #redo A;
191 } else {
192 unshift @{$self->{token}}, {type => PLUS_TOKEN};
193 $self->{state} = BEFORE_TOKEN_STATE;
194 # reprocess
195 return $self->{t};
196 #redo A;
197 }
198 } else {
199 $self->{state} = NAME_STATE;
200 # reprocess
201 redo A;
202 }
203 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
204 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
205 $self->{c} == 0x005F or # _
206 $self->{c} > 0x007F) { # nonascii
207 ## NOTE: |nmstart| in |ident| in |IDENT|
208 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
209 $self->{state} = NAME_STATE;
210 $self->{c} = $self->{get_char}->();
211 redo A;
212 } elsif ($self->{c} == 0x005C) { # \
213 ## NOTE: |nmstart| in |ident| in |IDENT|
214 $self->{t} = {type => IDENT_TOKEN, value => ''};
215 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
216 $self->{c} = $self->{get_char}->();
217 redo A;
218 } elsif ($self->{c} == 0x0040) { # @
219 ## NOTE: |@| in |ATKEYWORD|
220 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
221 $self->{state} = AFTER_AT_STATE;
222 $self->{c} = $self->{get_char}->();
223 redo A;
224 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
225 $self->{t} = {type => STRING_TOKEN, value => ''};
226 $self->{state} = STRING_STATE; $q = $self->{c};
227 $self->{c} = $self->{get_char}->();
228 redo A;
229 } elsif ($self->{c} == 0x0023) { # #
230 ## NOTE: |#| in |HASH|.
231 $self->{t} = {type => HASH_TOKEN, value => ''};
232 $self->{state} = HASH_OPEN_STATE;
233 $self->{c} = $self->{get_char}->();
234 redo A;
235 } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
236 ## NOTE: |num|.
237 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
238 $self->{state} = NUMBER_STATE;
239 $self->{c} = $self->{get_char}->();
240 redo A;
241 } elsif ($self->{c} == 0x002E) { # .
242 ## NOTE: |num|.
243 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
244 $self->{state} = NUMBER_FRACTION_STATE;
245 $self->{c} = $self->{get_char}->();
246 redo A;
247 } elsif ($self->{c} == 0x002F) { # /
248 $self->{c} = $self->{get_char}->();
249 if ($self->{c} == 0x002A) { # *
250 C: {
251 $self->{c} = $self->{get_char}->();
252 if ($self->{c} == 0x002A) { # *
253 D: {
254 $self->{c} = $self->{get_char}->();
255 if ($self->{c} == 0x002F) { # /
256 #
257 } elsif ($self->{c} == 0x002A) { # *
258 redo D;
259 } else {
260 redo C;
261 }
262 } # D
263 } elsif ($self->{c} == -1) {
264 # stay in the state
265 # reprocess
266 return {type => COMMENT_INVALID_TOKEN};
267 #redo A;
268 } else {
269 redo C;
270 }
271 } # C
272
273 # stay in the state.
274 $self->{c} = $self->{get_char}->();
275 redo A;
276 } else {
277 # stay in the state.
278 # reprocess
279 return {type => DELIM_TOKEN, value => '/'};
280 #redo A;
281 }
282 } elsif ($self->{c} == 0x003C) { # <
283 ## NOTE: |CDO|
284 $self->{c} = $self->{get_char}->();
285 if ($self->{c} == 0x0021) { # !
286 $self->{c} = $self->{get_char}->();
287 if ($self->{c} == 0x002D) { # -
288 $self->{c} = $self->{get_char}->();
289 if ($self->{c} == 0x002D) { # -
290 $self->{state} = BEFORE_TOKEN_STATE;
291 $self->{c} = $self->{get_char}->();
292 return {type => CDO_TOKEN};
293 #redo A;
294 } else {
295 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296 ## NOTE: |-| in |ident| in |IDENT|
297 $self->{t} = {type => IDENT_TOKEN, value => '-'};
298 $self->{state} = BEFORE_NMSTART_STATE;
299 #reprocess
300 return {type => DELIM_TOKEN, value => '<'};
301 #redo A;
302 }
303 } else {
304 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305 $self->{state} = BEFORE_TOKEN_STATE;
306 #reprocess
307 return {type => DELIM_TOKEN, value => '<'};
308 #redo A;
309 }
310 } else {
311 $self->{state} = BEFORE_TOKEN_STATE;
312 #reprocess
313 return {type => DELIM_TOKEN, value => '<'};
314 #redo A;
315 }
316 } elsif (my $t = {
317 0x0021 => EXCLAMATION_TOKEN, # !
318 0x002D => MINUS_TOKEN, # -
319 0x002E => DOT_TOKEN, # .
320 0x003A => COLON_TOKEN, # :
321 0x003B => SEMICOLON_TOKEN, # ;
322 0x003D => MATCH_TOKEN, # =
323 0x007B => LBRACE_TOKEN, # {
324 0x007D => RBRACE_TOKEN, # }
325 0x0028 => LPAREN_TOKEN, # (
326 0x0029 => RPAREN_TOKEN, # )
327 0x005B => LBRACKET_TOKEN, # [
328 0x005D => RBRACKET_TOKEN, # ]
329 }->{$self->{c}}) {
330 my ($l, $c) = ($self->{line}, $self->{column});
331 # stay in the state
332 $self->{c} = $self->{get_char}->($self);
333 return {type => $t, line => $l, column => $c};
334 # redo A;
335 } elsif ({
336 0x0020 => 1, # SP
337 0x0009 => 1, # \t
338 0x000D => 1, # \r
339 0x000A => 1, # \n
340 0x000C => 1, # \f
341 }->{$self->{c}}) {
342 W: {
343 $self->{c} = $self->{get_char}->();
344 if ({
345 0x0020 => 1, # SP
346 0x0009 => 1, # \t
347 0x000D => 1, # \r
348 0x000A => 1, # \n
349 0x000C => 1, # \f
350 }->{$self->{c}}) {
351 redo W;
352 } elsif (my $v = {
353 0x002B => PLUS_TOKEN, # +
354 0x003E => GREATER_TOKEN, # >
355 0x002C => COMMA_TOKEN, # ,
356 0x007E => TILDE_TOKEN, # ~
357 }->{$self->{c}}) {
358 # stay in the state
359 $self->{c} = $self->{get_char}->();
360 return {type => $v};
361 #redo A;
362 } else {
363 # stay in the state
364 # reprocess
365 return {type => S_TOKEN};
366 #redo A;
367 }
368 } # W
369 } elsif (my $v = {
370 0x007C => DASHMATCH_TOKEN, # |
371 0x005E => PREFIXMATCH_TOKEN, # ^
372 0x0024 => SUFFIXMATCH_TOKEN, # $
373 0x002A => SUBSTRINGMATCH_TOKEN, # *
374 }->{$self->{c}}) {
375 my $c = $self->{c};
376 $self->{c} = $self->{get_char}->();
377 if ($self->{c} == 0x003D) { # =
378 # stay in the state
379 $self->{c} = $self->{get_char}->();
380 return {type => $v};
381 #redo A;
382 } elsif ($v = {
383 0x002A => STAR_TOKEN, # *
384 0x007C => VBAR_TOKEN, # |
385 }->{$c}) {
386 # stay in the state.
387 # reprocess
388 return {type => $v};
389 #redo A;
390 } else {
391 # stay in the state
392 # reprocess
393 return {type => DELIM_TOKEN, value => chr $c};
394 #redo A;
395 }
396 } elsif ($self->{c} == 0x002B) { # +
397 # stay in the state
398 $self->{c} = $self->{get_char}->();
399 return {type => PLUS_TOKEN};
400 #redo A;
401 } elsif ($self->{c} == 0x003E) { # >
402 # stay in the state
403 $self->{c} = $self->{get_char}->();
404 return {type => GREATER_TOKEN};
405 #redo A;
406 } elsif ($self->{c} == 0x002C) { # ,
407 # stay in the state
408 $self->{c} = $self->{get_char}->();
409 return {type => COMMA_TOKEN};
410 #redo A;
411 } elsif ($self->{c} == 0x007E) { # ~
412 $self->{c} = $self->{get_char}->();
413 if ($self->{c} == 0x003D) { # =
414 # stay in the state
415 $self->{c} = $self->{get_char}->();
416 return {type => INCLUDES_TOKEN};
417 #redo A;
418 } else {
419 # stay in the state
420 # reprocess
421 return {type => TILDE_TOKEN};
422 #redo A;
423 }
424 } elsif ($self->{c} == -1) {
425 # stay in the state
426 $self->{c} = $self->{get_char}->();
427 return {type => EOF_TOKEN};
428 #redo A;
429 } else {
430 # stay in the state
431 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432 $self->{c} = $self->{get_char}->();
433 return $self->{t};
434 #redo A;
435 }
436 } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
437 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
438 ## |FUNCTION|)
439 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
440 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441 $self->{c} == 0x005F or # _
442 $self->{c} > 0x007F) { # nonascii
443 $self->{t}->{value} .= chr $self->{c};
444 $self->{t}->{type} = DIMENSION_TOKEN
445 if $self->{t}->{type} == NUMBER_TOKEN;
446 $self->{state} = NAME_STATE;
447 $self->{c} = $self->{get_char}->();
448 redo A;
449 } elsif ($self->{c} == 0x005C) { # \
450 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451 $self->{c} = $self->{get_char}->();
452 redo A;
453 } elsif ($self->{c} == 0x002D) { # -
454 if ($self->{t}->{type} == IDENT_TOKEN) {
455 $self->{c} = $self->{get_char}->();
456 if ($self->{c} == 0x003E) { # >
457 $self->{state} = BEFORE_TOKEN_STATE;
458 $self->{c} = $self->{get_char}->();
459 return {type => CDC_TOKEN};
460 #redo A;
461 } else {
462 ## NOTE: |-|, |-|, $self->{c}
463 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464 # stay in the state
465 # reconsume
466 return {type => MINUS_TOKEN};
467 #redo A;
468 }
469 } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470 $self->{c} = $self->{get_char}->();
471 if ($self->{c} == 0x003E) { # >
472 unshift @{$self->{token}}, {type => CDC_TOKEN};
473 $self->{t}->{type} = NUMBER_TOKEN;
474 $self->{t}->{value} = '';
475 $self->{state} = BEFORE_TOKEN_STATE;
476 $self->{c} = $self->{get_char}->();
477 return $self->{t};
478 #redo A;
479 } else {
480 ## NOTE: |-|, |-|, $self->{c}
481 my $t = $self->{t};
482 $t->{type} = NUMBER_TOKEN;
483 $t->{value} = '';
484 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485 unshift @{$self->{token}}, {type => MINUS_TOKEN};
486 # stay in the state
487 # reconsume
488 return $t;
489 #redo A;
490 }
491 } else {
492 #
493 }
494 } else {
495 #
496 }
497
498 if ($self->{t}->{type} == DIMENSION_TOKEN) {
499 ## NOTE: |-| after |NUMBER|.
500 unshift @{$self->{token}}, {type => MINUS_TOKEN};
501 $self->{state} = BEFORE_TOKEN_STATE;
502 # reprocess
503 $self->{t}->{type} = NUMBER_TOKEN;
504 $self->{t}->{value} = '';
505 return $self->{t};
506 } else {
507 ## NOTE: |-| not followed by |nmstart|.
508 $self->{state} = BEFORE_TOKEN_STATE;
509 # reprocess
510 return {type => MINUS_TOKEN};
511 }
512 } elsif ($self->{state} == AFTER_AT_STATE) {
513 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515 $self->{c} == 0x005F or # _
516 $self->{c} > 0x007F) { # nonascii
517 $self->{t}->{value} .= chr $self->{c};
518 $self->{state} = NAME_STATE;
519 $self->{c} = $self->{get_char}->();
520 redo A;
521 } elsif ($self->{c} == 0x002D) { # -
522 $self->{t}->{value} .= '-';
523 $self->{state} = AFTER_AT_HYPHEN_STATE;
524 $self->{c} = $self->{get_char}->();
525 redo A;
526 } elsif ($self->{c} == 0x005C) { # \
527 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528 $self->{c} = $self->{get_char}->();
529 redo A;
530 } else {
531 $self->{state} = BEFORE_TOKEN_STATE;
532 # reprocess
533 return {type => DELIM_TOKEN, value => '@'};
534 }
535 } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
536 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
537 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538 $self->{c} == 0x005F or # _
539 $self->{c} > 0x007F) { # nonascii
540 $self->{t}->{value} .= chr $self->{c};
541 $self->{state} = NAME_STATE;
542 $self->{c} = $self->{get_char}->();
543 redo A;
544 } elsif ($self->{c} == 0x002D) { # -
545 $self->{c} = $self->{get_char}->();
546 if ($self->{c} == 0x003E) { # >
547 unshift @{$self->{token}}, {type => CDC_TOKEN};
548 $self->{state} = BEFORE_TOKEN_STATE;
549 $self->{c} = $self->{get_char}->();
550 return {type => DELIM_TOKEN, value => '@'};
551 #redo A;
552 } else {
553 unshift @{$self->{token}}, {type => MINUS_TOKEN};
554 $self->{t} = {type => IDENT_TOKEN, value => '-'};
555 $self->{state} = BEFORE_NMSTART_STATE;
556 # reprocess
557 return {type => DELIM_TOKEN, value => '@'};
558 #redo A;
559 }
560 } elsif ($self->{c} == 0x005C) { # \
561 ## TODO: @-\{nl}
562 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
563 $self->{c} = $self->{get_char}->();
564 redo A;
565 } else {
566 unshift @{$self->{token}}, {type => MINUS_TOKEN};
567 $self->{state} = BEFORE_TOKEN_STATE;
568 # reprocess
569 return {type => DELIM_TOKEN, value => '@'};
570 }
571 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572 if ($self->{c} == 0x002D) { # -
573 ## NOTE: |-| in |ident|.
574 $self->{t}->{hyphen} = 1;
575 $self->{t}->{value} = '-';
576 $self->{t}->{type} = DIMENSION_TOKEN;
577 $self->{state} = BEFORE_NMSTART_STATE;
578 $self->{c} = $self->{get_char}->();
579 redo A;
580 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
581 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
582 $self->{c} == 0x005F or # _
583 $self->{c} > 0x007F) { # nonascii
584 ## NOTE: |nmstart| in |ident|.
585 $self->{t}->{value} = chr $self->{c};
586 $self->{t}->{type} = DIMENSION_TOKEN;
587 $self->{state} = NAME_STATE;
588 $self->{c} = $self->{get_char}->();
589 redo A;
590 } elsif ($self->{c} == 0x005C) { # \
591 ## NOTE: |nmstart| in |ident| in |IDENT|
592 $self->{t}->{value} = '';
593 $self->{t}->{type} = DIMENSION_TOKEN;
594 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595 $self->{c} = $self->{get_char}->();
596 redo A;
597 } elsif ($self->{c} == 0x0025) { # %
598 $self->{t}->{type} = PERCENTAGE_TOKEN;
599 $self->{state} = BEFORE_TOKEN_STATE;
600 $self->{c} = $self->{get_char}->();
601 return $self->{t};
602 #redo A;
603 } else {
604 $self->{state} = BEFORE_TOKEN_STATE;
605 # reprocess
606 return $self->{t};
607 #redo A;
608 }
609 } elsif ($self->{state} == HASH_OPEN_STATE) {
610 ## NOTE: The first |nmchar| in |name| in |HASH|.
611 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
612 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
613 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
614 $self->{c} == 0x002D or # -
615 $self->{c} == 0x005F or # _
616 $self->{c} > 0x007F) { # nonascii
617 $self->{t}->{value} .= chr $self->{c};
618 $self->{state} = NAME_STATE;
619 $self->{c} = $self->{get_char}->();
620 redo A;
621 } elsif ($self->{c} == 0x005C) { # \
622 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
623 $self->{c} = $self->{get_char}->();
624 redo A;
625 } else {
626 $self->{state} = BEFORE_TOKEN_STATE;
627 # reprocess
628 return {type => DELIM_TOKEN, value => '#'};
629 #redo A;
630 }
631 } elsif ($self->{state} == NAME_STATE) {
632 ## NOTE: |nmchar| in (|ident| or |name|).
633 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
634 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
635 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
636 $self->{c} == 0x005F or # _
637 $self->{c} == 0x002D or # -
638 $self->{c} > 0x007F) { # nonascii
639 $self->{t}->{value} .= chr $self->{c};
640 # stay in the state
641 $self->{c} = $self->{get_char}->();
642 redo A;
643 } elsif ($self->{c} == 0x005C) { # \
644 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
645 $self->{c} = $self->{get_char}->();
646 redo A;
647 } elsif ($self->{c} == 0x0028 and # (
648 $self->{t}->{type} == IDENT_TOKEN) { # (
649 my $func_name = $self->{t}->{value};
650 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651 if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652 if ($self->{t}->{has_escape}) {
653 ## TODO: warn
654 }
655 $self->{t}->{type}
656 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657 $self->{t}->{value} = '';
658 $self->{state} = URI_BEFORE_WSP_STATE;
659 $self->{c} = $self->{get_char}->();
660 redo A;
661 } else {
662 $self->{t}->{type} = FUNCTION_TOKEN;
663 $self->{state} = BEFORE_TOKEN_STATE;
664 $self->{c} = $self->{get_char}->();
665 return $self->{t};
666 #redo A;
667 }
668 } else {
669 $self->{state} = BEFORE_TOKEN_STATE;
670 # reconsume
671 return $self->{t};
672 #redo A;
673 }
674 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
675 while ({
676 0x0020 => 1, # SP
677 0x0009 => 1, # \t
678 0x000D => 1, # \r
679 0x000A => 1, # \n
680 0x000C => 1, # \f
681 }->{$self->{c}}) {
682 $self->{c} = $self->{get_char}->();
683 }
684 if ($self->{c} == -1) {
685 $self->{t}->{type} = {
686 URI_TOKEN, URI_INVALID_TOKEN,
687 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690 }->{$self->{t}->{type}};
691 $self->{state} = BEFORE_TOKEN_STATE;
692 $self->{c} = $self->{get_char}->();
693 return $self->{t};
694 #redo A;
695 } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696 ## TODO: Should we consider matches of "(" and ")"?
697 $self->{t}->{type} = {
698 URI_TOKEN, URI_INVALID_TOKEN,
699 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702 }->{$self->{t}->{type}};
703 $self->{state} = URI_UNQUOTED_STATE;
704 $self->{c} = $self->{get_char}->();
705 redo A;
706 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
707 $self->{state} = STRING_STATE; $q = $self->{c};
708 $self->{c} = $self->{get_char}->();
709 redo A;
710 } elsif ($self->{c} == 0x0029) { # )
711 $self->{state} = BEFORE_TOKEN_STATE;
712 $self->{c} = $self->{get_char}->();
713 return $self->{t};
714 #redo A;
715 } elsif ($self->{c} == 0x005C) { # \
716 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717 $self->{c} = $self->{get_char}->();
718 redo A;
719 } else {
720 $self->{t}->{value} .= chr $self->{c};
721 $self->{state} = URI_UNQUOTED_STATE;
722 $self->{c} = $self->{get_char}->();
723 redo A;
724 }
725 } elsif ($self->{state} == URI_UNQUOTED_STATE) {
726 if ({
727 0x0020 => 1, # SP
728 0x0009 => 1, # \t
729 0x000D => 1, # \r
730 0x000A => 1, # \n
731 0x000C => 1, # \f
732 }->{$self->{c}}) {
733 $self->{state} = URI_AFTER_WSP_STATE;
734 $self->{c} = $self->{get_char}->();
735 redo A;
736 } elsif ($self->{c} == -1) {
737 $self->{t}->{type} = {
738 URI_TOKEN, URI_INVALID_TOKEN,
739 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742 }->{$self->{t}->{type}};
743 $self->{state} = BEFORE_TOKEN_STATE;
744 $self->{c} = $self->{get_char}->();
745 return $self->{t};
746 #redo A;
747 } elsif ($self->{c} < 0x0020 or {
748 0x0022 => 1, # "
749 0x0027 => 1, # '
750 0x0028 => 1, # (
751 }->{$self->{c}}) { # C0 or (
752 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753 $self->{t}->{type} = {
754 URI_TOKEN, URI_INVALID_TOKEN,
755 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758 }->{$self->{t}->{type}};
759 # stay in the state.
760 $self->{c} = $self->{get_char}->();
761 redo A;
762 } elsif ($self->{c} == 0x0029) { # )
763 $self->{state} = BEFORE_TOKEN_STATE;
764 $self->{c} = $self->{get_char}->();
765 return $self->{t};
766 #redo A;
767 } elsif ($self->{c} == 0x005C) { # \
768 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769 $self->{c} = $self->{get_char}->();
770 redo A;
771 } else {
772 $self->{t}->{value} .= chr $self->{c};
773 # stay in the state.
774 $self->{c} = $self->{get_char}->();
775 redo A;
776 }
777 } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
778 if ({
779 0x0020 => 1, # SP
780 0x0009 => 1, # \t
781 0x000D => 1, # \r
782 0x000A => 1, # \n
783 0x000C => 1, # \f
784 }->{$self->{c}}) {
785 # stay in the state.
786 $self->{c} = $self->{get_char}->();
787 redo A;
788 } elsif ($self->{c} == -1) {
789 $self->{t}->{type} = {
790 URI_TOKEN, URI_INVALID_TOKEN,
791 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794 }->{$self->{t}->{type}};
795 $self->{state} = BEFORE_TOKEN_STATE;
796 $self->{c} = $self->{get_char}->();
797 return $self->{t};
798 #redo A;
799 } elsif ($self->{c} == 0x0029) { # )
800 $self->{state} = BEFORE_TOKEN_STATE;
801 $self->{c} = $self->{get_char}->();
802 return $self->{t};
803 #redo A;
804 } elsif ($self->{c} == 0x005C) { # \
805 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
806 $self->{c} = $self->{get_char}->();
807 redo A;
808 } else {
809 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810 $self->{t}->{type} = {
811 URI_TOKEN, URI_INVALID_TOKEN,
812 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815 }->{$self->{t}->{type}};
816 # stay in the state.
817 $self->{c} = $self->{get_char}->();
818 redo A;
819 }
820 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821 $self->{t}->{has_escape} = 1;
822 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823 ## NOTE: second character of |unicode| in |escape|.
824 $char = $self->{c} - 0x0030;
825 $self->{state} = ESCAPE_STATE; $i = 2;
826 $self->{c} = $self->{get_char}->();
827 redo A;
828 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
829 ## NOTE: second character of |unicode| in |escape|.
830 $char = $self->{c} - 0x0041 + 0xA;
831 $self->{state} = ESCAPE_STATE; $i = 2;
832 $self->{c} = $self->{get_char}->();
833 redo A;
834 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835 ## NOTE: second character of |unicode| in |escape|.
836 $char = $self->{c} - 0x0061 + 0xA;
837 $self->{state} = ESCAPE_STATE; $i = 2;
838 $self->{c} = $self->{get_char}->();
839 redo A;
840 } elsif ($self->{c} == 0x000A or # \n
841 $self->{c} == 0x000C) { # \f
842 if ($q == 0) {
843 #
844 } elsif ($q == 1) {
845 ## NOTE: In |escape| in |URI|.
846 $self->{t}->{type} = {
847 URI_TOKEN, URI_INVALID_TOKEN,
848 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851 }->{$self->{t}->{type}};
852 $self->{t}->{value} .= chr $self->{c};
853 $self->{state} = URI_UNQUOTED_STATE;
854 $self->{c} = $self->{get_char}->();
855 redo A;
856 } else {
857 ## Note: In |nl| in ... in |string| or |ident|.
858 $self->{state} = STRING_STATE;
859 $self->{c} = $self->{get_char}->();
860 redo A;
861 }
862 } elsif ($self->{c} == 0x000D) { # \r
863 if ($q == 0) {
864 #
865 } elsif ($q == 1) {
866 ## NOTE: In |escape| in |URI|.
867 $self->{t}->{type} = {
868 URI_TOKEN, URI_INVALID_TOKEN,
869 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 }->{$self->{t}->{type}};
873 $self->{state} = ESCAPE_BEFORE_LF_STATE;
874 $self->{c} = $self->{get_char}->();
875 redo A;
876 } else {
877 ## Note: In |nl| in ... in |string| or |ident|.
878 $self->{state} = ESCAPE_BEFORE_LF_STATE;
879 $self->{c} = $self->{get_char}->();
880 redo A;
881 }
882 } elsif ($self->{c} == -1) {
883 #
884 } else {
885 ## NOTE: second character of |escape|.
886 $self->{t}->{value} .= chr $self->{c};
887 $self->{state} = $q == 0 ? NAME_STATE :
888 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889 $self->{c} = $self->{get_char}->();
890 redo A;
891 }
892
893 if ($q == 0) {
894 if ($self->{t}->{type} == DIMENSION_TOKEN) {
895 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896 $self->{state} = BEFORE_TOKEN_STATE;
897 # reprocess
898 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899 unshift @{$self->{token}}, {type => MINUS_TOKEN};
900 $self->{t}->{type} = NUMBER_TOKEN;
901 $self->{t}->{value} = '';
902 return $self->{t};
903 #redo A;
904 } elsif (length $self->{t}->{value}) {
905 $self->{state} = BEFORE_TOKEN_STATE;
906 # reprocess
907 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908 return $self->{t};
909 #redo A;
910 } else {
911 $self->{state} = BEFORE_TOKEN_STATE;
912 # reprocess
913 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914 $self->{t}->{type} = NUMBER_TOKEN;
915 $self->{t}->{value} = '';
916 return $self->{t};
917 #redo A;
918 }
919 } else {
920 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921 $self->{state} = BEFORE_TOKEN_STATE;
922 # reprocess
923 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924 return {type => MINUS_TOKEN};
925 #redo A;
926 } elsif (length $self->{t}->{value}) {
927 $self->{state} = BEFORE_TOKEN_STATE;
928 # reprocess
929 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930 return $self->{t};
931 #redo A;
932 } else {
933 $self->{state} = BEFORE_TOKEN_STATE;
934 # reprocess
935 return {type => DELIM_TOKEN, value => '\\'};
936 #redo A;
937 }
938 }
939 } elsif ($q == 1) {
940 $self->{state} = URI_UNQUOTED_STATE;
941 $self->{c} = $self->{get_char}->();
942 redo A;
943 } else {
944 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945 $self->{t}->{type} = {
946 STRING_TOKEN, INVALID_TOKEN,
947 URI_TOKEN, URI_INVALID_TOKEN,
948 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949 }->{$self->{t}->{type}} || $self->{t}->{type};
950 $self->{state} = BEFORE_TOKEN_STATE;
951 # reprocess
952 return $self->{t};
953 #redo A;
954 }
955 } elsif ($self->{state} == ESCAPE_STATE) {
956 ## NOTE: third..seventh character of |unicode| in |escape|.
957 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
958 $char = $char * 0x10 + $self->{c} - 0x0030;
959 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
960 $self->{c} = $self->{get_char}->();
961 redo A;
962 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
963 $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
964 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
965 $self->{c} = $self->{get_char}->();
966 redo A;
967 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970 $self->{c} = $self->{get_char}->();
971 redo A;
972 } elsif ($self->{c} == 0x0020 or # SP
973 $self->{c} == 0x000A or # \n
974 $self->{c} == 0x0009 or # \t
975 $self->{c} == 0x000C) { # \f
976 $self->{t}->{value} .= chr $char;
977 $self->{state} = $q == 0 ? NAME_STATE :
978 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979 $self->{c} = $self->{get_char}->();
980 redo A;
981 } elsif ($self->{c} == 0x000D) { # \r
982 $self->{state} = ESCAPE_BEFORE_LF_STATE;
983 $self->{c} = $self->{get_char}->();
984 redo A;
985 } else {
986 $self->{t}->{value} .= chr $char;
987 $self->{state} = $q == 0 ? NAME_STATE :
988 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989 # reconsume
990 redo A;
991 }
992 } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
993 ## NOTE: eighth character of |unicode| in |escape|.
994 if ($self->{c} == 0x0020 or # SP
995 $self->{c} == 0x000A or # \n
996 $self->{c} == 0x0009 or # \t
997 $self->{c} == 0x000C) { # \f
998 $self->{t}->{value} .= chr $char;
999 $self->{state} = $q == 0 ? NAME_STATE :
1000 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001 $self->{c} = $self->{get_char}->();
1002 redo A;
1003 } elsif ($self->{c} == 0x000D) { # \r
1004 $self->{state} = ESCAPE_BEFORE_NL_STATE;
1005 $self->{c} = $self->{get_char}->();
1006 redo A;
1007 } else {
1008 $self->{t}->{value} .= chr $char;
1009 $self->{state} = $q == 0 ? NAME_STATE :
1010 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011 # reconsume
1012 redo A;
1013 }
1014 } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016 if ($self->{c} == 0x000A) { # \n
1017 $self->{state} = $q == 0 ? NAME_STATE :
1018 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019 $self->{c} = $self->{get_char}->();
1020 redo A;
1021 } else {
1022 $self->{state} = $q == 0 ? NAME_STATE :
1023 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024 # reprocess
1025 redo A;
1026 }
1027 } elsif ($self->{state} == STRING_STATE) {
1028 ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1029 ## a character in |invalid$Q| in |invalid| in |INVALID|,
1030 ## where |$Q = $q == 0x0022 ? 1 : 2|.
1031 ## Or, in |URI|.
1032 if ($self->{c} == 0x005C) { # \
1033 $self->{state} = ESCAPE_OPEN_STATE;
1034 $self->{c} = $self->{get_char}->();
1035 redo A;
1036 } elsif ($self->{c} == $q) { # " | '
1037 if ($self->{t}->{type} == STRING_TOKEN) {
1038 $self->{state} = BEFORE_TOKEN_STATE;
1039 $self->{c} = $self->{get_char}->();
1040 return $self->{t};
1041 #redo A;
1042 } else {
1043 $self->{state} = URI_AFTER_WSP_STATE;
1044 $self->{c} = $self->{get_char}->();
1045 redo A;
1046 }
1047 } elsif ($self->{c} == 0x000A or # \n
1048 $self->{c} == 0x000D or # \r
1049 $self->{c} == 0x000C or # \f
1050 $self->{c} == -1) {
1051 $self->{t}->{type} = {
1052 STRING_TOKEN, INVALID_TOKEN,
1053 INVALID_TOKEN, INVALID_TOKEN,
1054 URI_TOKEN, URI_INVALID_TOKEN,
1055 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058 }->{$self->{t}->{type}};
1059 $self->{state} = BEFORE_TOKEN_STATE;
1060 # reconsume
1061 return $self->{t};
1062 #redo A;
1063 } else {
1064 $self->{t}->{value} .= chr $self->{c};
1065 # stay in the state
1066 $self->{c} = $self->{get_char}->();
1067 redo A;
1068 }
1069 } elsif ($self->{state} == NUMBER_STATE) {
1070 ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072 $self->{t}->{value} .= chr $self->{c};
1073 # stay in the state
1074 $self->{c} = $self->{get_char}->();
1075 redo A;
1076 } elsif ($self->{c} == 0x002E) { # .
1077 $self->{state} = NUMBER_DOT_STATE;
1078 $self->{c} = $self->{get_char}->();
1079 redo A;
1080 } else {
1081 $self->{t}->{number} = $self->{t}->{value};
1082 $self->{t}->{value} = '';
1083 $self->{state} = AFTER_NUMBER_STATE;
1084 # reprocess
1085 redo A;
1086 }
1087 } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088 ## NOTE: The character immediately following |.| in |num|.
1089 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090 $self->{t}->{value} .= '.' . chr $self->{c};
1091 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092 $self->{c} = $self->{get_char}->();
1093 redo A;
1094 } else {
1095 unshift @{$self->{token}}, {type => DOT_TOKEN};
1096 $self->{t}->{number} = $self->{t}->{value};
1097 $self->{t}->{value} = '';
1098 $self->{state} = BEFORE_TOKEN_STATE;
1099 # reprocess
1100 return $self->{t};
1101 #redo A;
1102 }
1103 } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104 ## NOTE: The character immediately following |.| at the beginning of |num|.
1105 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106 $self->{t}->{value} .= '.' . chr $self->{c};
1107 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108 $self->{c} = $self->{get_char}->();
1109 redo A;
1110 } else {
1111 $self->{state} = BEFORE_TOKEN_STATE;
1112 # reprocess
1113 return {type => DOT_TOKEN};
1114 #redo A;
1115 }
1116 } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117 ## NOTE: |[0-9]| in |num| after |.|.
1118 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119 $self->{t}->{value} .= chr $self->{c};
1120 # stay in the state
1121 $self->{c} = $self->{get_char}->();
1122 redo A;
1123 } else {
1124 $self->{t}->{number} = $self->{t}->{value};
1125 $self->{t}->{value} = '';
1126 $self->{state} = AFTER_NUMBER_STATE;
1127 # reprocess
1128 redo A;
1129 }
1130 } else {
1131 die "$0: Unknown state |$self->{state}|";
1132 }
1133 } # A
1134 } # get_next_token
1135
## serialize_token ($self_or_class, $token)
##
## Returns a textual representation of the token hash reference
## |$token|, chosen by |$token->{type}|.  The invocant is ignored.
##
## NOTE: This function is not intended for roundtrip-able
## serialization: no escaping is performed, and comment and
## white space tokens are reduced to fixed strings.
##
## A token whose type matches none of the rules below is rendered
## as its type wrapped in braces (|{TYPE}|).
sub serialize_token ($$) {
  my (undef, $token) = @_;

  my $type = $token->{type};

  ## Each rule is |[TOKEN_TYPE, OUTPUT]|.  |OUTPUT| is either a fixed
  ## string or a code reference that builds the string from the
  ## |value| and/or |number| fields of the token.  Rules are tried
  ## from top to bottom and the first matching type wins.
  my @rules = (
    [IDENT_TOKEN, sub { $token->{value} }],
    [ATKEYWORD_TOKEN, sub { '@' . $token->{value} }],
    [HASH_TOKEN, sub { '#' . $token->{value} }],
    [FUNCTION_TOKEN, sub { $token->{value} . '(' }],
    [URI_TOKEN, sub { 'url(' . $token->{value} . ')' }],
    [URI_INVALID_TOKEN, sub { 'url(' . $token->{value} }],
    [URI_PREFIX_TOKEN, sub { 'url-prefix(' . $token->{value} . ')' }],
    [URI_PREFIX_INVALID_TOKEN, sub { 'url-prefix(' . $token->{value} }],
    [STRING_TOKEN, sub { '"' . $token->{value} . '"' }],
    [INVALID_TOKEN, sub { '"' . $token->{value} }],
    [NUMBER_TOKEN, sub { $token->{number} }],
    [DIMENSION_TOKEN, sub { $token->{number} . $token->{value} }],
    [PERCENTAGE_TOKEN, sub { $token->{number} . '%' }],
    [UNICODE_RANGE_TOKEN, sub { 'U+' . $token->{value} }],
    [DELIM_TOKEN, sub { $token->{value} }],
    [PLUS_TOKEN, '+'],
    [GREATER_TOKEN, '>'],
    [COMMA_TOKEN, ','],
    [TILDE_TOKEN, '~'],
    [DASHMATCH_TOKEN, '|='],
    [PREFIXMATCH_TOKEN, '^='],
    [SUFFIXMATCH_TOKEN, '$='],
    [SUBSTRINGMATCH_TOKEN, '*='],
    [INCLUDES_TOKEN, '~='],
    [SEMICOLON_TOKEN, ';'],
    [LBRACE_TOKEN, '{'],
    [RBRACE_TOKEN, '}'],
    [LPAREN_TOKEN, '('],
    [RPAREN_TOKEN, ')'],
    [LBRACKET_TOKEN, '['],
    [RBRACKET_TOKEN, ']'],
    [S_TOKEN, ' '],
    [CDO_TOKEN, '<!--'],
    [CDC_TOKEN, '-->'],
    [COMMENT_TOKEN, '/**/'],
    [COMMENT_INVALID_TOKEN, '/*'],
    [EOF_TOKEN, '{EOF}'],
    [MINUS_TOKEN, '-'],
    [STAR_TOKEN, '*'],
    [VBAR_TOKEN, '|'],
    [COLON_TOKEN, ':'],
    [MATCH_TOKEN, '='],
    [EXCLAMATION_TOKEN, '!'],
  );

  for my $rule (@rules) {
    my ($rule_type, $output) = @$rule;
    next unless $type == $rule_type;
    return ref($output) ? $output->() : $output;
  }

  ## Unknown token type: expose the raw type for debugging.
  return '{' . $type . '}';
} # serialize_token
1232
1233 =head1 LICENSE
1234
1235 Copyright 2007 Wakaba <w@suika.fam.cx>
1236
1237 This library is free software; you can redistribute it
1238 and/or modify it under the same terms as Perl itself.
1239
1240 =cut
1241
1242 1;
1243 # $Date: 2007/10/17 10:46:26 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24