/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.20 - (show annotations) (download)
Sat Jan 26 14:48:09 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.19: +7 -5 lines
++ whatpm/t/ChangeLog	26 Jan 2008 14:46:58 -0000
	* css-font.dat: New test data for 'font-weight'
	and 'font-size' are added.

	* css-visual.dat: New test data for leading and
	trailing zeros are added.

2008-01-26  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/CSS/ChangeLog	26 Jan 2008 14:46:14 -0000
	* Parser.pm ('font-weight' parser): Support for '+'.

	* Tokenizer.pm: Normalize number stored in |NUMBER_TOKEN|,
	|PERCENTAGE_TOKEN|, and |DIMENSION_TOKEN|.

2008-01-26  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::CSS::Tokenizer;
2 use strict;
3 our $VERSION=do{my @r=(q$Revision: 1.19 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4
5 require Exporter;
6 push our @ISA, 'Exporter';
7
## Tokenizer states kept in |$self->{state}|.  Each name is a constant
## subroutine with an empty prototype.  The states are compared with
## |==|, so the numbers only have to be distinct from each other.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};
28
## Token type codes, stored in the |type| member of every token hash.
## Each name is a constant subroutine with an empty prototype.  The
## value doubles as the index of the token's name in |@TokenName|.
## NOTE: 15 is not assigned to any constant in this list.
use constant {
  ## Identifiers, names and functional notations.
  IDENT_TOKEN              => 1,
  ATKEYWORD_TOKEN          => 2,
  HASH_TOKEN               => 3,
  FUNCTION_TOKEN           => 4,
  URI_TOKEN                => 5,
  URI_INVALID_TOKEN        => 6,
  URI_PREFIX_TOKEN         => 7,
  URI_PREFIX_INVALID_TOKEN => 8,

  ## Strings, numbers and ranges.
  STRING_TOKEN             => 9,
  INVALID_TOKEN            => 10,
  NUMBER_TOKEN             => 11,
  DIMENSION_TOKEN          => 12,
  PERCENTAGE_TOKEN         => 13,
  UNICODE_RANGE_TOKEN      => 14,

  ## Delimiters, combinators and attribute match operators.
  DELIM_TOKEN              => 16,
  PLUS_TOKEN               => 17,
  GREATER_TOKEN            => 18,
  COMMA_TOKEN              => 19,
  TILDE_TOKEN              => 20,
  DASHMATCH_TOKEN          => 21,
  PREFIXMATCH_TOKEN        => 22,
  SUFFIXMATCH_TOKEN        => 23,
  SUBSTRINGMATCH_TOKEN     => 24,
  INCLUDES_TOKEN           => 25,

  ## Punctuation and brackets.
  SEMICOLON_TOKEN          => 26,
  LBRACE_TOKEN             => 27,
  RBRACE_TOKEN             => 28,
  LPAREN_TOKEN             => 29,
  RPAREN_TOKEN             => 30,
  LBRACKET_TOKEN           => 31,
  RBRACKET_TOKEN           => 32,

  ## White space, SGML comment delimiters, comments and end of input.
  S_TOKEN                  => 33,
  CDO_TOKEN                => 34,
  CDC_TOKEN                => 35,
  COMMENT_TOKEN            => 36,
  COMMENT_INVALID_TOKEN    => 37,
  EOF_TOKEN                => 38,

  ## Single-character tokens.
  MINUS_TOKEN              => 39,
  STAR_TOKEN               => 40,
  VBAR_TOKEN               => 41,
  DOT_TOKEN                => 42,
  COLON_TOKEN              => 43,
  MATCH_TOKEN              => 44,
  EXCLAMATION_TOKEN        => 45,
};
73
## Human-readable token names, indexed by the numeric token type code
## (|$TokenName[DELIM_TOKEN]| is |DELIM|, and so on).  Slots 0 and 15
## have no corresponding token constant and hold the string |0|.
our @TokenName = (
  '0',                   #  0: placeholder
  'IDENT',               #  1
  'ATKEYWORD',           #  2
  'HASH',                #  3
  'FUNCTION',            #  4
  'URI',                 #  5
  'URI_INVALID',         #  6
  'URI_PREFIX',          #  7
  'URI_PREFIX_INVALID',  #  8
  'STRING',              #  9
  'INVALID',             # 10
  'NUMBER',              # 11
  'DIMENSION',           # 12
  'PERCENTAGE',          # 13
  'UNICODE_RANGE',       # 14
  '0',                   # 15: placeholder
  'DELIM',               # 16
  'PLUS',                # 17
  'GREATER',             # 18
  'COMMA',               # 19
  'TILDE',               # 20
  'DASHMATCH',           # 21
  'PREFIXMATCH',         # 22
  'SUFFIXMATCH',         # 23
  'SUBSTRINGMATCH',      # 24
  'INCLUDES',            # 25
  'SEMICOLON',           # 26
  'LBRACE',              # 27
  'RBRACE',              # 28
  'LPAREN',              # 29
  'RPAREN',              # 30
  'LBRACKET',            # 31
  'RBRACKET',            # 32
  'S',                   # 33
  'CDO',                 # 34
  'CDC',                 # 35
  'COMMENT',             # 36
  'COMMENT_INVALID',     # 37
  'EOF',                 # 38
  'MINUS',               # 39
  'STAR',                # 40
  'VBAR',                # 41
  'DOT',                 # 42
  'COLON',               # 43
  'MATCH',               # 44
  'EXCLAMATION',         # 45
);
82
## Export lists.  Only the token type constants are exportable; they
## can be requested one by one or all together through the |:token|
## tag.  Nothing is exported by default.
our %EXPORT_TAGS = (
  token => [qw(
    IDENT_TOKEN
    ATKEYWORD_TOKEN
    HASH_TOKEN
    FUNCTION_TOKEN
    URI_TOKEN
    URI_INVALID_TOKEN
    URI_PREFIX_TOKEN
    URI_PREFIX_INVALID_TOKEN
    STRING_TOKEN
    INVALID_TOKEN
    NUMBER_TOKEN
    DIMENSION_TOKEN
    PERCENTAGE_TOKEN
    UNICODE_RANGE_TOKEN
    DELIM_TOKEN
    PLUS_TOKEN
    GREATER_TOKEN
    COMMA_TOKEN
    TILDE_TOKEN
    DASHMATCH_TOKEN
    PREFIXMATCH_TOKEN
    SUFFIXMATCH_TOKEN
    SUBSTRINGMATCH_TOKEN
    INCLUDES_TOKEN
    SEMICOLON_TOKEN
    LBRACE_TOKEN
    RBRACE_TOKEN
    LPAREN_TOKEN
    RPAREN_TOKEN
    LBRACKET_TOKEN
    RBRACKET_TOKEN
    S_TOKEN
    CDO_TOKEN
    CDC_TOKEN
    COMMENT_TOKEN
    COMMENT_INVALID_TOKEN
    EOF_TOKEN
    MINUS_TOKEN
    STAR_TOKEN
    VBAR_TOKEN
    DOT_TOKEN
    COLON_TOKEN
    MATCH_TOKEN
    EXCLAMATION_TOKEN
  )],
);

## |@EXPORT_OK| is a separate copy of the tag's list, not an alias.
our @EXPORT_OK = @{ $EXPORT_TAGS{token} };
97
## Creates a tokenizer object blessed into the invocant class.
##
## Initial members:
##   token    - empty array; |get_next_token| returns entries from it
##              before reading any further input.
##   get_char - code reference called to obtain each character code.
##              The default always answers -1, which |get_next_token|
##              reports as |EOF_TOKEN|.
sub new ($) {
  my ($class) = @_;
  my %members = (
    token    => [],
    get_char => sub { -1 },
  );
  return bless \%members, $class;
} # new
102
## Prepares the tokenizer for reading: puts the state machine into
## |BEFORE_TOKEN_STATE| and reads the first character code into
## |$self->{c}| by calling the |get_char| callback with the tokenizer
## as its only argument.  That character code is also the return value.
##
## The token under construction is kept in |$self->{t}|, shaped like
##   {type => token-type, value => value, number => number}
## and is not set up here.
sub init ($) {
  my ($tokenizer) = @_;
  $tokenizer->{state} = BEFORE_TOKEN_STATE;
  my $read_char = $tokenizer->{get_char};
  return $tokenizer->{c} = $read_char->($tokenizer);
} # init
109
110 sub get_next_token ($) {
111 my $self = shift;
112 if (@{$self->{token}}) {
113 return shift @{$self->{token}};
114 }
115
116 my $char;
117 my $num; # |{num}|, if any.
118 my $i; # |$i + 1|th character in |unicode| in |escape|.
119 my $q;
120 ## NOTE:
121 ## 0: in |ident|.
122 ## 1: in |URI| outside of |string|.
123 ## 0x0022: in |string1| or |invalid1|.
124 ## 0x0027: in |string2| or |invalid2|.
125
126 A: {
127 if ($self->{state} == BEFORE_TOKEN_STATE) {
128 if ($self->{c} == 0x002D) { # -
129 ## NOTE: |-| in |ident| in |IDENT|
130 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
131 line => $self->{line}, column => $self->{column}};
132 $self->{state} = BEFORE_NMSTART_STATE;
133 $self->{c} = $self->{get_char}->($self);
134 redo A;
135 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
137 line => $self->{line}, column => $self->{column}};
138 $self->{c} = $self->{get_char}->($self);
139 if ($self->{c} == 0x002B) { # +
140 my ($l, $c) = ($self->{line}, $self->{column});
141 $self->{c} = $self->{get_char}->($self);
142 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
145 $self->{c} == 0x003F) { # ?
146 $self->{t}->{value} = chr $self->{c};
147 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
148 $self->{c} = $self->{get_char}->($self);
149 C: for (2..6) {
150 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
151 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
152 (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
153 $self->{c} == 0x003F) { # ?
154 $self->{t}->{value} .= chr $self->{c};
155 $self->{c} = $self->{get_char}->($self);
156 } else {
157 last C;
158 }
159 } # C
160
161 if ($self->{c} == 0x002D) { # -
162 $self->{c} = $self->{get_char}->($self);
163 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
164 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
165 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
166 $self->{t}->{value} .= '-' . chr $self->{c};
167 $self->{c} = $self->{get_char}->($self);
168 C: for (2..6) {
169 if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
170 (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
171 (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
172 $self->{t}->{value} .= chr $self->{c};
173 $self->{c} = $self->{get_char}->($self);
174 } else {
175 last C;
176 }
177 } # C
178
179 #
180 } else {
181 my $token = $self->{t};
182 $self->{t} = {type => IDENT_TOKEN, value => '-',
183 line => $self->{line},
184 column => $self->{column}};
185 $self->{state} = BEFORE_NMSTART_STATE;
186 # reprocess
187 return $token;
188 #redo A;
189 }
190 }
191
192 $self->{state} = BEFORE_TOKEN_STATE;
193 # reprocess
194 return $self->{t};
195 #redo A;
196 } else {
197 unshift @{$self->{token}},
198 {type => PLUS_TOKEN, line => $l, column => $c};
199 $self->{state} = BEFORE_TOKEN_STATE;
200 # reprocess
201 return $self->{t};
202 #redo A;
203 }
204 } else {
205 $self->{state} = NAME_STATE;
206 # reprocess
207 redo A;
208 }
209 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
210 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
211 $self->{c} == 0x005F or # _
212 $self->{c} > 0x007F) { # nonascii
213 ## NOTE: |nmstart| in |ident| in |IDENT|
214 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
215 line => $self->{line}, column => $self->{column}};
216 $self->{state} = NAME_STATE;
217 $self->{c} = $self->{get_char}->($self);
218 redo A;
219 } elsif ($self->{c} == 0x005C) { # \
220 ## NOTE: |nmstart| in |ident| in |IDENT|
221 $self->{t} = {type => IDENT_TOKEN, value => '',
222 line => $self->{line}, column => $self->{column}};
223 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
224 $self->{c} = $self->{get_char}->($self);
225 redo A;
226 } elsif ($self->{c} == 0x0040) { # @
227 ## NOTE: |@| in |ATKEYWORD|
228 $self->{t} = {type => ATKEYWORD_TOKEN, value => '',
229 line => $self->{line}, column => $self->{column}};
230 $self->{state} = AFTER_AT_STATE;
231 $self->{c} = $self->{get_char}->($self);
232 redo A;
233 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
234 $self->{t} = {type => STRING_TOKEN, value => '',
235 line => $self->{line}, column => $self->{column}};
236 $self->{state} = STRING_STATE; $q = $self->{c};
237 $self->{c} = $self->{get_char}->($self);
238 redo A;
239 } elsif ($self->{c} == 0x0023) { # #
240 ## NOTE: |#| in |HASH|.
241 $self->{t} = {type => HASH_TOKEN, value => '',
242 line => $self->{line}, column => $self->{column}};
243 $self->{state} = HASH_OPEN_STATE;
244 $self->{c} = $self->{get_char}->($self);
245 redo A;
246 } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
247 ## NOTE: |num|.
248 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c},
249 line => $self->{line}, column => $self->{column}};
250 ## NOTE: 'value' is renamed as 'number' later.
251 $self->{state} = NUMBER_STATE;
252 $self->{c} = $self->{get_char}->($self);
253 redo A;
254 } elsif ($self->{c} == 0x002E) { # .
255 ## NOTE: |num|.
256 $self->{t} = {type => NUMBER_TOKEN, value => '0',
257 line => $self->{line}, column => $self->{column}};
258 ## NOTE: 'value' is renamed as 'number' later.
259 $self->{state} = NUMBER_FRACTION_STATE;
260 $self->{c} = $self->{get_char}->($self);
261 redo A;
262 } elsif ($self->{c} == 0x002F) { # /
263 my ($l, $c) = ($self->{line}, $self->{column});
264 $self->{c} = $self->{get_char}->($self);
265 if ($self->{c} == 0x002A) { # *
266 C: {
267 $self->{c} = $self->{get_char}->($self);
268 if ($self->{c} == 0x002A) { # *
269 D: {
270 $self->{c} = $self->{get_char}->($self);
271 if ($self->{c} == 0x002F) { # /
272 #
273 } elsif ($self->{c} == 0x002A) { # *
274 redo D;
275 } else {
276 redo C;
277 }
278 } # D
279 } elsif ($self->{c} == -1) {
280 # stay in the state
281 # reprocess
282 return {type => COMMENT_INVALID_TOKEN};
283 #redo A;
284 } else {
285 redo C;
286 }
287 } # C
288
289 # stay in the state.
290 $self->{c} = $self->{get_char}->($self);
291 redo A;
292 } else {
293 # stay in the state.
294 # reprocess
295 return {type => DELIM_TOKEN, value => '/', line => $l, column => $c};
296 #redo A;
297 }
298 } elsif ($self->{c} == 0x003C) { # <
299 my ($l, $c) = ($self->{line}, $self->{column});
300 ## NOTE: |CDO|
301 $self->{c} = $self->{get_char}->($self);
302 if ($self->{c} == 0x0021) { # !
303 $self->{c} = $self->{get_char}->($self);
304 if ($self->{c} == 0x002D) { # -
305 $self->{c} = $self->{get_char}->($self);
306 if ($self->{c} == 0x002D) { # -
307 $self->{state} = BEFORE_TOKEN_STATE;
308 $self->{c} = $self->{get_char}->($self);
309 return {type => CDO_TOKEN, line => $l, column => $c};
310 #redo A;
311 } else {
312 unshift @{$self->{token}},
313 {type => EXCLAMATION_TOKEN, line => $l, column => $c + 1};
314 ## NOTE: |-| in |ident| in |IDENT|
315 $self->{t} = {type => IDENT_TOKEN, value => '-',
316 line => $l, column => $c + 2};
317 $self->{state} = BEFORE_NMSTART_STATE;
318 #reprocess
319 return {type => DELIM_TOKEN, value => '<',
320 line => $l, column => $c};
321 #redo A;
322 }
323 } else {
324 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN,
325 line => $l, column => $c + 1};
326 $self->{state} = BEFORE_TOKEN_STATE;
327 #reprocess
328 return {type => DELIM_TOKEN, value => '<',
329 line => $l, column => $c};
330 #redo A;
331 }
332 } else {
333 $self->{state} = BEFORE_TOKEN_STATE;
334 #reprocess
335 return {type => DELIM_TOKEN, value => '<',
336 line => $l, column => $c};
337 #redo A;
338 }
339 } elsif (my $t = {
340 0x0021 => EXCLAMATION_TOKEN, # !
341 0x002D => MINUS_TOKEN, # -
342 0x002E => DOT_TOKEN, # .
343 0x003A => COLON_TOKEN, # :
344 0x003B => SEMICOLON_TOKEN, # ;
345 0x003D => MATCH_TOKEN, # =
346 0x007B => LBRACE_TOKEN, # {
347 0x007D => RBRACE_TOKEN, # }
348 0x0028 => LPAREN_TOKEN, # (
349 0x0029 => RPAREN_TOKEN, # )
350 0x005B => LBRACKET_TOKEN, # [
351 0x005D => RBRACKET_TOKEN, # ]
352 }->{$self->{c}}) {
353 my ($l, $c) = ($self->{line}, $self->{column});
354 # stay in the state
355 $self->{c} = $self->{get_char}->($self);
356 return {type => $t, line => $l, column => $c};
357 # redo A;
358 } elsif ({
359 0x0020 => 1, # SP
360 0x0009 => 1, # \t
361 0x000D => 1, # \r
362 0x000A => 1, # \n
363 0x000C => 1, # \f
364 }->{$self->{c}}) {
365 my ($l, $c) = ($self->{line}, $self->{column});
366 W: {
367 $self->{c} = $self->{get_char}->($self);
368 if ({
369 0x0020 => 1, # SP
370 0x0009 => 1, # \t
371 0x000D => 1, # \r
372 0x000A => 1, # \n
373 0x000C => 1, # \f
374 }->{$self->{c}}) {
375 redo W;
376 } elsif (my $v = {
377 0x002B => PLUS_TOKEN, # +
378 0x003E => GREATER_TOKEN, # >
379 0x002C => COMMA_TOKEN, # ,
380 0x007E => TILDE_TOKEN, # ~
381 }->{$self->{c}}) {
382 my ($l, $c) = ($self->{line}, $self->{column});
383 # stay in the state
384 $self->{c} = $self->{get_char}->($self);
385 return {type => $v, line => $l, column => $c};
386 #redo A;
387 } else {
388 # stay in the state
389 # reprocess
390 return {type => S_TOKEN, line => $l, column => $c};
391 #redo A;
392 }
393 } # W
394 } elsif (my $v = {
395 0x007C => DASHMATCH_TOKEN, # |
396 0x005E => PREFIXMATCH_TOKEN, # ^
397 0x0024 => SUFFIXMATCH_TOKEN, # $
398 0x002A => SUBSTRINGMATCH_TOKEN, # *
399 }->{$self->{c}}) {
400 my ($line, $column) = ($self->{line}, $self->{column});
401 my $c = $self->{c};
402 $self->{c} = $self->{get_char}->($self);
403 if ($self->{c} == 0x003D) { # =
404 # stay in the state
405 $self->{c} = $self->{get_char}->($self);
406 return {type => $v, line => $line, column => $column};
407 #redo A;
408 } elsif ($v = {
409 0x002A => STAR_TOKEN, # *
410 0x007C => VBAR_TOKEN, # |
411 }->{$c}) {
412 # stay in the state.
413 # reprocess
414 return {type => $v, line => $line, column => $column};
415 #redo A;
416 } else {
417 # stay in the state
418 # reprocess
419 return {type => DELIM_TOKEN, value => chr $c,
420 line => $line, column => $column};
421 #redo A;
422 }
423 } elsif ($self->{c} == 0x002B) { # +
424 my ($l, $c) = ($self->{line}, $self->{column});
425 # stay in the state
426 $self->{c} = $self->{get_char}->($self);
427 return {type => PLUS_TOKEN, line => $l, column => $c};
428 #redo A;
429 } elsif ($self->{c} == 0x003E) { # >
430 my ($l, $c) = ($self->{line}, $self->{column});
431 # stay in the state
432 $self->{c} = $self->{get_char}->($self);
433 return {type => GREATER_TOKEN, line => $l, column => $c};
434 #redo A;
435 } elsif ($self->{c} == 0x002C) { # ,
436 my ($l, $c) = ($self->{line}, $self->{column});
437 # stay in the state
438 $self->{c} = $self->{get_char}->($self);
439 return {type => COMMA_TOKEN, line => $l, column => $c};
440 #redo A;
441 } elsif ($self->{c} == 0x007E) { # ~
442 my ($l, $c) = ($self->{line}, $self->{column});
443 $self->{c} = $self->{get_char}->($self);
444 if ($self->{c} == 0x003D) { # =
445 # stay in the state
446 $self->{c} = $self->{get_char}->($self);
447 return {type => INCLUDES_TOKEN, line => $l, column => $c};
448 #redo A;
449 } else {
450 # stay in the state
451 # reprocess
452 return {type => TILDE_TOKEN, line => $l, column => $c};
453 #redo A;
454 }
455 } elsif ($self->{c} == -1) {
456 # stay in the state
457 $self->{c} = $self->{get_char}->($self);
458 return {type => EOF_TOKEN,
459 line => $self->{line}, column => $self->{column}};
460 #redo A;
461 } else {
462 # stay in the state
463 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c},
464 line => $self->{line}, column => $self->{column}};
465 $self->{c} = $self->{get_char}->($self);
466 return $self->{t};
467 #redo A;
468 }
469 } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
470 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
471 ## |FUNCTION|)
472 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
473 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
474 $self->{c} == 0x005F or # _
475 $self->{c} > 0x007F) { # nonascii
476 $self->{t}->{value} .= chr $self->{c};
477 $self->{t}->{type} = DIMENSION_TOKEN
478 if $self->{t}->{type} == NUMBER_TOKEN;
479 $self->{state} = NAME_STATE;
480 $self->{c} = $self->{get_char}->($self);
481 redo A;
482 } elsif ($self->{c} == 0x005C) { # \
483 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
484 $self->{c} = $self->{get_char}->($self);
485 redo A;
486 } elsif ($self->{c} == 0x002D) { # -
487 if ($self->{t}->{type} == IDENT_TOKEN) {
488 $self->{c} = $self->{get_char}->($self);
489 if ($self->{c} == 0x003E) { # >
490 $self->{state} = BEFORE_TOKEN_STATE;
491 $self->{c} = $self->{get_char}->($self);
492 return {type => CDC_TOKEN,
493 line => $self->{t}->{line},
494 column => $self->{t}->{column}};
495 #redo A;
496 } else {
497 ## NOTE: |-|, |-|, $self->{c}
498 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
499 $self->{t}->{column}++;
500 # stay in the state
501 # reconsume
502 return {type => MINUS_TOKEN,
503 line => $self->{t}->{line},
504 column => $self->{t}->{column} - 1};
505 #redo A;
506 }
507 } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
508 my ($l, $c) = ($self->{line}, $self->{column}); # second '-'
509 $self->{c} = $self->{get_char}->($self);
510 if ($self->{c} == 0x003E) { # >
511 unshift @{$self->{token}}, {type => CDC_TOKEN};
512 $self->{t}->{type} = NUMBER_TOKEN;
513 $self->{t}->{value} = '';
514 $self->{state} = BEFORE_TOKEN_STATE;
515 $self->{c} = $self->{get_char}->($self);
516 return $self->{t};
517 #redo A;
518 } else {
519 ## NOTE: NUMBER, |-|, |-|, $self->{c}
520 my $t = $self->{t};
521 $t->{type} = NUMBER_TOKEN;
522 $t->{value} = '';
523 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
524 line => $l, column => $c};
525 unshift @{$self->{token}}, {type => MINUS_TOKEN,
526 line => $l, column => $c - 1};
527 # stay in the state
528 # reconsume
529 return $t;
530 #redo A;
531 }
532 } else {
533 #
534 }
535 } else {
536 #
537 }
538
539 if ($self->{t}->{type} == DIMENSION_TOKEN) {
540 ## NOTE: |-| after |NUMBER|.
541 unshift @{$self->{token}}, {type => MINUS_TOKEN,
542 line => $self->{line},
543 column => $self->{column} - 1};
544 ## BUG: column might be wrong if on the line boundary.
545 $self->{state} = BEFORE_TOKEN_STATE;
546 # reprocess
547 $self->{t}->{type} = NUMBER_TOKEN;
548 $self->{t}->{value} = '';
549 return $self->{t};
550 } else {
551 ## NOTE: |-| not followed by |nmstart|.
552 $self->{state} = BEFORE_TOKEN_STATE;
553 # reprocess
554 return {type => MINUS_TOKEN,
555 line => $self->{line}, column => $self->{column} - 1};
556 ## BUG: column might be wrong if on the line boundary.
557 }
558 } elsif ($self->{state} == AFTER_AT_STATE) {
559 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
560 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
561 $self->{c} == 0x005F or # _
562 $self->{c} > 0x007F) { # nonascii
563 $self->{t}->{value} .= chr $self->{c};
564 $self->{state} = NAME_STATE;
565 $self->{c} = $self->{get_char}->($self);
566 redo A;
567 } elsif ($self->{c} == 0x002D) { # -
568 $self->{t}->{value} .= '-';
569 $self->{state} = AFTER_AT_HYPHEN_STATE;
570 $self->{c} = $self->{get_char}->($self);
571 redo A;
572 } elsif ($self->{c} == 0x005C) { # \
573 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
574 $self->{c} = $self->{get_char}->($self);
575 redo A;
576 } else {
577 $self->{state} = BEFORE_TOKEN_STATE;
578 # reprocess
579 return {type => DELIM_TOKEN, value => '@',
580 line => $self->{t}->{line},
581 column => $self->{t}->{column}};
582 }
583 } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
584 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
585 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
586 $self->{c} == 0x005F or # _
587 $self->{c} > 0x007F) { # nonascii
588 $self->{t}->{value} .= chr $self->{c};
589 $self->{state} = NAME_STATE;
590 $self->{c} = $self->{get_char}->($self);
591 redo A;
592 } elsif ($self->{c} == 0x002D) { # -
593 $self->{c} = $self->{get_char}->($self);
594 if ($self->{c} == 0x003E) { # >
595 unshift @{$self->{token}}, {type => CDC_TOKEN};
596 $self->{state} = BEFORE_TOKEN_STATE;
597 $self->{c} = $self->{get_char}->($self);
598 return {type => DELIM_TOKEN, value => '@'};
599 #redo A;
600 } else {
601 unshift @{$self->{token}}, {type => MINUS_TOKEN};
602 $self->{t} = {type => IDENT_TOKEN, value => '-'};
603 $self->{state} = BEFORE_NMSTART_STATE;
604 # reprocess
605 return {type => DELIM_TOKEN, value => '@'};
606 #redo A;
607 }
608 } elsif ($self->{c} == 0x005C) { # \
609 ## TODO: @-\{nl}
610 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
611 $self->{c} = $self->{get_char}->($self);
612 redo A;
613 } else {
614 unshift @{$self->{token}}, {type => MINUS_TOKEN};
615 $self->{state} = BEFORE_TOKEN_STATE;
616 # reprocess
617 return {type => DELIM_TOKEN, value => '@'};
618 }
619 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
620 if ($self->{c} == 0x002D) { # -
621 ## NOTE: |-| in |ident|.
622 $self->{t}->{hyphen} = 1;
623 $self->{t}->{value} = '-';
624 $self->{t}->{type} = DIMENSION_TOKEN;
625 $self->{state} = BEFORE_NMSTART_STATE;
626 $self->{c} = $self->{get_char}->($self);
627 redo A;
628 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
629 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
630 $self->{c} == 0x005F or # _
631 $self->{c} > 0x007F) { # nonascii
632 ## NOTE: |nmstart| in |ident|.
633 $self->{t}->{value} = chr $self->{c};
634 $self->{t}->{type} = DIMENSION_TOKEN;
635 $self->{state} = NAME_STATE;
636 $self->{c} = $self->{get_char}->($self);
637 redo A;
638 } elsif ($self->{c} == 0x005C) { # \
639 ## NOTE: |nmstart| in |ident| in |IDENT|
640 $self->{t}->{value} = '';
641 $self->{t}->{type} = DIMENSION_TOKEN;
642 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
643 $self->{c} = $self->{get_char}->($self);
644 redo A;
645 } elsif ($self->{c} == 0x0025) { # %
646 $self->{t}->{type} = PERCENTAGE_TOKEN;
647 $self->{state} = BEFORE_TOKEN_STATE;
648 $self->{c} = $self->{get_char}->($self);
649 return $self->{t};
650 #redo A;
651 } else {
652 $self->{state} = BEFORE_TOKEN_STATE;
653 # reprocess
654 return $self->{t};
655 #redo A;
656 }
657 } elsif ($self->{state} == HASH_OPEN_STATE) {
658 ## NOTE: The first |nmchar| in |name| in |HASH|.
659 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
660 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
661 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
662 $self->{c} == 0x002D or # -
663 $self->{c} == 0x005F or # _
664 $self->{c} > 0x007F) { # nonascii
665 $self->{t}->{value} .= chr $self->{c};
666 $self->{state} = NAME_STATE;
667 $self->{c} = $self->{get_char}->($self);
668 redo A;
669 } elsif ($self->{c} == 0x005C) { # \
670 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
671 $self->{c} = $self->{get_char}->($self);
672 redo A;
673 } else {
674 $self->{state} = BEFORE_TOKEN_STATE;
675 # reprocess
676 return {type => DELIM_TOKEN, value => '#',
677 line => $self->{t}->{line},
678 column => $self->{t}->{column}};
679 #redo A;
680 }
681 } elsif ($self->{state} == NAME_STATE) {
682 ## NOTE: |nmchar| in (|ident| or |name|).
683 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
684 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
685 (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
686 $self->{c} == 0x005F or # _
687 $self->{c} == 0x002D or # -
688 $self->{c} > 0x007F) { # nonascii
689 $self->{t}->{value} .= chr $self->{c};
690 # stay in the state
691 $self->{c} = $self->{get_char}->($self);
692 redo A;
693 } elsif ($self->{c} == 0x005C) { # \
694 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
695 $self->{c} = $self->{get_char}->($self);
696 redo A;
697 } elsif ($self->{c} == 0x0028 and # (
698 $self->{t}->{type} == IDENT_TOKEN) { # (
699 my $func_name = $self->{t}->{value};
700 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
701 if ($func_name eq 'url' or $func_name eq 'url-prefix') {
702 if ($self->{t}->{has_escape}) {
703 ## TODO: warn
704 }
705 $self->{t}->{type}
706 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
707 $self->{t}->{value} = '';
708 $self->{state} = URI_BEFORE_WSP_STATE;
709 $self->{c} = $self->{get_char}->($self);
710 redo A;
711 } else {
712 $self->{t}->{type} = FUNCTION_TOKEN;
713 $self->{state} = BEFORE_TOKEN_STATE;
714 $self->{c} = $self->{get_char}->($self);
715 return $self->{t};
716 #redo A;
717 }
718 } else {
719 $self->{state} = BEFORE_TOKEN_STATE;
720 # reconsume
721 return $self->{t};
722 #redo A;
723 }
724 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
725 while ({
726 0x0020 => 1, # SP
727 0x0009 => 1, # \t
728 0x000D => 1, # \r
729 0x000A => 1, # \n
730 0x000C => 1, # \f
731 }->{$self->{c}}) {
732 $self->{c} = $self->{get_char}->($self);
733 }
734 if ($self->{c} == -1) {
735 $self->{t}->{type} = {
736 URI_TOKEN, URI_INVALID_TOKEN,
737 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
738 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
739 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
740 }->{$self->{t}->{type}};
741 $self->{state} = BEFORE_TOKEN_STATE;
742 $self->{c} = $self->{get_char}->($self);
743 return $self->{t};
744 #redo A;
745 } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
746 ## TODO: Should we consider matches of "(" and ")"?
747 $self->{t}->{type} = {
748 URI_TOKEN, URI_INVALID_TOKEN,
749 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
750 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
751 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
752 }->{$self->{t}->{type}};
753 $self->{state} = URI_UNQUOTED_STATE;
754 $self->{c} = $self->{get_char}->($self);
755 redo A;
756 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
757 $self->{state} = STRING_STATE; $q = $self->{c};
758 $self->{c} = $self->{get_char}->($self);
759 redo A;
760 } elsif ($self->{c} == 0x0029) { # )
761 $self->{state} = BEFORE_TOKEN_STATE;
762 $self->{c} = $self->{get_char}->($self);
763 return $self->{t};
764 #redo A;
765 } elsif ($self->{c} == 0x005C) { # \
766 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
767 $self->{c} = $self->{get_char}->($self);
768 redo A;
769 } else {
770 $self->{t}->{value} .= chr $self->{c};
771 $self->{state} = URI_UNQUOTED_STATE;
772 $self->{c} = $self->{get_char}->($self);
773 redo A;
774 }
775 } elsif ($self->{state} == URI_UNQUOTED_STATE) {
776 if ({
777 0x0020 => 1, # SP
778 0x0009 => 1, # \t
779 0x000D => 1, # \r
780 0x000A => 1, # \n
781 0x000C => 1, # \f
782 }->{$self->{c}}) {
783 $self->{state} = URI_AFTER_WSP_STATE;
784 $self->{c} = $self->{get_char}->($self);
785 redo A;
786 } elsif ($self->{c} == -1) {
787 $self->{t}->{type} = {
788 URI_TOKEN, URI_INVALID_TOKEN,
789 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
790 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
791 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
792 }->{$self->{t}->{type}};
793 $self->{state} = BEFORE_TOKEN_STATE;
794 $self->{c} = $self->{get_char}->($self);
795 return $self->{t};
796 #redo A;
797 } elsif ($self->{c} < 0x0020 or {
798 0x0022 => 1, # "
799 0x0027 => 1, # '
800 0x0028 => 1, # (
801 }->{$self->{c}}) { # C0 or (
802 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
803 $self->{t}->{type} = {
804 URI_TOKEN, URI_INVALID_TOKEN,
805 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
806 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
807 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
808 }->{$self->{t}->{type}};
809 # stay in the state.
810 $self->{c} = $self->{get_char}->($self);
811 redo A;
812 } elsif ($self->{c} == 0x0029) { # )
813 $self->{state} = BEFORE_TOKEN_STATE;
814 $self->{c} = $self->{get_char}->($self);
815 return $self->{t};
816 #redo A;
817 } elsif ($self->{c} == 0x005C) { # \
818 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
819 $self->{c} = $self->{get_char}->($self);
820 redo A;
821 } else {
822 $self->{t}->{value} .= chr $self->{c};
823 # stay in the state.
824 $self->{c} = $self->{get_char}->($self);
825 redo A;
826 }
827 } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
828 if ({
829 0x0020 => 1, # SP
830 0x0009 => 1, # \t
831 0x000D => 1, # \r
832 0x000A => 1, # \n
833 0x000C => 1, # \f
834 }->{$self->{c}}) {
835 # stay in the state.
836 $self->{c} = $self->{get_char}->($self);
837 redo A;
838 } elsif ($self->{c} == -1) {
839 $self->{t}->{type} = {
840 URI_TOKEN, URI_INVALID_TOKEN,
841 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
842 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
843 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
844 }->{$self->{t}->{type}};
845 $self->{state} = BEFORE_TOKEN_STATE;
846 $self->{c} = $self->{get_char}->($self);
847 return $self->{t};
848 #redo A;
849 } elsif ($self->{c} == 0x0029) { # )
850 $self->{state} = BEFORE_TOKEN_STATE;
851 $self->{c} = $self->{get_char}->($self);
852 return $self->{t};
853 #redo A;
854 } elsif ($self->{c} == 0x005C) { # \
855 $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
856 $self->{c} = $self->{get_char}->($self);
857 redo A;
858 } else {
859 ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
860 $self->{t}->{type} = {
861 URI_TOKEN, URI_INVALID_TOKEN,
862 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
863 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
864 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
865 }->{$self->{t}->{type}};
866 # stay in the state.
867 $self->{c} = $self->{get_char}->($self);
868 redo A;
869 }
870 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
871 $self->{t}->{has_escape} = 1;
872 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
873 ## NOTE: second character of |unicode| in |escape|.
874 $char = $self->{c} - 0x0030;
875 $self->{state} = ESCAPE_STATE; $i = 2;
876 $self->{c} = $self->{get_char}->($self);
877 redo A;
878 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
879 ## NOTE: second character of |unicode| in |escape|.
880 $char = $self->{c} - 0x0041 + 0xA;
881 $self->{state} = ESCAPE_STATE; $i = 2;
882 $self->{c} = $self->{get_char}->($self);
883 redo A;
884 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
885 ## NOTE: second character of |unicode| in |escape|.
886 $char = $self->{c} - 0x0061 + 0xA;
887 $self->{state} = ESCAPE_STATE; $i = 2;
888 $self->{c} = $self->{get_char}->($self);
889 redo A;
890 } elsif ($self->{c} == 0x000A or # \n
891 $self->{c} == 0x000C) { # \f
892 if ($q == 0) {
893 #
894 } elsif ($q == 1) {
895 ## NOTE: In |escape| in |URI|.
896 $self->{t}->{type} = {
897 URI_TOKEN, URI_INVALID_TOKEN,
898 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
899 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
900 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
901 }->{$self->{t}->{type}};
902 $self->{t}->{value} .= chr $self->{c};
903 $self->{state} = URI_UNQUOTED_STATE;
904 $self->{c} = $self->{get_char}->($self);
905 redo A;
906 } else {
907 ## Note: In |nl| in ... in |string| or |ident|.
908 $self->{state} = STRING_STATE;
909 $self->{c} = $self->{get_char}->($self);
910 redo A;
911 }
912 } elsif ($self->{c} == 0x000D) { # \r
913 if ($q == 0) {
914 #
915 } elsif ($q == 1) {
916 ## NOTE: In |escape| in |URI|.
917 $self->{t}->{type} = {
918 URI_TOKEN, URI_INVALID_TOKEN,
919 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
920 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
921 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
922 }->{$self->{t}->{type}};
923 $self->{state} = ESCAPE_BEFORE_LF_STATE;
924 $self->{c} = $self->{get_char}->($self);
925 redo A;
926 } else {
927 ## Note: In |nl| in ... in |string| or |ident|.
928 $self->{state} = ESCAPE_BEFORE_LF_STATE;
929 $self->{c} = $self->{get_char}->($self);
930 redo A;
931 }
932 } elsif ($self->{c} == -1) {
933 #
934 } else {
935 ## NOTE: second character of |escape|.
936 $self->{t}->{value} .= chr $self->{c};
937 $self->{state} = $q == 0 ? NAME_STATE :
938 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
939 $self->{c} = $self->{get_char}->($self);
940 redo A;
941 }
942
943 if ($q == 0) {
944 if ($self->{t}->{type} == DIMENSION_TOKEN) {
945 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
946 $self->{state} = BEFORE_TOKEN_STATE;
947 # reprocess
948 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
949 line => $self->{line},
950 column => $self->{column} - 2};
951 unshift @{$self->{token}}, {type => MINUS_TOKEN,
952 line => $self->{line},
953 column => $self->{column} - 1};
954 ## BUG: line and column might be wrong if they are on the
955 ## line boundary.
956 $self->{t}->{type} = NUMBER_TOKEN;
957 $self->{t}->{value} = '';
958 return $self->{t};
959 #redo A;
960 } elsif (length $self->{t}->{value}) {
961 $self->{state} = BEFORE_TOKEN_STATE;
962 # reprocess
963 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
964 line => $self->{line},
965 column => $self->{column} - 1};
966 ## BUG: line and column might be wrong if they are on the
967 ## line boundary.
968 return $self->{t};
969 #redo A;
970 } else {
971 $self->{state} = BEFORE_TOKEN_STATE;
972 # reprocess
973 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
974 line => $self->{line},
975 column => $self->{column} - 1};
976 ## BUG: line and column might be wrong if they are on the
977 ## line boundary.
978 $self->{t}->{type} = NUMBER_TOKEN;
979 $self->{t}->{value} = '';
980 return $self->{t};
981 #redo A;
982 }
983 } else {
984 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
985 $self->{state} = BEFORE_TOKEN_STATE;
986 # reprocess
987 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
988 line => $self->{line},
989 column => $self->{column} - 2};
990 return {type => MINUS_TOKEN,
991 line => $self->{line},
992 column => $self->{column} - 1};
993 ## BUG: line and column might be wrong if they are on the
994 ## line boundary.
995 #redo A;
996 } elsif (length $self->{t}->{value}) {
997 $self->{state} = BEFORE_TOKEN_STATE;
998 # reprocess
999 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1000 line => $self->{line},
1001 column => $self->{column} - 1};
1002 ## BUG: line and column might be wrong if they are on the
1003 ## line boundary.
1004 return $self->{t};
1005 #redo A;
1006 } else {
1007 $self->{state} = BEFORE_TOKEN_STATE;
1008 # reprocess
1009 return {type => DELIM_TOKEN, value => '\\',
1010 line => $self->{line},
1011 column => $self->{column} - 1};
1012 ## BUG: line and column might be wrong if they are on the
1013 ## line boundary.
1014 #redo A;
1015 }
1016 }
1017 } elsif ($q == 1) {
1018 $self->{state} = URI_UNQUOTED_STATE;
1019 $self->{c} = $self->{get_char}->($self);
1020 redo A;
1021 } else {
1022 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1023 line => $self->{line},
1024 column => $self->{column} - 1};
1025 ## BUG: line and column might be wrong if they are on the
1026 ## line boundary.
1027 $self->{t}->{type} = {
1028 STRING_TOKEN, INVALID_TOKEN,
1029 URI_TOKEN, URI_INVALID_TOKEN,
1030 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1031 }->{$self->{t}->{type}} || $self->{t}->{type};
1032 $self->{state} = BEFORE_TOKEN_STATE;
1033 # reprocess
1034 return $self->{t};
1035 #redo A;
1036 }
1037 } elsif ($self->{state} == ESCAPE_STATE) {
1038 ## NOTE: third..seventh character of |unicode| in |escape|.
1039 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
1040 $char = $char * 0x10 + $self->{c} - 0x0030;
1041 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1042 $self->{c} = $self->{get_char}->($self);
1043 redo A;
1044 } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
1045 $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
1046 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1047 $self->{c} = $self->{get_char}->($self);
1048 redo A;
1049 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
1050 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
1051 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1052 $self->{c} = $self->{get_char}->($self);
1053 redo A;
1054 } elsif ($self->{c} == 0x0020 or # SP
1055 $self->{c} == 0x000A or # \n
1056 $self->{c} == 0x0009 or # \t
1057 $self->{c} == 0x000C) { # \f
1058 $self->{t}->{value} .= chr $char;
1059 $self->{state} = $q == 0 ? NAME_STATE :
1060 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1061 $self->{c} = $self->{get_char}->($self);
1062 redo A;
1063 } elsif ($self->{c} == 0x000D) { # \r
1064 $self->{state} = ESCAPE_BEFORE_LF_STATE;
1065 $self->{c} = $self->{get_char}->($self);
1066 redo A;
1067 } else {
1068 $self->{t}->{value} .= chr $char;
1069 $self->{state} = $q == 0 ? NAME_STATE :
1070 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1071 # reconsume
1072 redo A;
1073 }
1074 } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
1075 ## NOTE: eighth character of |unicode| in |escape|.
1076 if ($self->{c} == 0x0020 or # SP
1077 $self->{c} == 0x000A or # \n
1078 $self->{c} == 0x0009 or # \t
1079 $self->{c} == 0x000C) { # \f
1080 $self->{t}->{value} .= chr $char;
1081 $self->{state} = $q == 0 ? NAME_STATE :
1082 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1083 $self->{c} = $self->{get_char}->($self);
1084 redo A;
1085 } elsif ($self->{c} == 0x000D) { # \r
1086 $self->{state} = ESCAPE_BEFORE_NL_STATE;
1087 $self->{c} = $self->{get_char}->($self);
1088 redo A;
1089 } else {
1090 $self->{t}->{value} .= chr $char;
1091 $self->{state} = $q == 0 ? NAME_STATE :
1092 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1093 # reconsume
1094 redo A;
1095 }
1096 } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1097 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1098 if ($self->{c} == 0x000A) { # \n
1099 $self->{state} = $q == 0 ? NAME_STATE :
1100 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1101 $self->{c} = $self->{get_char}->($self);
1102 redo A;
1103 } else {
1104 $self->{state} = $q == 0 ? NAME_STATE :
1105 $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1106 # reprocess
1107 redo A;
1108 }
1109 } elsif ($self->{state} == STRING_STATE) {
1110 ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1111 ## a character in |invalid$Q| in |invalid| in |INVALID|,
1112 ## where |$Q = $q == 0x0022 ? 1 : 2|.
1113 ## Or, in |URI|.
1114 if ($self->{c} == 0x005C) { # \
1115 $self->{state} = ESCAPE_OPEN_STATE;
1116 $self->{c} = $self->{get_char}->($self);
1117 redo A;
1118 } elsif ($self->{c} == $q) { # " | '
1119 if ($self->{t}->{type} == STRING_TOKEN) {
1120 $self->{state} = BEFORE_TOKEN_STATE;
1121 $self->{c} = $self->{get_char}->($self);
1122 return $self->{t};
1123 #redo A;
1124 } else {
1125 $self->{state} = URI_AFTER_WSP_STATE;
1126 $self->{c} = $self->{get_char}->($self);
1127 redo A;
1128 }
1129 } elsif ($self->{c} == 0x000A or # \n
1130 $self->{c} == 0x000D or # \r
1131 $self->{c} == 0x000C or # \f
1132 $self->{c} == -1) {
1133 $self->{t}->{type} = {
1134 STRING_TOKEN, INVALID_TOKEN,
1135 INVALID_TOKEN, INVALID_TOKEN,
1136 URI_TOKEN, URI_INVALID_TOKEN,
1137 URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1138 URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1139 URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1140 }->{$self->{t}->{type}};
1141 $self->{state} = BEFORE_TOKEN_STATE;
1142 # reconsume
1143 return $self->{t};
1144 #redo A;
1145 } else {
1146 $self->{t}->{value} .= chr $self->{c};
1147 # stay in the state
1148 $self->{c} = $self->{get_char}->($self);
1149 redo A;
1150 }
1151 } elsif ($self->{state} == NUMBER_STATE) {
1152 ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1153 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1154 $self->{t}->{value} .= chr $self->{c};
1155 # stay in the state
1156 $self->{c} = $self->{get_char}->($self);
1157 redo A;
1158 } elsif ($self->{c} == 0x002E) { # .
1159 $self->{state} = NUMBER_DOT_STATE;
1160 $self->{c} = $self->{get_char}->($self);
1161 redo A;
1162 } else {
1163 $self->{t}->{number} = 0+$self->{t}->{value};
1164 $self->{t}->{value} = '';
1165 $self->{state} = AFTER_NUMBER_STATE;
1166 # reprocess
1167 redo A;
1168 }
1169 } elsif ($self->{state} == NUMBER_DOT_STATE) {
1170 ## NOTE: The character immediately following |.| in |num|.
1171 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1172 $self->{t}->{value} .= '.' . chr $self->{c};
1173 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1174 $self->{c} = $self->{get_char}->($self);
1175 redo A;
1176 } else {
1177 unshift @{$self->{token}}, {type => DOT_TOKEN};
1178 $self->{t}->{number} = 0+$self->{t}->{value};
1179 $self->{t}->{value} = '';
1180 $self->{state} = BEFORE_TOKEN_STATE;
1181 # reprocess
1182 return $self->{t};
1183 #redo A;
1184 }
1185 } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1186 ## NOTE: The character immediately following |.| at the beginning of |num|.
1187 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1188 $self->{t}->{value} .= '.' . chr $self->{c};
1189 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1190 $self->{c} = $self->{get_char}->($self);
1191 redo A;
1192 } else {
1193 $self->{state} = BEFORE_TOKEN_STATE;
1194 # reprocess
1195 return {type => DOT_TOKEN,
1196 line => $self->{line}, column => $self->{column} - 1};
1197 ## BUG: line and column might be wrong if they are on the
1198 ## line boundary.
1199 #redo A;
1200 }
1201 } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1202 ## NOTE: |[0-9]| in |num| after |.|.
1203 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1204 $self->{t}->{value} .= chr $self->{c};
1205 # stay in the state
1206 $self->{c} = $self->{get_char}->($self);
1207 redo A;
1208 } else {
1209 $self->{t}->{number} = 0+$self->{t}->{value};
1210 $self->{t}->{value} = '';
1211 $self->{state} = AFTER_NUMBER_STATE;
1212 # reprocess
1213 redo A;
1214 }
1215 } else {
1216 die "$0: Unknown state |$self->{state}|";
1217 }
1218 } # A
1219 } # get_next_token
1220
## Returns a display string for the token |$token|, a hash reference
## whose |type| field selects the output and whose |value| and/or
## |number| fields supply the variable part where one is needed.
## The first argument (the invocant) is ignored.  A token whose type
## matches none of the known token types is rendered as its |type|
## wrapped in braces.
sub serialize_token ($$) {
  my (undef, $token) = @_;
  my $type = $token->{type};

  ## NOTE: This function is not intended for roundtrip-able serialization.

  ## Tokens whose text is built from the |value| or |number| field.
  return $token->{value}                       if $type == IDENT_TOKEN;
  return '@' . $token->{value}                 if $type == ATKEYWORD_TOKEN;
  return '#' . $token->{value}                 if $type == HASH_TOKEN;
  return $token->{value} . '('                 if $type == FUNCTION_TOKEN;
  return 'url(' . $token->{value} . ')'        if $type == URI_TOKEN;
  return 'url(' . $token->{value}              if $type == URI_INVALID_TOKEN;
  return 'url-prefix(' . $token->{value} . ')' if $type == URI_PREFIX_TOKEN;
  return 'url-prefix(' . $token->{value}
      if $type == URI_PREFIX_INVALID_TOKEN;
  return '"' . $token->{value} . '"'           if $type == STRING_TOKEN;
  return '"' . $token->{value}                 if $type == INVALID_TOKEN;
  return $token->{number}                      if $type == NUMBER_TOKEN;
  return $token->{number} . $token->{value}    if $type == DIMENSION_TOKEN;
  return $token->{number} . '%'                if $type == PERCENTAGE_TOKEN;
  return 'U+' . $token->{value}                if $type == UNICODE_RANGE_TOKEN;
  return $token->{value}                       if $type == DELIM_TOKEN;

  ## Tokens with a fixed textual form.  The pairs are tried in order
  ## and compared numerically; the first matching type wins.
  for my $pair (
    [PLUS_TOKEN,            '+'],
    [GREATER_TOKEN,         '>'],
    [COMMA_TOKEN,           ','],
    [TILDE_TOKEN,           '~'],
    [DASHMATCH_TOKEN,       '|='],
    [PREFIXMATCH_TOKEN,     '^='],
    [SUFFIXMATCH_TOKEN,     '$='],
    [SUBSTRINGMATCH_TOKEN,  '*='],
    [INCLUDES_TOKEN,        '~='],
    [SEMICOLON_TOKEN,       ';'],
    [LBRACE_TOKEN,          '{'],
    [RBRACE_TOKEN,          '}'],
    [LPAREN_TOKEN,          '('],
    [RPAREN_TOKEN,          ')'],
    [LBRACKET_TOKEN,        '['],
    [RBRACKET_TOKEN,        ']'],
    [S_TOKEN,               ' '],
    [CDO_TOKEN,             '<!--'],
    [CDC_TOKEN,             '-->'],
    [COMMENT_TOKEN,         '/**/'],
    [COMMENT_INVALID_TOKEN, '/*'],
    [EOF_TOKEN,             '{EOF}'],
    [MINUS_TOKEN,           '-'],
    [STAR_TOKEN,            '*'],
    [VBAR_TOKEN,            '|'],
    [COLON_TOKEN,           ':'],
    [MATCH_TOKEN,           '='],
    [EXCLAMATION_TOKEN,     '!'],
  ) {
    return $pair->[1] if $type == $pair->[0];
  }

  ## No known token type matched: show the raw type for diagnostics.
  return '{' . $type . '}';
} # serialize_token
1317
1318 =head1 LICENSE
1319
1320 Copyright 2007 Wakaba <w@suika.fam.cx>
1321
1322 This library is free software; you can redistribute it
1323 and/or modify it under the same terms as Perl itself.
1324
1325 =cut
1326
1327 1;
1328 # $Date: 2008/01/26 09:30:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24