/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (hide annotations) (download)
Sun Jan 20 06:15:20 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.17: +135 -52 lines
++ whatpm/Whatpm/CSS/ChangeLog	20 Jan 2008 06:15:14 -0000
	* Parser.pm, SelectorsParser.pm: |{href}| parameter added
	to all the onerror invocations.  The |{onerror}| function
	is no longer called with |{line}| and |{column}| parameters.

	* Tokenizer.pm: All tokens are now given |{line}| and |{column}|
	values.

2008-01-20  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3 wakaba 1.18 our $VERSION=do{my @r=(q$Revision: 1.17 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.14 require Exporter;
6     push our @ISA, 'Exporter';
7    
## Tokenizer states.  |init| stores |BEFORE_TOKEN_STATE| in
## |$self->{state}|, and |get_next_token| compares |$self->{state}|
## against these values to select the branch to run.  The numeric values
## are arbitrary but must stay distinct.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};
28 wakaba 1.2
## Token type codes, used as the |type| member of the token hashes that
## |get_next_token| returns.  The value 15 is not assigned to any token
## type.  |@TokenName| lists the matching names at the same indexes, so
## the two must be kept in step.
use constant {
  ## Names, functions, and URIs
  IDENT_TOKEN              => 1,
  ATKEYWORD_TOKEN          => 2,
  HASH_TOKEN               => 3,
  FUNCTION_TOKEN           => 4,
  URI_TOKEN                => 5,
  URI_INVALID_TOKEN        => 6,
  URI_PREFIX_TOKEN         => 7,
  URI_PREFIX_INVALID_TOKEN => 8,

  ## Strings and numbers
  STRING_TOKEN             => 9,
  INVALID_TOKEN            => 10,
  NUMBER_TOKEN             => 11,
  DIMENSION_TOKEN          => 12,
  PERCENTAGE_TOKEN         => 13,
  UNICODE_RANGE_TOKEN      => 14,

  ## Delimiters, combinators, and attribute match operators
  DELIM_TOKEN              => 16,
  PLUS_TOKEN               => 17,
  GREATER_TOKEN            => 18,
  COMMA_TOKEN              => 19,
  TILDE_TOKEN              => 20,
  DASHMATCH_TOKEN          => 21,
  PREFIXMATCH_TOKEN        => 22,
  SUFFIXMATCH_TOKEN        => 23,
  SUBSTRINGMATCH_TOKEN     => 24,
  INCLUDES_TOKEN           => 25,

  ## Punctuation
  SEMICOLON_TOKEN          => 26,
  LBRACE_TOKEN             => 27,
  RBRACE_TOKEN             => 28,
  LPAREN_TOKEN             => 29,
  RPAREN_TOKEN             => 30,
  LBRACKET_TOKEN           => 31,
  RBRACKET_TOKEN           => 32,

  ## White space, SGML comment delimiters, comments, and end of input
  S_TOKEN                  => 33,
  CDO_TOKEN                => 34,
  CDC_TOKEN                => 35,
  COMMENT_TOKEN            => 36,
  COMMENT_INVALID_TOKEN    => 37,
  EOF_TOKEN                => 38,

  ## Single-character tokens
  MINUS_TOKEN              => 39,
  STAR_TOKEN               => 40,
  VBAR_TOKEN               => 41,
  DOT_TOKEN                => 42,
  COLON_TOKEN              => 43,
  MATCH_TOKEN              => 44,
  EXCLAMATION_TOKEN        => 45,
};
73 wakaba 1.2
## Token names indexed by the numeric value of the corresponding
## |*_TOKEN| constant.  Indexes 0 and 15 have no token type and hold the
## placeholder string |0|.
our @TokenName = (
  '0',                                                          # 0
  qw(IDENT ATKEYWORD HASH FUNCTION),                            # 1..4
  qw(URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID),            # 5..8
  qw(STRING INVALID),                                           # 9..10
  qw(NUMBER DIMENSION PERCENTAGE UNICODE_RANGE),                # 11..14
  '0',                                                          # 15
  qw(DELIM PLUS GREATER COMMA TILDE),                           # 16..20
  qw(DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES), # 21..25
  qw(SEMICOLON LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET),  # 26..32
  qw(S CDO CDC COMMENT COMMENT_INVALID EOF),                    # 33..38
  qw(MINUS STAR VBAR DOT COLON MATCH EXCLAMATION),              # 39..45
);
82    
## Names that may be imported on request: every token type constant.
## The tokenizer state constants are not exported.
our @EXPORT_OK = (
  ## Names, functions, and URIs
  qw(IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN),
  qw(URI_TOKEN URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN),

  ## Strings and numbers
  qw(STRING_TOKEN INVALID_TOKEN),
  qw(NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN UNICODE_RANGE_TOKEN),

  ## Delimiters, combinators, and attribute match operators
  qw(DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN TILDE_TOKEN),
  qw(DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN),
  qw(SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN),

  ## Punctuation
  qw(SEMICOLON_TOKEN LBRACE_TOKEN RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN),
  qw(LBRACKET_TOKEN RBRACKET_TOKEN),

  ## White space, SGML comment delimiters, comments, and end of input
  qw(S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN),
  qw(EOF_TOKEN),

  ## Single-character tokens
  qw(MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN),
  qw(EXCLAMATION_TOKEN),
);

## The |:token| tag imports all of the above at once.  The list is
## copied, so later changes to |@EXPORT_OK| do not affect the tag.
our %EXPORT_TAGS = (token => [@EXPORT_OK]);
97    
## Constructor.  Returns a new tokenizer object with an empty queue of
## pending tokens (|{token}|) and a default |{get_char}| callback that
## always returns -1, the value |get_next_token| treats as end of input.
## Callers are expected to replace |{get_char}| before calling |init|.
sub new ($) {
  my $class = shift;
  my %state = (
    token    => [],
    get_char => sub { return -1 },
  );
  return bless \%state, $class;
} # new
102    
## Prepares the tokenizer for a new input: resets the state machine to
## |BEFORE_TOKEN_STATE| and reads the first character into |{c}| by
## calling the |{get_char}| callback with no arguments.
##
## There is no explicit |return|, so the value of the final assignment
## (the first character read) is what the caller receives.
##
## The token under construction lives in |{t}|, shaped like
## |{type => token-type, value => value, number => number}|; it is not
## touched here.
sub init ($) {
  my ($self) = @_;
  $self->{state} = BEFORE_TOKEN_STATE;
  my $read_char = $self->{get_char};
  $self->{c} = $read_char->();
} # init
109    
110     sub get_next_token ($) {
111     my $self = shift;
112     if (@{$self->{token}}) {
113     return shift @{$self->{token}};
114     }
115    
116     my $char;
117     my $num; # |{num}|, if any.
118     my $i; # |$i + 1|th character in |unicode| in |escape|.
119 wakaba 1.3 my $q;
120     ## NOTE:
121     ## 0: in |ident|.
122     ## 1: in |URI| outside of |string|.
123     ## 0x0022: in |string1| or |invalid1|.
124     ## 0x0027: in |string2| or |invalid2|.
125 wakaba 1.1
126     A: {
127     if ($self->{state} == BEFORE_TOKEN_STATE) {
128     if ($self->{c} == 0x002D) { # -
129     ## NOTE: |-| in |ident| in |IDENT|
130 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
131     line => $self->{line}, column => $self->{column}};
132 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
133     $self->{c} = $self->{get_char}->();
134     redo A;
135 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
137     line => $self->{line}, column => $self->{column}};
138 wakaba 1.5 $self->{c} = $self->{get_char}->();
139     if ($self->{c} == 0x002B) { # +
140 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
141 wakaba 1.5 $self->{c} = $self->{get_char}->();
142     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
145     $self->{c} == 0x003F) { # ?
146 wakaba 1.12 $self->{t}->{value} = chr $self->{c};
147 wakaba 1.5 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
148     $self->{c} = $self->{get_char}->();
149     C: for (2..6) {
150     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
151     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
152     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
153     $self->{c} == 0x003F) { # ?
154     $self->{t}->{value} .= chr $self->{c};
155     $self->{c} = $self->{get_char}->();
156     } else {
157     last C;
158     }
159     } # C
160    
161     if ($self->{c} == 0x002D) { # -
162     $self->{c} = $self->{get_char}->();
163     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
164     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
165     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
166     $self->{t}->{value} .= '-' . chr $self->{c};
167     $self->{c} = $self->{get_char}->();
168     C: for (2..6) {
169     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
170     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
171     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
172     $self->{t}->{value} .= chr $self->{c};
173     $self->{c} = $self->{get_char}->();
174     } else {
175     last C;
176     }
177     } # C
178    
179     #
180     } else {
181     my $token = $self->{t};
182 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => '-',
183     line => $self->{line},
184     column => $self->{column}};
185 wakaba 1.5 $self->{state} = BEFORE_NMSTART_STATE;
186     # reprocess
187     return $token;
188     #redo A;
189     }
190     }
191    
192     $self->{state} = BEFORE_TOKEN_STATE;
193     # reprocess
194     return $self->{t};
195     #redo A;
196     } else {
197 wakaba 1.18 unshift @{$self->{token}},
198     {type => PLUS_TOKEN, line => $l, column => $c};
199 wakaba 1.5 $self->{state} = BEFORE_TOKEN_STATE;
200     # reprocess
201     return $self->{t};
202     #redo A;
203     }
204     } else {
205     $self->{state} = NAME_STATE;
206     # reprocess
207     redo A;
208     }
209 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
210     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
211 wakaba 1.1 $self->{c} == 0x005F or # _
212     $self->{c} > 0x007F) { # nonascii
213     ## NOTE: |nmstart| in |ident| in |IDENT|
214 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
215     line => $self->{line}, column => $self->{column}};
216 wakaba 1.1 $self->{state} = NAME_STATE;
217     $self->{c} = $self->{get_char}->();
218     redo A;
219     } elsif ($self->{c} == 0x005C) { # \
220     ## NOTE: |nmstart| in |ident| in |IDENT|
221 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => '',
222     line => $self->{line}, column => $self->{column}};
223 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
224     $self->{c} = $self->{get_char}->();
225     redo A;
226     } elsif ($self->{c} == 0x0040) { # @
227     ## NOTE: |@| in |ATKEYWORD|
228 wakaba 1.18 $self->{t} = {type => ATKEYWORD_TOKEN, value => '',
229     line => $self->{line}, column => $self->{column}};
230 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
231 wakaba 1.1 $self->{c} = $self->{get_char}->();
232     redo A;
233 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
234 wakaba 1.18 $self->{t} = {type => STRING_TOKEN, value => '',
235     line => $self->{line}, column => $self->{column}};
236 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
237 wakaba 1.1 $self->{c} = $self->{get_char}->();
238     redo A;
239     } elsif ($self->{c} == 0x0023) { # #
240     ## NOTE: |#| in |HASH|.
241 wakaba 1.18 $self->{t} = {type => HASH_TOKEN, value => '',
242     line => $self->{line}, column => $self->{column}};
243 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
244     $self->{c} = $self->{get_char}->();
245     redo A;
246     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
247     ## NOTE: |num|.
248 wakaba 1.18 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c},
249     line => $self->{line}, column => $self->{column}};
250 wakaba 1.1 $self->{state} = NUMBER_STATE;
251     $self->{c} = $self->{get_char}->();
252     redo A;
253     } elsif ($self->{c} == 0x002E) { # .
254     ## NOTE: |num|.
255 wakaba 1.18 $self->{t} = {type => NUMBER_TOKEN, value => '0',
256     line => $self->{line}, column => $self->{column}};
257 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
258     $self->{c} = $self->{get_char}->();
259     redo A;
260 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
261 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
262 wakaba 1.4 $self->{c} = $self->{get_char}->();
263     if ($self->{c} == 0x002A) { # *
264     C: {
265     $self->{c} = $self->{get_char}->();
266     if ($self->{c} == 0x002A) { # *
267     D: {
268     $self->{c} = $self->{get_char}->();
269     if ($self->{c} == 0x002F) { # /
270     #
271     } elsif ($self->{c} == 0x002A) { # *
272     redo D;
273     } else {
274     redo C;
275     }
276     } # D
277     } elsif ($self->{c} == -1) {
278     # stay in the state
279     # reprocess
280     return {type => COMMENT_INVALID_TOKEN};
281     #redo A;
282     } else {
283     redo C;
284     }
285     } # C
286    
287     # stay in the state.
288     $self->{c} = $self->{get_char}->();
289     redo A;
290     } else {
291     # stay in the state.
292     # reprocess
293 wakaba 1.18 return {type => DELIM_TOKEN, value => '/', line => $l, column => $c};
294 wakaba 1.4 #redo A;
295     }
296 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
297 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
298 wakaba 1.1 ## NOTE: |CDO|
299     $self->{c} = $self->{get_char}->();
300     if ($self->{c} == 0x0021) { # !
301     $self->{c} = $self->{get_char}->();
302 wakaba 1.9 if ($self->{c} == 0x002D) { # -
303 wakaba 1.1 $self->{c} = $self->{get_char}->();
304 wakaba 1.9 if ($self->{c} == 0x002D) { # -
305 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
306     $self->{c} = $self->{get_char}->();
307 wakaba 1.18 return {type => CDO_TOKEN, line => $l, column => $c};
308 wakaba 1.1 #redo A;
309     } else {
310 wakaba 1.18 unshift @{$self->{token}},
311     {type => EXCLAMATION_TOKEN, line => $l, column => $c + 1};
312 wakaba 1.1 ## NOTE: |-| in |ident| in |IDENT|
313 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => '-',
314     line => $l, column => $c + 2};
315 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
316     #reprocess
317 wakaba 1.18 return {type => DELIM_TOKEN, value => '<',
318     line => $l, column => $c};
319 wakaba 1.1 #redo A;
320     }
321     } else {
322 wakaba 1.18 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN,
323     line => $l, column => $c + 1};
324 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
325     #reprocess
326 wakaba 1.18 return {type => DELIM_TOKEN, value => '<',
327     line => $l, column => $c};
328 wakaba 1.1 #redo A;
329     }
330     } else {
331     $self->{state} = BEFORE_TOKEN_STATE;
332     #reprocess
333 wakaba 1.18 return {type => DELIM_TOKEN, value => '<',
334     line => $l, column => $c};
335 wakaba 1.1 #redo A;
336     }
337 wakaba 1.2 } elsif (my $t = {
338 wakaba 1.13 0x0021 => EXCLAMATION_TOKEN, # !
339     0x002D => MINUS_TOKEN, # -
340     0x002E => DOT_TOKEN, # .
341     0x003A => COLON_TOKEN, # :
342     0x003B => SEMICOLON_TOKEN, # ;
343     0x003D => MATCH_TOKEN, # =
344     0x007B => LBRACE_TOKEN, # {
345     0x007D => RBRACE_TOKEN, # }
346     0x0028 => LPAREN_TOKEN, # (
347     0x0029 => RPAREN_TOKEN, # )
348     0x005B => LBRACKET_TOKEN, # [
349     0x005D => RBRACKET_TOKEN, # ]
350 wakaba 1.1 }->{$self->{c}}) {
351 wakaba 1.17 my ($l, $c) = ($self->{line}, $self->{column});
352 wakaba 1.1 # stay in the state
353 wakaba 1.17 $self->{c} = $self->{get_char}->($self);
354     return {type => $t, line => $l, column => $c};
355 wakaba 1.1 # redo A;
356     } elsif ({
357     0x0020 => 1, # SP
358     0x0009 => 1, # \t
359     0x000D => 1, # \r
360     0x000A => 1, # \n
361     0x000C => 1, # \f
362     }->{$self->{c}}) {
363 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
364 wakaba 1.1 W: {
365     $self->{c} = $self->{get_char}->();
366     if ({
367     0x0020 => 1, # SP
368     0x0009 => 1, # \t
369     0x000D => 1, # \r
370     0x000A => 1, # \n
371     0x000C => 1, # \f
372     }->{$self->{c}}) {
373     redo W;
374     } elsif (my $v = {
375     0x002B => PLUS_TOKEN, # +
376     0x003E => GREATER_TOKEN, # >
377     0x002C => COMMA_TOKEN, # ,
378     0x007E => TILDE_TOKEN, # ~
379     }->{$self->{c}}) {
380 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
381 wakaba 1.1 # stay in the state
382     $self->{c} = $self->{get_char}->();
383 wakaba 1.18 return {type => $v, line => $l, column => $c};
384 wakaba 1.1 #redo A;
385     } else {
386     # stay in the state
387     # reprocess
388 wakaba 1.18 return {type => S_TOKEN, line => $l, column => $c};
389 wakaba 1.1 #redo A;
390     }
391     } # W
392     } elsif (my $v = {
393     0x007C => DASHMATCH_TOKEN, # |
394     0x005E => PREFIXMATCH_TOKEN, # ^
395     0x0024 => SUFFIXMATCH_TOKEN, # $
396     0x002A => SUBSTRINGMATCH_TOKEN, # *
397     }->{$self->{c}}) {
398 wakaba 1.18 my ($line, $column) = ($self->{line}, $self->{column});
399 wakaba 1.2 my $c = $self->{c};
400 wakaba 1.1 $self->{c} = $self->{get_char}->();
401     if ($self->{c} == 0x003D) { # =
402     # stay in the state
403     $self->{c} = $self->{get_char}->();
404 wakaba 1.18 return {type => $v, line => $line, column => $column};
405 wakaba 1.1 #redo A;
406 wakaba 1.13 } elsif ($v = {
407     0x002A => STAR_TOKEN, # *
408     0x007C => VBAR_TOKEN, # |
409     }->{$c}) {
410     # stay in the state.
411     # reprocess
412 wakaba 1.18 return {type => $v, line => $line, column => $column};
413 wakaba 1.13 #redo A;
414 wakaba 1.1 } else {
415     # stay in the state
416     # reprocess
417 wakaba 1.18 return {type => DELIM_TOKEN, value => chr $c,
418     line => $line, column => $column};
419 wakaba 1.1 #redo A;
420     }
421     } elsif ($self->{c} == 0x002B) { # +
422 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
423 wakaba 1.1 # stay in the state
424     $self->{c} = $self->{get_char}->();
425 wakaba 1.18 return {type => PLUS_TOKEN, line => $l, column => $c};
426 wakaba 1.1 #redo A;
427     } elsif ($self->{c} == 0x003E) { # >
428 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
429 wakaba 1.1 # stay in the state
430     $self->{c} = $self->{get_char}->();
431 wakaba 1.18 return {type => GREATER_TOKEN, line => $l, column => $c};
432 wakaba 1.1 #redo A;
433     } elsif ($self->{c} == 0x002C) { # ,
434 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
435 wakaba 1.1 # stay in the state
436     $self->{c} = $self->{get_char}->();
437 wakaba 1.18 return {type => COMMA_TOKEN, line => $l, column => $c};
438 wakaba 1.1 #redo A;
439     } elsif ($self->{c} == 0x007E) { # ~
440 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column});
441 wakaba 1.1 $self->{c} = $self->{get_char}->();
442     if ($self->{c} == 0x003D) { # =
443     # stay in the state
444     $self->{c} = $self->{get_char}->();
445 wakaba 1.18 return {type => INCLUDES_TOKEN, line => $l, column => $c};
446 wakaba 1.1 #redo A;
447     } else {
448     # stay in the state
449     # reprocess
450 wakaba 1.18 return {type => TILDE_TOKEN, line => $l, column => $c};
451 wakaba 1.1 #redo A;
452     }
453     } elsif ($self->{c} == -1) {
454     # stay in the state
455     $self->{c} = $self->{get_char}->();
456 wakaba 1.18 return {type => EOF_TOKEN,
457     line => $self->{line}, column => $self->{column}};
458 wakaba 1.1 #redo A;
459     } else {
460     # stay in the state
461 wakaba 1.18 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c},
462     line => $self->{line}, column => $self->{column}};
463 wakaba 1.1 $self->{c} = $self->{get_char}->();
464 wakaba 1.5 return $self->{t};
465 wakaba 1.1 #redo A;
466     }
467     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
468 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
469     ## |FUNCTION|)
470 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
471     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
472 wakaba 1.1 $self->{c} == 0x005F or # _
473     $self->{c} > 0x007F) { # nonascii
474 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
475     $self->{t}->{type} = DIMENSION_TOKEN
476     if $self->{t}->{type} == NUMBER_TOKEN;
477 wakaba 1.1 $self->{state} = NAME_STATE;
478     $self->{c} = $self->{get_char}->();
479     redo A;
480     } elsif ($self->{c} == 0x005C) { # \
481     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
482     $self->{c} = $self->{get_char}->();
483     redo A;
484 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
485     if ($self->{t}->{type} == IDENT_TOKEN) {
486     $self->{c} = $self->{get_char}->();
487     if ($self->{c} == 0x003E) { # >
488     $self->{state} = BEFORE_TOKEN_STATE;
489     $self->{c} = $self->{get_char}->();
490 wakaba 1.18 return {type => CDC_TOKEN,
491     line => $self->{t}->{line},
492     column => $self->{t}->{column}};
493 wakaba 1.10 #redo A;
494     } else {
495     ## NOTE: |-|, |-|, $self->{c}
496     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
497 wakaba 1.18 $self->{t}->{column}++;
498 wakaba 1.10 # stay in the state
499     # reconsume
500 wakaba 1.18 return {type => MINUS_TOKEN,
501     line => $self->{t}->{line},
502     column => $self->{t}->{column} - 1};
503 wakaba 1.10 #redo A;
504     }
505     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
506 wakaba 1.18 my ($l, $c) = ($self->{line}, $self->{column}); # second '-'
507 wakaba 1.1 $self->{c} = $self->{get_char}->();
508 wakaba 1.10 if ($self->{c} == 0x003E) { # >
509     unshift @{$self->{token}}, {type => CDC_TOKEN};
510     $self->{t}->{type} = NUMBER_TOKEN;
511     $self->{t}->{value} = '';
512     $self->{state} = BEFORE_TOKEN_STATE;
513     $self->{c} = $self->{get_char}->();
514     return $self->{t};
515     #redo A;
516     } else {
517 wakaba 1.18 ## NOTE: NUMBER, |-|, |-|, $self->{c}
518 wakaba 1.10 my $t = $self->{t};
519     $t->{type} = NUMBER_TOKEN;
520     $t->{value} = '';
521 wakaba 1.18 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
522     line => $l, column => $c};
523     unshift @{$self->{token}}, {type => MINUS_TOKEN,
524     line => $l, column => $c - 1};
525 wakaba 1.10 # stay in the state
526     # reconsume
527     return $t;
528     #redo A;
529     }
530 wakaba 1.1 } else {
531 wakaba 1.10 #
532 wakaba 1.1 }
533     } else {
534 wakaba 1.10 #
535     }
536    
537     if ($self->{t}->{type} == DIMENSION_TOKEN) {
538     ## NOTE: |-| after |NUMBER|.
539 wakaba 1.18 unshift @{$self->{token}}, {type => MINUS_TOKEN,
540     line => $self->{line},
541     column => $self->{column} - 1};
542     ## BUG: column might be wrong if on the line boundary.
543 wakaba 1.10 $self->{state} = BEFORE_TOKEN_STATE;
544     # reprocess
545     $self->{t}->{type} = NUMBER_TOKEN;
546     $self->{t}->{value} = '';
547     return $self->{t};
548     } else {
549     ## NOTE: |-| not followed by |nmstart|.
550     $self->{state} = BEFORE_TOKEN_STATE;
551     # reprocess
552 wakaba 1.18 return {type => MINUS_TOKEN,
553     line => $self->{line}, column => $self->{column} - 1};
554     ## BUG: column might be wrong if on the line boundary.
555 wakaba 1.1 }
556 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
557     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
558     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
559     $self->{c} == 0x005F or # _
560     $self->{c} > 0x007F) { # nonascii
561 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
562 wakaba 1.3 $self->{state} = NAME_STATE;
563     $self->{c} = $self->{get_char}->();
564     redo A;
565     } elsif ($self->{c} == 0x002D) { # -
566 wakaba 1.5 $self->{t}->{value} .= '-';
567 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
568     $self->{c} = $self->{get_char}->();
569     redo A;
570     } elsif ($self->{c} == 0x005C) { # \
571     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
572     $self->{c} = $self->{get_char}->();
573     redo A;
574     } else {
575     $self->{state} = BEFORE_TOKEN_STATE;
576     # reprocess
577 wakaba 1.18 return {type => DELIM_TOKEN, value => '@',
578     line => $self->{t}->{line},
579     column => $self->{t}->{column}};
580 wakaba 1.3 }
581     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
582     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
583     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
584     $self->{c} == 0x005F or # _
585     $self->{c} > 0x007F) { # nonascii
586 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
587 wakaba 1.3 $self->{state} = NAME_STATE;
588     $self->{c} = $self->{get_char}->();
589     redo A;
590     } elsif ($self->{c} == 0x002D) { # -
591     $self->{c} = $self->{get_char}->();
592     if ($self->{c} == 0x003E) { # >
593 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
594 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
595     $self->{c} = $self->{get_char}->();
596 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
597 wakaba 1.3 #redo A;
598     } else {
599 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
600 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
601 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
602     # reprocess
603     return {type => DELIM_TOKEN, value => '@'};
604     #redo A;
605     }
606     } elsif ($self->{c} == 0x005C) { # \
607     ## TODO: @-\{nl}
608     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
609     $self->{c} = $self->{get_char}->();
610     redo A;
611     } else {
612 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
613 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
614     # reprocess
615     return {type => DELIM_TOKEN, value => '@'};
616     }
617 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
618     if ($self->{c} == 0x002D) { # -
619     ## NOTE: |-| in |ident|.
620 wakaba 1.10 $self->{t}->{hyphen} = 1;
621 wakaba 1.5 $self->{t}->{value} = '-';
622 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
623 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
624     $self->{c} = $self->{get_char}->();
625     redo A;
626 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
627     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
628 wakaba 1.1 $self->{c} == 0x005F or # _
629     $self->{c} > 0x007F) { # nonascii
630     ## NOTE: |nmstart| in |ident|.
631 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
632     $self->{t}->{type} = DIMENSION_TOKEN;
633 wakaba 1.1 $self->{state} = NAME_STATE;
634     $self->{c} = $self->{get_char}->();
635     redo A;
636     } elsif ($self->{c} == 0x005C) { # \
637     ## NOTE: |nmstart| in |ident| in |IDENT|
638 wakaba 1.5 $self->{t}->{value} = '';
639 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
640 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
641     $self->{c} = $self->{get_char}->();
642     redo A;
643     } elsif ($self->{c} == 0x0025) { # %
644 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
645 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
646     $self->{c} = $self->{get_char}->();
647 wakaba 1.5 return $self->{t};
648 wakaba 1.1 #redo A;
649     } else {
650     $self->{state} = BEFORE_TOKEN_STATE;
651     # reprocess
652 wakaba 1.5 return $self->{t};
653 wakaba 1.1 #redo A;
654     }
655     } elsif ($self->{state} == HASH_OPEN_STATE) {
656     ## NOTE: The first |nmchar| in |name| in |HASH|.
657 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
658     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
659     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
660 wakaba 1.1 $self->{c} == 0x002D or # -
661     $self->{c} == 0x005F or # _
662     $self->{c} > 0x007F) { # nonascii
663 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
664 wakaba 1.1 $self->{state} = NAME_STATE;
665     $self->{c} = $self->{get_char}->();
666     redo A;
667     } elsif ($self->{c} == 0x005C) { # \
668     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
669     $self->{c} = $self->{get_char}->();
670     redo A;
671     } else {
672     $self->{state} = BEFORE_TOKEN_STATE;
673 wakaba 1.9 # reprocess
674 wakaba 1.18 return {type => DELIM_TOKEN, value => '#',
675     line => $self->{t}->{line},
676     column => $self->{t}->{column}};
677 wakaba 1.1 #redo A;
678     }
679     } elsif ($self->{state} == NAME_STATE) {
680     ## NOTE: |nmchar| in (|ident| or |name|).
681 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
682     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
683     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
684 wakaba 1.1 $self->{c} == 0x005F or # _
685     $self->{c} == 0x002D or # -
686     $self->{c} > 0x007F) { # nonascii
687 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
688 wakaba 1.1 # stay in the state
689     $self->{c} = $self->{get_char}->();
690     redo A;
691     } elsif ($self->{c} == 0x005C) { # \
692 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
693 wakaba 1.1 $self->{c} = $self->{get_char}->();
694     redo A;
695     } elsif ($self->{c} == 0x0028 and # (
696 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
697     my $func_name = $self->{t}->{value};
698 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
699     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
700 wakaba 1.5 if ($self->{t}->{has_escape}) {
701 wakaba 1.3 ## TODO: warn
702     }
703 wakaba 1.5 $self->{t}->{type}
704 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
705 wakaba 1.5 $self->{t}->{value} = '';
706 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
707     $self->{c} = $self->{get_char}->();
708     redo A;
709     } else {
710 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
711 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
712     $self->{c} = $self->{get_char}->();
713 wakaba 1.5 return $self->{t};
714 wakaba 1.1 #redo A;
715     }
716     } else {
717     $self->{state} = BEFORE_TOKEN_STATE;
718     # reconsume
719 wakaba 1.5 return $self->{t};
720 wakaba 1.1 #redo A;
721     }
722 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
723     while ({
724     0x0020 => 1, # SP
725     0x0009 => 1, # \t
726     0x000D => 1, # \r
727     0x000A => 1, # \n
728     0x000C => 1, # \f
729     }->{$self->{c}}) {
730     $self->{c} = $self->{get_char}->();
731     }
732     if ($self->{c} == -1) {
733 wakaba 1.5 $self->{t}->{type} = {
734 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
735     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
736     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
737     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
738 wakaba 1.5 }->{$self->{t}->{type}};
739 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
740     $self->{c} = $self->{get_char}->();
741 wakaba 1.5 return $self->{t};
742 wakaba 1.3 #redo A;
743     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
744     ## TODO: Should we consider matches of "(" and ")"?
745 wakaba 1.5 $self->{t}->{type} = {
746 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
747     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
748     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
749     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
750 wakaba 1.5 }->{$self->{t}->{type}};
751 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
752     $self->{c} = $self->{get_char}->();
753     redo A;
754     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
755     $self->{state} = STRING_STATE; $q = $self->{c};
756     $self->{c} = $self->{get_char}->();
757     redo A;
758     } elsif ($self->{c} == 0x0029) { # )
759     $self->{state} = BEFORE_TOKEN_STATE;
760     $self->{c} = $self->{get_char}->();
761 wakaba 1.5 return $self->{t};
762 wakaba 1.3 #redo A;
763     } elsif ($self->{c} == 0x005C) { # \
764     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
765     $self->{c} = $self->{get_char}->();
766     redo A;
767     } else {
768 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
769 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
770     $self->{c} = $self->{get_char}->();
771     redo A;
772     }
773     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
774     if ({
775     0x0020 => 1, # SP
776     0x0009 => 1, # \t
777     0x000D => 1, # \r
778     0x000A => 1, # \n
779     0x000C => 1, # \f
780     }->{$self->{c}}) {
781     $self->{state} = URI_AFTER_WSP_STATE;
782     $self->{c} = $self->{get_char}->();
783     redo A;
784     } elsif ($self->{c} == -1) {
785 wakaba 1.5 $self->{t}->{type} = {
786 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
787     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
788     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
789     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
790 wakaba 1.5 }->{$self->{t}->{type}};
791 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
792     $self->{c} = $self->{get_char}->();
793 wakaba 1.5 return $self->{t};
794 wakaba 1.3 #redo A;
795     } elsif ($self->{c} < 0x0020 or {
796     0x0022 => 1, # "
797     0x0027 => 1, # '
798     0x0028 => 1, # (
799     }->{$self->{c}}) { # C0 or (
800     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
801 wakaba 1.5 $self->{t}->{type} = {
802 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
803     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
804     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
805     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
806 wakaba 1.5 }->{$self->{t}->{type}};
807 wakaba 1.3 # stay in the state.
808     $self->{c} = $self->{get_char}->();
809     redo A;
810     } elsif ($self->{c} == 0x0029) { # )
811     $self->{state} = BEFORE_TOKEN_STATE;
812     $self->{c} = $self->{get_char}->();
813 wakaba 1.5 return $self->{t};
814 wakaba 1.3 #redo A;
815     } elsif ($self->{c} == 0x005C) { # \
816     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
817     $self->{c} = $self->{get_char}->();
818     redo A;
819     } else {
820 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
821 wakaba 1.3 # stay in the state.
822     $self->{c} = $self->{get_char}->();
823     redo A;
824     }
825     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
826     if ({
827     0x0020 => 1, # SP
828     0x0009 => 1, # \t
829     0x000D => 1, # \r
830     0x000A => 1, # \n
831     0x000C => 1, # \f
832     }->{$self->{c}}) {
833     # stay in the state.
834     $self->{c} = $self->{get_char}->();
835     redo A;
836     } elsif ($self->{c} == -1) {
837 wakaba 1.5 $self->{t}->{type} = {
838 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
839     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
840     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
841     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
842 wakaba 1.5 }->{$self->{t}->{type}};
843 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
844     $self->{c} = $self->{get_char}->();
845 wakaba 1.5 return $self->{t};
846 wakaba 1.3 #redo A;
847     } elsif ($self->{c} == 0x0029) { # )
848     $self->{state} = BEFORE_TOKEN_STATE;
849     $self->{c} = $self->{get_char}->();
850 wakaba 1.5 return $self->{t};
851 wakaba 1.3 #redo A;
852     } elsif ($self->{c} == 0x005C) { # \
853     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
854     $self->{c} = $self->{get_char}->();
855     redo A;
856     } else {
857     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
858 wakaba 1.5 $self->{t}->{type} = {
859 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
860     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
861     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
862     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
863 wakaba 1.5 }->{$self->{t}->{type}};
864 wakaba 1.3 # stay in the state.
865     $self->{c} = $self->{get_char}->();
866     redo A;
867     }
868 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
869 wakaba 1.5 $self->{t}->{has_escape} = 1;
870 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
871     ## NOTE: second character of |unicode| in |escape|.
872     $char = $self->{c} - 0x0030;
873     $self->{state} = ESCAPE_STATE; $i = 2;
874     $self->{c} = $self->{get_char}->();
875     redo A;
876     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
877     ## NOTE: second character of |unicode| in |escape|.
878     $char = $self->{c} - 0x0041 + 0xA;
879     $self->{state} = ESCAPE_STATE; $i = 2;
880     $self->{c} = $self->{get_char}->();
881     redo A;
882 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
883 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
884 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
885 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
886     $self->{c} = $self->{get_char}->();
887     redo A;
888     } elsif ($self->{c} == 0x000A or # \n
889     $self->{c} == 0x000C) { # \f
890     if ($q == 0) {
891 wakaba 1.7 #
892 wakaba 1.3 } elsif ($q == 1) {
893     ## NOTE: In |escape| in |URI|.
894 wakaba 1.5 $self->{t}->{type} = {
895 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
896     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
897     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
898     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
899 wakaba 1.5 }->{$self->{t}->{type}};
900     $self->{t}->{value} .= chr $self->{c};
901 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
902     $self->{c} = $self->{get_char}->();
903     redo A;
904 wakaba 1.1 } else {
905     ## Note: In |nl| in ... in |string| or |ident|.
906     $self->{state} = STRING_STATE;
907     $self->{c} = $self->{get_char}->();
908     redo A;
909     }
910     } elsif ($self->{c} == 0x000D) { # \r
911     if ($q == 0) {
912 wakaba 1.7 #
913 wakaba 1.3 } elsif ($q == 1) {
914 wakaba 1.7 ## NOTE: In |escape| in |URI|.
915 wakaba 1.5 $self->{t}->{type} = {
916 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
917     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
918     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
919     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
920 wakaba 1.5 }->{$self->{t}->{type}};
921 wakaba 1.8 $self->{state} = ESCAPE_BEFORE_LF_STATE;
922 wakaba 1.3 $self->{c} = $self->{get_char}->();
923     redo A;
924 wakaba 1.1 } else {
925     ## Note: In |nl| in ... in |string| or |ident|.
926     $self->{state} = ESCAPE_BEFORE_LF_STATE;
927     $self->{c} = $self->{get_char}->();
928     redo A;
929     }
930 wakaba 1.7 } elsif ($self->{c} == -1) {
931     #
932 wakaba 1.1 } else {
933     ## NOTE: second character of |escape|.
934 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
935 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
936     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
937 wakaba 1.1 $self->{c} = $self->{get_char}->();
938     redo A;
939     }
940 wakaba 1.7
941     if ($q == 0) {
942 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
943     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
944     $self->{state} = BEFORE_TOKEN_STATE;
945     # reprocess
946 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
947     line => $self->{line},
948     column => $self->{column} - 2};
949     unshift @{$self->{token}}, {type => MINUS_TOKEN,
950     line => $self->{line},
951     column => $self->{column} - 1};
952     ## BUG: line and column might be wrong if they are on the
953     ## line boundary.
954 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
955     $self->{t}->{value} = '';
956     return $self->{t};
957     #redo A;
958     } elsif (length $self->{t}->{value}) {
959     $self->{state} = BEFORE_TOKEN_STATE;
960     # reprocess
961 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
962     line => $self->{line},
963     column => $self->{column} - 1};
964     ## BUG: line and column might be wrong if they are on the
965     ## line boundary.
966 wakaba 1.10 return $self->{t};
967     #redo A;
968     } else {
969     $self->{state} = BEFORE_TOKEN_STATE;
970     # reprocess
971 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
972     line => $self->{line},
973     column => $self->{column} - 1};
974     ## BUG: line and column might be wrong if they are on the
975     ## line boundary.
976 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
977     $self->{t}->{value} = '';
978     return $self->{t};
979     #redo A;
980     }
981 wakaba 1.7 } else {
982 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
983     $self->{state} = BEFORE_TOKEN_STATE;
984     # reprocess
985 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
986     line => $self->{line},
987     column => $self->{column} - 2};
988     return {type => MINUS_TOKEN,
989     line => $self->{line},
990     column => $self->{column} - 1};
991     ## BUG: line and column might be wrong if they are on the
992     ## line boundary.
993 wakaba 1.10 #redo A;
994     } elsif (length $self->{t}->{value}) {
995     $self->{state} = BEFORE_TOKEN_STATE;
996     # reprocess
997 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
998     line => $self->{line},
999     column => $self->{column} - 1};
1000     ## BUG: line and column might be wrong if they are on the
1001     ## line boundary.
1002 wakaba 1.10 return $self->{t};
1003     #redo A;
1004     } else {
1005     $self->{state} = BEFORE_TOKEN_STATE;
1006     # reprocess
1007 wakaba 1.18 return {type => DELIM_TOKEN, value => '\\',
1008     line => $self->{line},
1009     column => $self->{column} - 1};
1010     ## BUG: line and column might be wrong if they are on the
1011     ## line boundary.
1012 wakaba 1.10 #redo A;
1013     }
1014 wakaba 1.7 }
1015 wakaba 1.8 } elsif ($q == 1) {
1016     $self->{state} = URI_UNQUOTED_STATE;
1017 wakaba 1.7 $self->{c} = $self->{get_char}->();
1018     redo A;
1019 wakaba 1.8 } else {
1020 wakaba 1.18 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1021     line => $self->{line},
1022     column => $self->{column} - 1};
1023     ## BUG: line and column might be wrong if they are on the
1024     ## line boundary.
1025 wakaba 1.8 $self->{t}->{type} = {
1026     STRING_TOKEN, INVALID_TOKEN,
1027     URI_TOKEN, URI_INVALID_TOKEN,
1028     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1029     }->{$self->{t}->{type}} || $self->{t}->{type};
1030     $self->{state} = BEFORE_TOKEN_STATE;
1031     # reprocess
1032     return $self->{t};
1033     #redo A;
1034 wakaba 1.7 }
1035 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
1036     ## NOTE: third..seventh character of |unicode| in |escape|.
1037     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
1038     $char = $char * 0x10 + $self->{c} - 0x0030;
1039     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1040     $self->{c} = $self->{get_char}->();
1041     redo A;
1042     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
1043     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
1044     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1045     $self->{c} = $self->{get_char}->();
1046     redo A;
1047 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
1048 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
1049 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1050     $self->{c} = $self->{get_char}->();
1051     redo A;
1052     } elsif ($self->{c} == 0x0020 or # SP
1053     $self->{c} == 0x000A or # \n
1054     $self->{c} == 0x0009 or # \t
1055     $self->{c} == 0x000C) { # \f
1056 wakaba 1.5 $self->{t}->{value} .= chr $char;
1057 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1058     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1059 wakaba 1.1 $self->{c} = $self->{get_char}->();
1060     redo A;
1061     } elsif ($self->{c} == 0x000D) { # \r
1062     $self->{state} = ESCAPE_BEFORE_LF_STATE;
1063     $self->{c} = $self->{get_char}->();
1064     redo A;
1065     } else {
1066 wakaba 1.5 $self->{t}->{value} .= chr $char;
1067 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1068     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1069 wakaba 1.1 # reconsume
1070     redo A;
1071     }
1072     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
1073     ## NOTE: eighth character of |unicode| in |escape|.
1074     if ($self->{c} == 0x0020 or # SP
1075     $self->{c} == 0x000A or # \n
1076     $self->{c} == 0x0009 or # \t
1077     $self->{c} == 0x000C) { # \f
1078 wakaba 1.5 $self->{t}->{value} .= chr $char;
1079 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1080     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1081 wakaba 1.1 $self->{c} = $self->{get_char}->();
1082     redo A;
1083     } elsif ($self->{c} == 0x000D) { # \r
1084     $self->{state} = ESCAPE_BEFORE_NL_STATE;
1085     $self->{c} = $self->{get_char}->();
1086     redo A;
1087     } else {
1088 wakaba 1.5 $self->{t}->{value} .= chr $char;
1089 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1090     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1091 wakaba 1.1 # reconsume
1092     redo A;
1093     }
1094     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1095 wakaba 1.15 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1096 wakaba 1.1 if ($self->{c} == 0x000A) { # \n
1097 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1098     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1099 wakaba 1.1 $self->{c} = $self->{get_char}->();
1100     redo A;
1101     } else {
1102 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1103     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1104 wakaba 1.8 # reprocess
1105 wakaba 1.1 redo A;
1106     }
1107     } elsif ($self->{state} == STRING_STATE) {
1108     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1109     ## a character in |invalid$Q| in |invalid| in |INVALID|,
1110     ## where |$Q = $q == 0x0022 ? 1 : 2|.
1111 wakaba 1.3 ## Or, in |URI|.
1112 wakaba 1.1 if ($self->{c} == 0x005C) { # \
1113     $self->{state} = ESCAPE_OPEN_STATE;
1114     $self->{c} = $self->{get_char}->();
1115     redo A;
1116     } elsif ($self->{c} == $q) { # " | '
1117 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1118 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1119     $self->{c} = $self->{get_char}->();
1120 wakaba 1.5 return $self->{t};
1121 wakaba 1.3 #redo A;
1122     } else {
1123     $self->{state} = URI_AFTER_WSP_STATE;
1124     $self->{c} = $self->{get_char}->();
1125     redo A;
1126     }
1127 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1128     $self->{c} == 0x000D or # \r
1129     $self->{c} == 0x000C or # \f
1130     $self->{c} == -1) {
1131 wakaba 1.11 $self->{t}->{type} = {
1132     STRING_TOKEN, INVALID_TOKEN,
1133     INVALID_TOKEN, INVALID_TOKEN,
1134     URI_TOKEN, URI_INVALID_TOKEN,
1135     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1136     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1137     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1138     }->{$self->{t}->{type}};
1139 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1140     # reconsume
1141 wakaba 1.5 return $self->{t};
1142 wakaba 1.1 #redo A;
1143     } else {
1144 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1145 wakaba 1.1 # stay in the state
1146     $self->{c} = $self->{get_char}->();
1147     redo A;
1148     }
1149     } elsif ($self->{state} == NUMBER_STATE) {
1150     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1151     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1152 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1153 wakaba 1.1 # stay in the state
1154     $self->{c} = $self->{get_char}->();
1155     redo A;
1156     } elsif ($self->{c} == 0x002E) { # .
1157     $self->{state} = NUMBER_DOT_STATE;
1158     $self->{c} = $self->{get_char}->();
1159     redo A;
1160     } else {
1161 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1162     $self->{t}->{value} = '';
1163 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1164     # reprocess
1165 wakaba 1.2 redo A;
1166 wakaba 1.1 }
1167     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1168     ## NOTE: The character immediately following |.| in |num|.
1169     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1170 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1171 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1172     $self->{c} = $self->{get_char}->();
1173     redo A;
1174     } else {
1175 wakaba 1.13 unshift @{$self->{token}}, {type => DOT_TOKEN};
1176 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1177     $self->{t}->{value} = '';
1178 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1179     # reprocess
1180 wakaba 1.5 return $self->{t};
1181 wakaba 1.1 #redo A;
1182     }
1183     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1184     ## NOTE: The character immediately following |.| at the beginning of |num|.
1185     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1186 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1187 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1188     $self->{c} = $self->{get_char}->();
1189     redo A;
1190     } else {
1191     $self->{state} = BEFORE_TOKEN_STATE;
1192 wakaba 1.9 # reprocess
1193 wakaba 1.18 return {type => DOT_TOKEN,
1194     line => $self->{line}, column => $self->{column} - 1};
1195     ## BUG: line and column might be wrong if they are on the
1196     ## line boundary.
1197 wakaba 1.1 #redo A;
1198     }
1199     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1200     ## NOTE: |[0-9]| in |num| after |.|.
1201     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1202 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1203 wakaba 1.1 # stay in the state
1204     $self->{c} = $self->{get_char}->();
1205     redo A;
1206     } else {
1207 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1208     $self->{t}->{value} = '';
1209 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1210     # reprocess
1211 wakaba 1.2 redo A;
1212 wakaba 1.1 }
1213     } else {
1214     die "$0: Unknown state |$self->{state}|";
1215     }
1216     } # A
1217     } # get_next_token
1218    
sub serialize_token ($$) {
  ## Render the token hash reference given as the second argument as
  ## a short piece of text.  The first argument (the invocant) is
  ## discarded.
  ##
  ## Returns the text for the first entry of the table below whose
  ## type is numerically equal to the token's |type|.  A token whose
  ## |type| matches no entry is rendered as |{type}|.
  ##
  ## NOTE: The result is not meant to be a roundtrip-able
  ## serialization: |value| is emitted as is, without any escaping,
  ## and comment tokens are always rendered as |/**/| or |/*|.
  my (undef, $token) = @_;

  ## Ordered list of |[type, text]| pairs.  |text| is either the
  ## literal string to return, or a code reference that receives the
  ## token and builds the string from its |value| and/or |number|.
  my @rendering = (
    [IDENT_TOKEN, sub { $_[0]->{value} }],
    [ATKEYWORD_TOKEN, sub { '@' . $_[0]->{value} }],
    [HASH_TOKEN, sub { '#' . $_[0]->{value} }],
    [FUNCTION_TOKEN, sub { $_[0]->{value} . '(' }],
    [URI_TOKEN, sub { 'url(' . $_[0]->{value} . ')' }],
    [URI_INVALID_TOKEN, sub { 'url(' . $_[0]->{value} }],
    [URI_PREFIX_TOKEN, sub { 'url-prefix(' . $_[0]->{value} . ')' }],
    [URI_PREFIX_INVALID_TOKEN, sub { 'url-prefix(' . $_[0]->{value} }],
    [STRING_TOKEN, sub { '"' . $_[0]->{value} . '"' }],
    [INVALID_TOKEN, sub { '"' . $_[0]->{value} }],
    [NUMBER_TOKEN, sub { $_[0]->{number} }],
    [DIMENSION_TOKEN, sub { $_[0]->{number} . $_[0]->{value} }],
    [PERCENTAGE_TOKEN, sub { $_[0]->{number} . '%' }],
    [UNICODE_RANGE_TOKEN, sub { 'U+' . $_[0]->{value} }],
    [DELIM_TOKEN, sub { $_[0]->{value} }],
    [PLUS_TOKEN, '+'],
    [GREATER_TOKEN, '>'],
    [COMMA_TOKEN, ','],
    [TILDE_TOKEN, '~'],
    [DASHMATCH_TOKEN, '|='],
    [PREFIXMATCH_TOKEN, '^='],
    [SUFFIXMATCH_TOKEN, '$='],
    [SUBSTRINGMATCH_TOKEN, '*='],
    [INCLUDES_TOKEN, '~='],
    [SEMICOLON_TOKEN, ';'],
    [LBRACE_TOKEN, '{'],
    [RBRACE_TOKEN, '}'],
    [LPAREN_TOKEN, '('],
    [RPAREN_TOKEN, ')'],
    [LBRACKET_TOKEN, '['],
    [RBRACKET_TOKEN, ']'],
    [S_TOKEN, ' '],
    [CDO_TOKEN, '<!--'],
    [CDC_TOKEN, '-->'],
    [COMMENT_TOKEN, '/**/'],
    [COMMENT_INVALID_TOKEN, '/*'],
    [EOF_TOKEN, '{EOF}'],
    [MINUS_TOKEN, '-'],
    [STAR_TOKEN, '*'],
    [VBAR_TOKEN, '|'],
    [COLON_TOKEN, ':'],
    [MATCH_TOKEN, '='],
    [EXCLAMATION_TOKEN, '!'],
  );

  ## The table is scanned in order with a numeric comparison, so the
  ## first matching entry wins.
  foreach my $pair (@rendering) {
    my ($type, $text) = @$pair;
    next unless $token->{type} == $type;
    return ref ($text) ? $text->($token) : $text;
  }

  ## Unknown token type: show the raw type between braces.
  return '{' . $token->{type} . '}';
} # serialize_token
1315    
1316 wakaba 1.16 =head1 LICENSE
1317    
1318     Copyright 2007 Wakaba <w@suika.fam.cx>
1319    
1320     This library is free software; you can redistribute it
1321     and/or modify it under the same terms as Perl itself.
1322    
1323     =cut
1324    
1325 wakaba 1.1 1;
1326 wakaba 1.18 # $Date: 2008/01/20 04:02:25 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24