/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (hide annotations) (download)
Sun Jan 20 04:02:25 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.16: +103 -6 lines
++ whatpm/Whatpm/CSS/ChangeLog	20 Jan 2008 04:02:20 -0000
2008-01-20  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm (parse_char_string): Revise |$tt->{get_char}| callback
	so that it sets |$tt->{line}| and |$tt->{column}| options.  Some
	error handler calling codes are modified for the experimental
	support for more precise reporting of error location.

	* Tokenizer.pm (new): The |onerror| option has been removed, since
	it was never used.
	(get_next_token): Limited and experimental support for token
	emission with the information on the position where it occurs.
	(serialize_token): New function.

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3 wakaba 1.17 our $VERSION=do{my @r=(q$Revision: 1.16 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.1
5 wakaba 1.14 require Exporter;
6     push our @ISA, 'Exporter';
7    
## Tokenizer states.  The current state is kept in |$self->{state}| and
## is compared against these constants by |get_next_token| to choose
## which branch handles the current character.
sub BEFORE_TOKEN_STATE      () {  0 }
sub BEFORE_NMSTART_STATE    () {  1 }
sub NAME_STATE              () {  2 }
sub ESCAPE_OPEN_STATE       () {  3 }
sub STRING_STATE            () {  4 }
sub HASH_OPEN_STATE         () {  5 }

## States for numeric tokens.
sub NUMBER_STATE            () {  6 }
sub NUMBER_FRACTION_STATE   () {  7 }
sub AFTER_NUMBER_STATE      () {  8 }

sub URI_BEFORE_WSP_STATE    () {  9 }

## States for escape sequences.
sub ESCAPE_STATE            () { 10 }
sub ESCAPE_BEFORE_LF_STATE  () { 11 }
sub ESCAPE_BEFORE_NL_STATE  () { 12 }

sub NUMBER_DOT_STATE        () { 13 }
sub NUMBER_DOT_NUMBER_STATE () { 14 }
sub DELIM_STATE             () { 15 }

## States inside an unquoted |url(...)| / |url-prefix(...)| value.
sub URI_UNQUOTED_STATE      () { 16 }
sub URI_AFTER_WSP_STATE     () { 17 }

## States after a |@| character.
sub AFTER_AT_STATE          () { 18 }
sub AFTER_AT_HYPHEN_STATE   () { 19 }
28 wakaba 1.2
## Token types.  Each token returned by |get_next_token| is a hash
## reference whose |type| member holds one of these numbers.  The
## value 15 is not assigned to any token type.

## Names, functions and URIs.
sub IDENT_TOKEN              () {  1 }
sub ATKEYWORD_TOKEN          () {  2 }
sub HASH_TOKEN               () {  3 }
sub FUNCTION_TOKEN           () {  4 }
sub URI_TOKEN                () {  5 }
sub URI_INVALID_TOKEN        () {  6 }
sub URI_PREFIX_TOKEN         () {  7 }
sub URI_PREFIX_INVALID_TOKEN () {  8 }

## Strings and numbers.
sub STRING_TOKEN             () {  9 }
sub INVALID_TOKEN            () { 10 }
sub NUMBER_TOKEN             () { 11 }
sub DIMENSION_TOKEN          () { 12 }
sub PERCENTAGE_TOKEN         () { 13 }
sub UNICODE_RANGE_TOKEN      () { 14 }

## Delimiters, combinators and attribute match operators.
sub DELIM_TOKEN              () { 16 }
sub PLUS_TOKEN               () { 17 }
sub GREATER_TOKEN            () { 18 }
sub COMMA_TOKEN              () { 19 }
sub TILDE_TOKEN              () { 20 }
sub DASHMATCH_TOKEN          () { 21 }
sub PREFIXMATCH_TOKEN        () { 22 }
sub SUFFIXMATCH_TOKEN        () { 23 }
sub SUBSTRINGMATCH_TOKEN     () { 24 }
sub INCLUDES_TOKEN           () { 25 }

## Punctuation.
sub SEMICOLON_TOKEN          () { 26 }
sub LBRACE_TOKEN             () { 27 }
sub RBRACE_TOKEN             () { 28 }
sub LPAREN_TOKEN             () { 29 }
sub RPAREN_TOKEN             () { 30 }
sub LBRACKET_TOKEN           () { 31 }
sub RBRACKET_TOKEN           () { 32 }

## White space, |<!--|, |-->|, comments and end of input.
sub S_TOKEN                  () { 33 }
sub CDO_TOKEN                () { 34 }
sub CDC_TOKEN                () { 35 }
sub COMMENT_TOKEN            () { 36 }
sub COMMENT_INVALID_TOKEN    () { 37 }
sub EOF_TOKEN                () { 38 }

## Single-character tokens.
sub MINUS_TOKEN              () { 39 }
sub STAR_TOKEN               () { 40 }
sub VBAR_TOKEN               () { 41 }
sub DOT_TOKEN                () { 42 }
sub COLON_TOKEN              () { 43 }
sub MATCH_TOKEN              () { 44 }
sub EXCLAMATION_TOKEN        () { 45 }
73 wakaba 1.2
## Token type names, indexed by token type number (the |*_TOKEN|
## constants without the |_TOKEN| suffix).  Indices 0 and 15 do not
## correspond to any token type and hold the placeholder string |0|.
our @TokenName = (
  '0',                                                    #  0 (unused)
  'IDENT', 'ATKEYWORD', 'HASH', 'FUNCTION',               #  1 ..  4
  'URI', 'URI_INVALID', 'URI_PREFIX', 'URI_PREFIX_INVALID', #  5 ..  8
  'STRING', 'INVALID', 'NUMBER', 'DIMENSION',             #  9 .. 12
  'PERCENTAGE', 'UNICODE_RANGE',                          # 13 .. 14
  '0',                                                    # 15 (unused)
  'DELIM', 'PLUS', 'GREATER', 'COMMA', 'TILDE',           # 16 .. 20
  'DASHMATCH', 'PREFIXMATCH', 'SUFFIXMATCH',              # 21 .. 23
  'SUBSTRINGMATCH', 'INCLUDES', 'SEMICOLON',              # 24 .. 26
  'LBRACE', 'RBRACE', 'LPAREN', 'RPAREN',                 # 27 .. 30
  'LBRACKET', 'RBRACKET',                                 # 31 .. 32
  'S', 'CDO', 'CDC', 'COMMENT', 'COMMENT_INVALID', 'EOF', # 33 .. 38
  'MINUS', 'STAR', 'VBAR', 'DOT', 'COLON', 'MATCH',       # 39 .. 44
  'EXCLAMATION',                                          # 45
);
82    
## Names that can be imported on request: every token type constant.
## The list is spelled without the common |_TOKEN| suffix, which is
## appended here.
our @EXPORT_OK = map { $_ . '_TOKEN' } qw(
  IDENT ATKEYWORD HASH FUNCTION
  URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  DELIM PLUS GREATER COMMA TILDE
  DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES
  SEMICOLON LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET
  S CDO CDC COMMENT COMMENT_INVALID EOF
  MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
);

## The |:token| tag selects all of the names above.  The tag holds its
## own copy of the list rather than a reference to |@EXPORT_OK|.
our %EXPORT_TAGS = (token => [ @EXPORT_OK ]);
97    
## Constructor.  Returns a new tokenizer object blessed into the
## invocant class, with
##   token    - an empty queue of tokens; |get_next_token| returns
##              queued tokens before reading further input.
##   get_char - a default input callback that always returns -1, the
##              value the tokenizer treats as end of input.
sub new ($) {
  my $class = shift;
  my %field = (
    token    => [],
    get_char => sub { -1 },
  );
  return bless \%field, $class;
} # new
102    
## Prepares the tokenizer for a new input: sets the state to
## |BEFORE_TOKEN_STATE| and reads the first character into |$self->{c}|
## using whatever |get_char| callback is installed at call time.
## Returns the character that was read.
sub init ($) {
  my $self = shift;
  $self->{state} = BEFORE_TOKEN_STATE;
  ## The tokenizer itself is handed to the callback, as
  ## |get_next_token| already does when it emits the single-character
  ## punctuation tokens, so that a callback which relies on its
  ## argument also works for the very first character.  Callbacks that
  ## ignore their arguments are unaffected.
  $self->{c} = $self->{get_char}->($self);
  #$self->{t} = {type => token-type, value => value, number => number};
} # init
109    
110     sub get_next_token ($) {
111     my $self = shift;
112     if (@{$self->{token}}) {
113     return shift @{$self->{token}};
114     }
115    
116     my $char;
117     my $num; # |{num}|, if any.
118     my $i; # |$i + 1|th character in |unicode| in |escape|.
119 wakaba 1.3 my $q;
120     ## NOTE:
121     ## 0: in |ident|.
122     ## 1: in |URI| outside of |string|.
123     ## 0x0022: in |string1| or |invalid1|.
124     ## 0x0027: in |string2| or |invalid2|.
125 wakaba 1.1
126     A: {
127     if ($self->{state} == BEFORE_TOKEN_STATE) {
128     if ($self->{c} == 0x002D) { # -
129     ## NOTE: |-| in |ident| in |IDENT|
130 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
131 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
132     $self->{c} = $self->{get_char}->();
133     redo A;
134 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
135     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
136     $self->{c} = $self->{get_char}->();
137     if ($self->{c} == 0x002B) { # +
138     $self->{c} = $self->{get_char}->();
139     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
140     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142     $self->{c} == 0x003F) { # ?
143 wakaba 1.12 $self->{t}->{value} = chr $self->{c};
144 wakaba 1.5 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145     $self->{c} = $self->{get_char}->();
146     C: for (2..6) {
147     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
148     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
149     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
150     $self->{c} == 0x003F) { # ?
151     $self->{t}->{value} .= chr $self->{c};
152     $self->{c} = $self->{get_char}->();
153     } else {
154     last C;
155     }
156     } # C
157    
158     if ($self->{c} == 0x002D) { # -
159     $self->{c} = $self->{get_char}->();
160     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
161     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
162     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
163     $self->{t}->{value} .= '-' . chr $self->{c};
164     $self->{c} = $self->{get_char}->();
165     C: for (2..6) {
166     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
167     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
168     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
169     $self->{t}->{value} .= chr $self->{c};
170     $self->{c} = $self->{get_char}->();
171     } else {
172     last C;
173     }
174     } # C
175    
176     #
177     } else {
178     my $token = $self->{t};
179     $self->{t} = {type => IDENT_TOKEN, value => '-'};
180     $self->{state} = BEFORE_NMSTART_STATE;
181     # reprocess
182     return $token;
183     #redo A;
184     }
185     }
186    
187     $self->{state} = BEFORE_TOKEN_STATE;
188     # reprocess
189     return $self->{t};
190     #redo A;
191     } else {
192     unshift @{$self->{token}}, {type => PLUS_TOKEN};
193     $self->{state} = BEFORE_TOKEN_STATE;
194     # reprocess
195     return $self->{t};
196     #redo A;
197     }
198     } else {
199     $self->{state} = NAME_STATE;
200     # reprocess
201     redo A;
202     }
203 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
204     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
205 wakaba 1.1 $self->{c} == 0x005F or # _
206     $self->{c} > 0x007F) { # nonascii
207     ## NOTE: |nmstart| in |ident| in |IDENT|
208 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
209 wakaba 1.1 $self->{state} = NAME_STATE;
210     $self->{c} = $self->{get_char}->();
211     redo A;
212     } elsif ($self->{c} == 0x005C) { # \
213     ## NOTE: |nmstart| in |ident| in |IDENT|
214 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
215 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
216     $self->{c} = $self->{get_char}->();
217     redo A;
218     } elsif ($self->{c} == 0x0040) { # @
219     ## NOTE: |@| in |ATKEYWORD|
220 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
221 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
222 wakaba 1.1 $self->{c} = $self->{get_char}->();
223     redo A;
224 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
225 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
226 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
227 wakaba 1.1 $self->{c} = $self->{get_char}->();
228     redo A;
229     } elsif ($self->{c} == 0x0023) { # #
230     ## NOTE: |#| in |HASH|.
231 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
232 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
233     $self->{c} = $self->{get_char}->();
234     redo A;
235     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
236     ## NOTE: |num|.
237 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
238 wakaba 1.1 $self->{state} = NUMBER_STATE;
239     $self->{c} = $self->{get_char}->();
240     redo A;
241     } elsif ($self->{c} == 0x002E) { # .
242     ## NOTE: |num|.
243 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
244 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
245     $self->{c} = $self->{get_char}->();
246     redo A;
247 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
248     $self->{c} = $self->{get_char}->();
249     if ($self->{c} == 0x002A) { # *
250     C: {
251     $self->{c} = $self->{get_char}->();
252     if ($self->{c} == 0x002A) { # *
253     D: {
254     $self->{c} = $self->{get_char}->();
255     if ($self->{c} == 0x002F) { # /
256     #
257     } elsif ($self->{c} == 0x002A) { # *
258     redo D;
259     } else {
260     redo C;
261     }
262     } # D
263     } elsif ($self->{c} == -1) {
264     # stay in the state
265     # reprocess
266     return {type => COMMENT_INVALID_TOKEN};
267     #redo A;
268     } else {
269     redo C;
270     }
271     } # C
272    
273     # stay in the state.
274     $self->{c} = $self->{get_char}->();
275     redo A;
276     } else {
277     # stay in the state.
278     # reprocess
279 wakaba 1.9 return {type => DELIM_TOKEN, value => '/'};
280 wakaba 1.4 #redo A;
281     }
282 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
283     ## NOTE: |CDO|
284     $self->{c} = $self->{get_char}->();
285     if ($self->{c} == 0x0021) { # !
286     $self->{c} = $self->{get_char}->();
287 wakaba 1.9 if ($self->{c} == 0x002D) { # -
288 wakaba 1.1 $self->{c} = $self->{get_char}->();
289 wakaba 1.9 if ($self->{c} == 0x002D) { # -
290 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
291     $self->{c} = $self->{get_char}->();
292     return {type => CDO_TOKEN};
293     #redo A;
294     } else {
295 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296 wakaba 1.1 ## NOTE: |-| in |ident| in |IDENT|
297 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
298 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
299     #reprocess
300     return {type => DELIM_TOKEN, value => '<'};
301     #redo A;
302     }
303     } else {
304 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
306     #reprocess
307     return {type => DELIM_TOKEN, value => '<'};
308     #redo A;
309     }
310     } else {
311     $self->{state} = BEFORE_TOKEN_STATE;
312     #reprocess
313     return {type => DELIM_TOKEN, value => '<'};
314     #redo A;
315     }
316 wakaba 1.2 } elsif (my $t = {
317 wakaba 1.13 0x0021 => EXCLAMATION_TOKEN, # !
318     0x002D => MINUS_TOKEN, # -
319     0x002E => DOT_TOKEN, # .
320     0x003A => COLON_TOKEN, # :
321     0x003B => SEMICOLON_TOKEN, # ;
322     0x003D => MATCH_TOKEN, # =
323     0x007B => LBRACE_TOKEN, # {
324     0x007D => RBRACE_TOKEN, # }
325     0x0028 => LPAREN_TOKEN, # (
326     0x0029 => RPAREN_TOKEN, # )
327     0x005B => LBRACKET_TOKEN, # [
328     0x005D => RBRACKET_TOKEN, # ]
329 wakaba 1.1 }->{$self->{c}}) {
330 wakaba 1.17 my ($l, $c) = ($self->{line}, $self->{column});
331 wakaba 1.1 # stay in the state
332 wakaba 1.17 $self->{c} = $self->{get_char}->($self);
333     return {type => $t, line => $l, column => $c};
334 wakaba 1.1 # redo A;
335     } elsif ({
336     0x0020 => 1, # SP
337     0x0009 => 1, # \t
338     0x000D => 1, # \r
339     0x000A => 1, # \n
340     0x000C => 1, # \f
341     }->{$self->{c}}) {
342     W: {
343     $self->{c} = $self->{get_char}->();
344     if ({
345     0x0020 => 1, # SP
346     0x0009 => 1, # \t
347     0x000D => 1, # \r
348     0x000A => 1, # \n
349     0x000C => 1, # \f
350     }->{$self->{c}}) {
351     redo W;
352     } elsif (my $v = {
353     0x002B => PLUS_TOKEN, # +
354     0x003E => GREATER_TOKEN, # >
355     0x002C => COMMA_TOKEN, # ,
356     0x007E => TILDE_TOKEN, # ~
357     }->{$self->{c}}) {
358     # stay in the state
359     $self->{c} = $self->{get_char}->();
360     return {type => $v};
361     #redo A;
362     } else {
363     # stay in the state
364     # reprocess
365     return {type => S_TOKEN};
366     #redo A;
367     }
368     } # W
369     } elsif (my $v = {
370     0x007C => DASHMATCH_TOKEN, # |
371     0x005E => PREFIXMATCH_TOKEN, # ^
372     0x0024 => SUFFIXMATCH_TOKEN, # $
373     0x002A => SUBSTRINGMATCH_TOKEN, # *
374     }->{$self->{c}}) {
375 wakaba 1.2 my $c = $self->{c};
376 wakaba 1.1 $self->{c} = $self->{get_char}->();
377     if ($self->{c} == 0x003D) { # =
378     # stay in the state
379     $self->{c} = $self->{get_char}->();
380     return {type => $v};
381     #redo A;
382 wakaba 1.13 } elsif ($v = {
383     0x002A => STAR_TOKEN, # *
384     0x007C => VBAR_TOKEN, # |
385     }->{$c}) {
386     # stay in the state.
387     # reprocess
388     return {type => $v};
389     #redo A;
390 wakaba 1.1 } else {
391     # stay in the state
392     # reprocess
393 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
394 wakaba 1.1 #redo A;
395     }
396     } elsif ($self->{c} == 0x002B) { # +
397     # stay in the state
398     $self->{c} = $self->{get_char}->();
399     return {type => PLUS_TOKEN};
400     #redo A;
401     } elsif ($self->{c} == 0x003E) { # >
402     # stay in the state
403     $self->{c} = $self->{get_char}->();
404     return {type => GREATER_TOKEN};
405     #redo A;
406     } elsif ($self->{c} == 0x002C) { # ,
407     # stay in the state
408     $self->{c} = $self->{get_char}->();
409     return {type => COMMA_TOKEN};
410     #redo A;
411     } elsif ($self->{c} == 0x007E) { # ~
412     $self->{c} = $self->{get_char}->();
413     if ($self->{c} == 0x003D) { # =
414     # stay in the state
415     $self->{c} = $self->{get_char}->();
416     return {type => INCLUDES_TOKEN};
417     #redo A;
418     } else {
419     # stay in the state
420     # reprocess
421     return {type => TILDE_TOKEN};
422     #redo A;
423     }
424     } elsif ($self->{c} == -1) {
425     # stay in the state
426     $self->{c} = $self->{get_char}->();
427     return {type => EOF_TOKEN};
428     #redo A;
429     } else {
430     # stay in the state
431 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432 wakaba 1.1 $self->{c} = $self->{get_char}->();
433 wakaba 1.5 return $self->{t};
434 wakaba 1.1 #redo A;
435     }
436     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
437 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
438     ## |FUNCTION|)
439 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
440     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441 wakaba 1.1 $self->{c} == 0x005F or # _
442     $self->{c} > 0x007F) { # nonascii
443 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
444     $self->{t}->{type} = DIMENSION_TOKEN
445     if $self->{t}->{type} == NUMBER_TOKEN;
446 wakaba 1.1 $self->{state} = NAME_STATE;
447     $self->{c} = $self->{get_char}->();
448     redo A;
449     } elsif ($self->{c} == 0x005C) { # \
450     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451     $self->{c} = $self->{get_char}->();
452     redo A;
453 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
454     if ($self->{t}->{type} == IDENT_TOKEN) {
455     $self->{c} = $self->{get_char}->();
456     if ($self->{c} == 0x003E) { # >
457     $self->{state} = BEFORE_TOKEN_STATE;
458     $self->{c} = $self->{get_char}->();
459     return {type => CDC_TOKEN};
460     #redo A;
461     } else {
462     ## NOTE: |-|, |-|, $self->{c}
463     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464     # stay in the state
465     # reconsume
466 wakaba 1.13 return {type => MINUS_TOKEN};
467 wakaba 1.10 #redo A;
468     }
469     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470 wakaba 1.1 $self->{c} = $self->{get_char}->();
471 wakaba 1.10 if ($self->{c} == 0x003E) { # >
472     unshift @{$self->{token}}, {type => CDC_TOKEN};
473     $self->{t}->{type} = NUMBER_TOKEN;
474     $self->{t}->{value} = '';
475     $self->{state} = BEFORE_TOKEN_STATE;
476     $self->{c} = $self->{get_char}->();
477     return $self->{t};
478     #redo A;
479     } else {
480     ## NOTE: |-|, |-|, $self->{c}
481     my $t = $self->{t};
482     $t->{type} = NUMBER_TOKEN;
483     $t->{value} = '';
484     $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
486 wakaba 1.10 # stay in the state
487     # reconsume
488     return $t;
489     #redo A;
490     }
491 wakaba 1.1 } else {
492 wakaba 1.10 #
493 wakaba 1.1 }
494     } else {
495 wakaba 1.10 #
496     }
497    
498     if ($self->{t}->{type} == DIMENSION_TOKEN) {
499     ## NOTE: |-| after |NUMBER|.
500 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
501 wakaba 1.10 $self->{state} = BEFORE_TOKEN_STATE;
502     # reprocess
503     $self->{t}->{type} = NUMBER_TOKEN;
504     $self->{t}->{value} = '';
505     return $self->{t};
506     } else {
507     ## NOTE: |-| not followed by |nmstart|.
508     $self->{state} = BEFORE_TOKEN_STATE;
509     # reprocess
510 wakaba 1.13 return {type => MINUS_TOKEN};
511 wakaba 1.1 }
512 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
513     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515     $self->{c} == 0x005F or # _
516     $self->{c} > 0x007F) { # nonascii
517 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
518 wakaba 1.3 $self->{state} = NAME_STATE;
519     $self->{c} = $self->{get_char}->();
520     redo A;
521     } elsif ($self->{c} == 0x002D) { # -
522 wakaba 1.5 $self->{t}->{value} .= '-';
523 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
524     $self->{c} = $self->{get_char}->();
525     redo A;
526     } elsif ($self->{c} == 0x005C) { # \
527     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528     $self->{c} = $self->{get_char}->();
529     redo A;
530     } else {
531     $self->{state} = BEFORE_TOKEN_STATE;
532     # reprocess
533     return {type => DELIM_TOKEN, value => '@'};
534     }
535     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
536     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
537     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538     $self->{c} == 0x005F or # _
539     $self->{c} > 0x007F) { # nonascii
540 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
541 wakaba 1.3 $self->{state} = NAME_STATE;
542     $self->{c} = $self->{get_char}->();
543     redo A;
544     } elsif ($self->{c} == 0x002D) { # -
545     $self->{c} = $self->{get_char}->();
546     if ($self->{c} == 0x003E) { # >
547 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
548 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
549     $self->{c} = $self->{get_char}->();
550 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
551 wakaba 1.3 #redo A;
552     } else {
553 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
554 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
555 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
556     # reprocess
557     return {type => DELIM_TOKEN, value => '@'};
558     #redo A;
559     }
560     } elsif ($self->{c} == 0x005C) { # \
561     ## TODO: @-\{nl}
562     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
563     $self->{c} = $self->{get_char}->();
564     redo A;
565     } else {
566 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
567 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
568     # reprocess
569     return {type => DELIM_TOKEN, value => '@'};
570     }
571 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572     if ($self->{c} == 0x002D) { # -
573     ## NOTE: |-| in |ident|.
574 wakaba 1.10 $self->{t}->{hyphen} = 1;
575 wakaba 1.5 $self->{t}->{value} = '-';
576 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
577 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
578     $self->{c} = $self->{get_char}->();
579     redo A;
580 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
581     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
582 wakaba 1.1 $self->{c} == 0x005F or # _
583     $self->{c} > 0x007F) { # nonascii
584     ## NOTE: |nmstart| in |ident|.
585 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
586     $self->{t}->{type} = DIMENSION_TOKEN;
587 wakaba 1.1 $self->{state} = NAME_STATE;
588     $self->{c} = $self->{get_char}->();
589     redo A;
590     } elsif ($self->{c} == 0x005C) { # \
591     ## NOTE: |nmstart| in |ident| in |IDENT|
592 wakaba 1.5 $self->{t}->{value} = '';
593 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
594 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595     $self->{c} = $self->{get_char}->();
596     redo A;
597     } elsif ($self->{c} == 0x0025) { # %
598 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
599 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
600     $self->{c} = $self->{get_char}->();
601 wakaba 1.5 return $self->{t};
602 wakaba 1.1 #redo A;
603     } else {
604     $self->{state} = BEFORE_TOKEN_STATE;
605     # reprocess
606 wakaba 1.5 return $self->{t};
607 wakaba 1.1 #redo A;
608     }
609     } elsif ($self->{state} == HASH_OPEN_STATE) {
610     ## NOTE: The first |nmchar| in |name| in |HASH|.
611 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
612     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
613     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
614 wakaba 1.1 $self->{c} == 0x002D or # -
615     $self->{c} == 0x005F or # _
616     $self->{c} > 0x007F) { # nonascii
617 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
618 wakaba 1.1 $self->{state} = NAME_STATE;
619     $self->{c} = $self->{get_char}->();
620     redo A;
621     } elsif ($self->{c} == 0x005C) { # \
622     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
623     $self->{c} = $self->{get_char}->();
624     redo A;
625     } else {
626     $self->{state} = BEFORE_TOKEN_STATE;
627 wakaba 1.9 # reprocess
628 wakaba 1.1 return {type => DELIM_TOKEN, value => '#'};
629     #redo A;
630     }
631     } elsif ($self->{state} == NAME_STATE) {
632     ## NOTE: |nmchar| in (|ident| or |name|).
633 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
634     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
635     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
636 wakaba 1.1 $self->{c} == 0x005F or # _
637     $self->{c} == 0x002D or # -
638     $self->{c} > 0x007F) { # nonascii
639 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
640 wakaba 1.1 # stay in the state
641     $self->{c} = $self->{get_char}->();
642     redo A;
643     } elsif ($self->{c} == 0x005C) { # \
644 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
645 wakaba 1.1 $self->{c} = $self->{get_char}->();
646     redo A;
647     } elsif ($self->{c} == 0x0028 and # (
648 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
649     my $func_name = $self->{t}->{value};
650 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652 wakaba 1.5 if ($self->{t}->{has_escape}) {
653 wakaba 1.3 ## TODO: warn
654     }
655 wakaba 1.5 $self->{t}->{type}
656 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657 wakaba 1.5 $self->{t}->{value} = '';
658 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
659     $self->{c} = $self->{get_char}->();
660     redo A;
661     } else {
662 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
663 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
664     $self->{c} = $self->{get_char}->();
665 wakaba 1.5 return $self->{t};
666 wakaba 1.1 #redo A;
667     }
668     } else {
669     $self->{state} = BEFORE_TOKEN_STATE;
670     # reconsume
671 wakaba 1.5 return $self->{t};
672 wakaba 1.1 #redo A;
673     }
674 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
675     while ({
676     0x0020 => 1, # SP
677     0x0009 => 1, # \t
678     0x000D => 1, # \r
679     0x000A => 1, # \n
680     0x000C => 1, # \f
681     }->{$self->{c}}) {
682     $self->{c} = $self->{get_char}->();
683     }
684     if ($self->{c} == -1) {
685 wakaba 1.5 $self->{t}->{type} = {
686 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
687     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690 wakaba 1.5 }->{$self->{t}->{type}};
691 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
692     $self->{c} = $self->{get_char}->();
693 wakaba 1.5 return $self->{t};
694 wakaba 1.3 #redo A;
695     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696     ## TODO: Should we consider matches of "(" and ")"?
697 wakaba 1.5 $self->{t}->{type} = {
698 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
699     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702 wakaba 1.5 }->{$self->{t}->{type}};
703 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
704     $self->{c} = $self->{get_char}->();
705     redo A;
706     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
707     $self->{state} = STRING_STATE; $q = $self->{c};
708     $self->{c} = $self->{get_char}->();
709     redo A;
710     } elsif ($self->{c} == 0x0029) { # )
711     $self->{state} = BEFORE_TOKEN_STATE;
712     $self->{c} = $self->{get_char}->();
713 wakaba 1.5 return $self->{t};
714 wakaba 1.3 #redo A;
715     } elsif ($self->{c} == 0x005C) { # \
716     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717     $self->{c} = $self->{get_char}->();
718     redo A;
719     } else {
720 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
721 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
722     $self->{c} = $self->{get_char}->();
723     redo A;
724     }
725     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
726     if ({
727     0x0020 => 1, # SP
728     0x0009 => 1, # \t
729     0x000D => 1, # \r
730     0x000A => 1, # \n
731     0x000C => 1, # \f
732     }->{$self->{c}}) {
733     $self->{state} = URI_AFTER_WSP_STATE;
734     $self->{c} = $self->{get_char}->();
735     redo A;
736     } elsif ($self->{c} == -1) {
737 wakaba 1.5 $self->{t}->{type} = {
738 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
739     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742 wakaba 1.5 }->{$self->{t}->{type}};
743 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
744     $self->{c} = $self->{get_char}->();
745 wakaba 1.5 return $self->{t};
746 wakaba 1.3 #redo A;
747     } elsif ($self->{c} < 0x0020 or {
748     0x0022 => 1, # "
749     0x0027 => 1, # '
750     0x0028 => 1, # (
751     }->{$self->{c}}) { # C0 or (
752     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753 wakaba 1.5 $self->{t}->{type} = {
754 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
755     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758 wakaba 1.5 }->{$self->{t}->{type}};
759 wakaba 1.3 # stay in the state.
760     $self->{c} = $self->{get_char}->();
761     redo A;
762     } elsif ($self->{c} == 0x0029) { # )
763     $self->{state} = BEFORE_TOKEN_STATE;
764     $self->{c} = $self->{get_char}->();
765 wakaba 1.5 return $self->{t};
766 wakaba 1.3 #redo A;
767     } elsif ($self->{c} == 0x005C) { # \
768     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769     $self->{c} = $self->{get_char}->();
770     redo A;
771     } else {
772 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
773 wakaba 1.3 # stay in the state.
774     $self->{c} = $self->{get_char}->();
775     redo A;
776     }
777     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
778     if ({
779     0x0020 => 1, # SP
780     0x0009 => 1, # \t
781     0x000D => 1, # \r
782     0x000A => 1, # \n
783     0x000C => 1, # \f
784     }->{$self->{c}}) {
785     # stay in the state.
786     $self->{c} = $self->{get_char}->();
787     redo A;
788     } elsif ($self->{c} == -1) {
789 wakaba 1.5 $self->{t}->{type} = {
790 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
791     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794 wakaba 1.5 }->{$self->{t}->{type}};
795 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
796     $self->{c} = $self->{get_char}->();
797 wakaba 1.5 return $self->{t};
798 wakaba 1.3 #redo A;
799     } elsif ($self->{c} == 0x0029) { # )
800     $self->{state} = BEFORE_TOKEN_STATE;
801     $self->{c} = $self->{get_char}->();
802 wakaba 1.5 return $self->{t};
803 wakaba 1.3 #redo A;
804     } elsif ($self->{c} == 0x005C) { # \
805     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
806     $self->{c} = $self->{get_char}->();
807     redo A;
808     } else {
809     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810 wakaba 1.5 $self->{t}->{type} = {
811 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
812     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815 wakaba 1.5 }->{$self->{t}->{type}};
816 wakaba 1.3 # stay in the state.
817     $self->{c} = $self->{get_char}->();
818     redo A;
819     }
820 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821 wakaba 1.5 $self->{t}->{has_escape} = 1;
822 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823     ## NOTE: second character of |unicode| in |escape|.
824     $char = $self->{c} - 0x0030;
825     $self->{state} = ESCAPE_STATE; $i = 2;
826     $self->{c} = $self->{get_char}->();
827     redo A;
828     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
829     ## NOTE: second character of |unicode| in |escape|.
830     $char = $self->{c} - 0x0041 + 0xA;
831     $self->{state} = ESCAPE_STATE; $i = 2;
832     $self->{c} = $self->{get_char}->();
833     redo A;
834 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
836 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
837 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
838     $self->{c} = $self->{get_char}->();
839     redo A;
840     } elsif ($self->{c} == 0x000A or # \n
841     $self->{c} == 0x000C) { # \f
842     if ($q == 0) {
843 wakaba 1.7 #
844 wakaba 1.3 } elsif ($q == 1) {
845     ## NOTE: In |escape| in |URI|.
846 wakaba 1.5 $self->{t}->{type} = {
847 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
848     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851 wakaba 1.5 }->{$self->{t}->{type}};
852     $self->{t}->{value} .= chr $self->{c};
853 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
854     $self->{c} = $self->{get_char}->();
855     redo A;
856 wakaba 1.1 } else {
857     ## Note: In |nl| in ... in |string| or |ident|.
858     $self->{state} = STRING_STATE;
859     $self->{c} = $self->{get_char}->();
860     redo A;
861     }
862     } elsif ($self->{c} == 0x000D) { # \r
863     if ($q == 0) {
864 wakaba 1.7 #
865 wakaba 1.3 } elsif ($q == 1) {
866 wakaba 1.7 ## NOTE: In |escape| in |URI|.
867 wakaba 1.5 $self->{t}->{type} = {
868 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
869     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 wakaba 1.5 }->{$self->{t}->{type}};
873 wakaba 1.8 $self->{state} = ESCAPE_BEFORE_LF_STATE;
874 wakaba 1.3 $self->{c} = $self->{get_char}->();
875     redo A;
876 wakaba 1.1 } else {
877     ## Note: In |nl| in ... in |string| or |ident|.
878     $self->{state} = ESCAPE_BEFORE_LF_STATE;
879     $self->{c} = $self->{get_char}->();
880     redo A;
881     }
882 wakaba 1.7 } elsif ($self->{c} == -1) {
883     #
884 wakaba 1.1 } else {
885     ## NOTE: second character of |escape|.
886 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
887 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
888     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889 wakaba 1.1 $self->{c} = $self->{get_char}->();
890     redo A;
891     }
892 wakaba 1.7
893     if ($q == 0) {
894 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
895     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896     $self->{state} = BEFORE_TOKEN_STATE;
897     # reprocess
898     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
900 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
901     $self->{t}->{value} = '';
902     return $self->{t};
903     #redo A;
904     } elsif (length $self->{t}->{value}) {
905     $self->{state} = BEFORE_TOKEN_STATE;
906     # reprocess
907     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908     return $self->{t};
909     #redo A;
910     } else {
911     $self->{state} = BEFORE_TOKEN_STATE;
912     # reprocess
913     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914     $self->{t}->{type} = NUMBER_TOKEN;
915     $self->{t}->{value} = '';
916     return $self->{t};
917     #redo A;
918     }
919 wakaba 1.7 } else {
920 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921     $self->{state} = BEFORE_TOKEN_STATE;
922     # reprocess
923     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924 wakaba 1.13 return {type => MINUS_TOKEN};
925 wakaba 1.10 #redo A;
926     } elsif (length $self->{t}->{value}) {
927     $self->{state} = BEFORE_TOKEN_STATE;
928     # reprocess
929     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930     return $self->{t};
931     #redo A;
932     } else {
933     $self->{state} = BEFORE_TOKEN_STATE;
934     # reprocess
935     return {type => DELIM_TOKEN, value => '\\'};
936     #redo A;
937     }
938 wakaba 1.7 }
939 wakaba 1.8 } elsif ($q == 1) {
940     $self->{state} = URI_UNQUOTED_STATE;
941 wakaba 1.7 $self->{c} = $self->{get_char}->();
942     redo A;
943 wakaba 1.8 } else {
944     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945     $self->{t}->{type} = {
946     STRING_TOKEN, INVALID_TOKEN,
947     URI_TOKEN, URI_INVALID_TOKEN,
948     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949     }->{$self->{t}->{type}} || $self->{t}->{type};
950     $self->{state} = BEFORE_TOKEN_STATE;
951     # reprocess
952     return $self->{t};
953     #redo A;
954 wakaba 1.7 }
955 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
956     ## NOTE: third..seventh character of |unicode| in |escape|.
957     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
958     $char = $char * 0x10 + $self->{c} - 0x0030;
959     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
960     $self->{c} = $self->{get_char}->();
961     redo A;
962     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
963     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
964     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
965     $self->{c} = $self->{get_char}->();
966     redo A;
967 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970     $self->{c} = $self->{get_char}->();
971     redo A;
972     } elsif ($self->{c} == 0x0020 or # SP
973     $self->{c} == 0x000A or # \n
974     $self->{c} == 0x0009 or # \t
975     $self->{c} == 0x000C) { # \f
976 wakaba 1.5 $self->{t}->{value} .= chr $char;
977 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
978     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979 wakaba 1.1 $self->{c} = $self->{get_char}->();
980     redo A;
981     } elsif ($self->{c} == 0x000D) { # \r
982     $self->{state} = ESCAPE_BEFORE_LF_STATE;
983     $self->{c} = $self->{get_char}->();
984     redo A;
985     } else {
986 wakaba 1.5 $self->{t}->{value} .= chr $char;
987 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
988     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989 wakaba 1.1 # reconsume
990     redo A;
991     }
992     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
993     ## NOTE: eightth character of |unicode| in |escape|.
994     if ($self->{c} == 0x0020 or # SP
995     $self->{c} == 0x000A or # \n
996     $self->{c} == 0x0009 or # \t
997     $self->{c} == 0x000C) { # \f
998 wakaba 1.5 $self->{t}->{value} .= chr $char;
999 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1000     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001 wakaba 1.1 $self->{c} = $self->{get_char}->();
1002     redo A;
1003     } elsif ($self->{c} == 0x000D) { # \r
1004     $self->{state} = ESCAPE_BEFORE_NL_STATE;
1005     $self->{c} = $self->{get_char}->();
1006     redo A;
1007     } else {
1008 wakaba 1.5 $self->{t}->{value} .= chr $char;
1009 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1010     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011 wakaba 1.1 # reconsume
1012     redo A;
1013     }
1014     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015 wakaba 1.15 ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016 wakaba 1.1 if ($self->{c} == 0x000A) { # \n
1017 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1018     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019 wakaba 1.1 $self->{c} = $self->{get_char}->();
1020     redo A;
1021     } else {
1022 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1023     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024 wakaba 1.8 # reprocess
1025 wakaba 1.1 redo A;
1026     }
1027     } elsif ($self->{state} == STRING_STATE) {
1028     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1029     ## a character in |invalid$Q| in |invalid| in |INVALID|,
1030     ## where |$Q = $q == 0x0022 ? 1 : 2|.
1031 wakaba 1.3 ## Or, in |URI|.
1032 wakaba 1.1 if ($self->{c} == 0x005C) { # \
1033     $self->{state} = ESCAPE_OPEN_STATE;
1034     $self->{c} = $self->{get_char}->();
1035     redo A;
1036     } elsif ($self->{c} == $q) { # " | '
1037 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1038 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1039     $self->{c} = $self->{get_char}->();
1040 wakaba 1.5 return $self->{t};
1041 wakaba 1.3 #redo A;
1042     } else {
1043     $self->{state} = URI_AFTER_WSP_STATE;
1044     $self->{c} = $self->{get_char}->();
1045     redo A;
1046     }
1047 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1048     $self->{c} == 0x000D or # \r
1049     $self->{c} == 0x000C or # \f
1050     $self->{c} == -1) {
1051 wakaba 1.11 $self->{t}->{type} = {
1052     STRING_TOKEN, INVALID_TOKEN,
1053     INVALID_TOKEN, INVALID_TOKEN,
1054     URI_TOKEN, URI_INVALID_TOKEN,
1055     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058     }->{$self->{t}->{type}};
1059 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1060     # reconsume
1061 wakaba 1.5 return $self->{t};
1062 wakaba 1.1 #redo A;
1063     } else {
1064 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1065 wakaba 1.1 # stay in the state
1066     $self->{c} = $self->{get_char}->();
1067     redo A;
1068     }
1069     } elsif ($self->{state} == NUMBER_STATE) {
1070     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1073 wakaba 1.1 # stay in the state
1074     $self->{c} = $self->{get_char}->();
1075     redo A;
1076     } elsif ($self->{c} == 0x002E) { # .
1077     $self->{state} = NUMBER_DOT_STATE;
1078     $self->{c} = $self->{get_char}->();
1079     redo A;
1080     } else {
1081 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1082     $self->{t}->{value} = '';
1083 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1084     # reprocess
1085 wakaba 1.2 redo A;
1086 wakaba 1.1 }
1087     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088     ## NOTE: The character immediately following |.| in |num|.
1089     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1091 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092     $self->{c} = $self->{get_char}->();
1093     redo A;
1094     } else {
1095 wakaba 1.13 unshift @{$self->{token}}, {type => DOT_TOKEN};
1096 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1097     $self->{t}->{value} = '';
1098 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1099     # reprocess
1100 wakaba 1.5 return $self->{t};
1101 wakaba 1.1 #redo A;
1102     }
1103     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104     ## NOTE: The character immediately following |.| at the beginning of |num|.
1105     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1107 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108     $self->{c} = $self->{get_char}->();
1109     redo A;
1110     } else {
1111     $self->{state} = BEFORE_TOKEN_STATE;
1112 wakaba 1.9 # reprocess
1113 wakaba 1.13 return {type => DOT_TOKEN};
1114 wakaba 1.1 #redo A;
1115     }
1116     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117     ## NOTE: |[0-9]| in |num| after |.|.
1118     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1120 wakaba 1.1 # stay in the state
1121     $self->{c} = $self->{get_char}->();
1122     redo A;
1123     } else {
1124 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1125     $self->{t}->{value} = '';
1126 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1127     # reprocess
1128 wakaba 1.2 redo A;
1129 wakaba 1.1 }
1130     } else {
1131     die "$0: Unknown state |$self->{state}|";
1132     }
1133     } # A
1134     } # get_next_token
1135    
## Returns a human-readable string form of a token.
##
## The first argument (the invocant) is discarded.  The second argument
## is a token hash reference: its |type| member selects the output, and
## its |value| and |number| members supply the text for those token
## types that carry any.  A token whose type matches none of the cases
## below is rendered as its |type| enclosed in braces.
##
## NOTE: The result is not designed to be round-trippable, i.e. feeding
## it back to the tokenizer is not expected to reproduce the token.
sub serialize_token ($$) {
  my (undef, $t) = @_;
  my $type = $t->{type};

  ## Tokens whose text comes from |value| and/or |number|.
  return $t->{value}                      if $type == IDENT_TOKEN;
  return '@' . $t->{value}                if $type == ATKEYWORD_TOKEN;
  return '#' . $t->{value}                if $type == HASH_TOKEN;
  return $t->{value} . '('                if $type == FUNCTION_TOKEN;
  return 'url(' . $t->{value} . ')'       if $type == URI_TOKEN;
  ## Invalid (unterminated) forms are emitted without the closing
  ## delimiter.
  return 'url(' . $t->{value}             if $type == URI_INVALID_TOKEN;
  return 'url-prefix(' . $t->{value} . ')'
      if $type == URI_PREFIX_TOKEN;
  return 'url-prefix(' . $t->{value}
      if $type == URI_PREFIX_INVALID_TOKEN;
  return '"' . $t->{value} . '"'          if $type == STRING_TOKEN;
  return '"' . $t->{value}                if $type == INVALID_TOKEN;
  return $t->{number}                     if $type == NUMBER_TOKEN;
  return $t->{number} . $t->{value}       if $type == DIMENSION_TOKEN;
  return $t->{number} . '%'               if $type == PERCENTAGE_TOKEN;
  return 'U+' . $t->{value}               if $type == UNICODE_RANGE_TOKEN;
  return $t->{value}                      if $type == DELIM_TOKEN;

  ## Tokens with a fixed textual representation.
  return '+'     if $type == PLUS_TOKEN;
  return '>'     if $type == GREATER_TOKEN;
  return ','     if $type == COMMA_TOKEN;
  return '~'     if $type == TILDE_TOKEN;
  return '|='    if $type == DASHMATCH_TOKEN;
  return '^='    if $type == PREFIXMATCH_TOKEN;
  return '$='    if $type == SUFFIXMATCH_TOKEN;
  return '*='    if $type == SUBSTRINGMATCH_TOKEN;
  return '~='    if $type == INCLUDES_TOKEN;
  return ';'     if $type == SEMICOLON_TOKEN;
  return '{'     if $type == LBRACE_TOKEN;
  return '}'     if $type == RBRACE_TOKEN;
  return '('     if $type == LPAREN_TOKEN;
  return ')'     if $type == RPAREN_TOKEN;
  return '['     if $type == LBRACKET_TOKEN;
  return ']'     if $type == RBRACKET_TOKEN;
  ## Any white space token is emitted as a single space.
  return ' '     if $type == S_TOKEN;
  return '<!--'  if $type == CDO_TOKEN;
  return '-->'   if $type == CDC_TOKEN;
  ## Comments are emitted as fixed markers; no comment text is output.
  return '/**/'  if $type == COMMENT_TOKEN;
  return '/*'    if $type == COMMENT_INVALID_TOKEN;
  return '{EOF}' if $type == EOF_TOKEN;
  return '-'     if $type == MINUS_TOKEN;
  return '*'     if $type == STAR_TOKEN;
  return '|'     if $type == VBAR_TOKEN;
  return ':'     if $type == COLON_TOKEN;
  return '='     if $type == MATCH_TOKEN;
  return '!'     if $type == EXCLAMATION_TOKEN;

  ## No case matched: expose the raw type so that it is visible in the
  ## output.
  return '{' . $type . '}';
} # serialize_token
1232    
1233 wakaba 1.16 =head1 LICENSE
1234    
1235     Copyright 2007 Wakaba <w@suika.fam.cx>
1236    
1237     This library is free software; you can redistribute it
1238     and/or modify it under the same terms as Perl itself.
1239    
1240     =cut
1241    
1242 wakaba 1.1 1;
1243 wakaba 1.17 # $Date: 2007/10/17 10:46:26 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24