/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.14 - (hide annotations) (download)
Sat Sep 22 12:16:33 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.13: +19 -1 lines
++ whatpm/Whatpm/CSS/ChangeLog	22 Sep 2007 12:16:08 -0000
2007-09-22  Wakaba  <wakaba@suika.fam.cx>

	* SelectorsParser.pm, SelectorsSerializer.pm: New Perl modules.

	* Tokenizer.pm: Token type constants are exportable.

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
4 wakaba 1.14 require Exporter;
5     push our @ISA, 'Exporter';
6    
## Tokenizer states.
##
## Each name is a compile-time constant holding the integer that is
## stored in |$self->{state}| and compared against in
## |get_next_token|.  Values are contiguous from 0 to 19.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};
27 wakaba 1.2
## Token types.
##
## Each name is a compile-time constant used as the |type| member of
## the token hashes returned by |get_next_token|.
##
## NOTE: The values 0 and 15 are not assigned to any token type.
use constant {
  IDENT_TOKEN              => 1,
  ATKEYWORD_TOKEN          => 2,
  HASH_TOKEN               => 3,
  FUNCTION_TOKEN           => 4,
  URI_TOKEN                => 5,
  URI_INVALID_TOKEN        => 6,
  URI_PREFIX_TOKEN         => 7,
  URI_PREFIX_INVALID_TOKEN => 8,
  STRING_TOKEN             => 9,
  INVALID_TOKEN            => 10,
  NUMBER_TOKEN             => 11,
  DIMENSION_TOKEN          => 12,
  PERCENTAGE_TOKEN         => 13,
  UNICODE_RANGE_TOKEN      => 14,
  DELIM_TOKEN              => 16,
  PLUS_TOKEN               => 17,
  GREATER_TOKEN            => 18,
  COMMA_TOKEN              => 19,
  TILDE_TOKEN              => 20,
  DASHMATCH_TOKEN          => 21,
  PREFIXMATCH_TOKEN        => 22,
  SUFFIXMATCH_TOKEN        => 23,
  SUBSTRINGMATCH_TOKEN     => 24,
  INCLUDES_TOKEN           => 25,
  SEMICOLON_TOKEN          => 26,
  LBRACE_TOKEN             => 27,
  RBRACE_TOKEN             => 28,
  LPAREN_TOKEN             => 29,
  RPAREN_TOKEN             => 30,
  LBRACKET_TOKEN           => 31,
  RBRACKET_TOKEN           => 32,
  S_TOKEN                  => 33,
  CDO_TOKEN                => 34,
  CDC_TOKEN                => 35,
  COMMENT_TOKEN            => 36,
  COMMENT_INVALID_TOKEN    => 37,
  EOF_TOKEN                => 38,
  MINUS_TOKEN              => 39,
  STAR_TOKEN               => 40,
  VBAR_TOKEN               => 41,
  DOT_TOKEN                => 42,
  COLON_TOKEN              => 43,
  MATCH_TOKEN              => 44,
  EXCLAMATION_TOKEN        => 45,
};
72 wakaba 1.2
## Token type number -> short token name (the constant name without
## its |_TOKEN| suffix).  The array is indexed by the token type
## value, so |$TokenName[DELIM_TOKEN]| is |DELIM|.
##
## NOTE: Indices 0 and 15 have no token type; both slots hold the
## string |0| as a placeholder.
our @TokenName = (
  '0',                                    # 0: placeholder
  qw(IDENT ATKEYWORD HASH FUNCTION),      # 1..4
  qw(URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID), # 5..8
  qw(STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE), # 9..14
  '0',                                    # 15: placeholder
  qw(DELIM PLUS GREATER COMMA TILDE),     # 16..20
  qw(DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES), # 21..25
  qw(SEMICOLON LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET), # 26..32
  qw(S CDO CDC COMMENT COMMENT_INVALID EOF), # 33..38
  qw(MINUS STAR VBAR DOT COLON MATCH EXCLAMATION), # 39..45
);
81    
## Names that may be imported on request: every token type constant.
## The list is spelled here without the common |_TOKEN| suffix, which
## is appended to each entry when the array is built.
our @EXPORT_OK = map { $_ . '_TOKEN' } qw(
  IDENT ATKEYWORD HASH FUNCTION
  URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  DELIM PLUS GREATER COMMA TILDE
  DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES
  SEMICOLON LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET
  S CDO CDC COMMENT COMMENT_INVALID EOF
  MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
);

## The |:token| tag imports all token type constants at once.  It
## holds a copy of the list, not a reference to |@EXPORT_OK| itself.
our %EXPORT_TAGS = (token => [@EXPORT_OK]);
96    
## Creates a new tokenizer object blessed into the invocant class.
##
## The object starts with the following members:
##   |token|    - empty array; |get_next_token| returns queued tokens
##                from it before reading any further input.
##   |get_char| - code reference that supplies the next character as
##                a number; the default always returns |-1|.
##   |onerror|  - code reference; the default does nothing.
##
## Returns the new object.
sub new ($) {
  my $class = shift;
  my %member = (
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  );
  return bless \%member, $class;
} # new
102    
## Prepares the tokenizer for reading input: sets the state to
## |BEFORE_TOKEN_STATE| and reads the first character from the
## |get_char| callback into |$self->{c}|.
##
## Returns the character that has just been read.
##
## NOTE: |$self->{t}|, the token being built, is not set here.  Its
## shape is:
##   {type => token-type, value => value, number => number}
sub init ($) {
  my ($self) = @_;
  $self->{state} = BEFORE_TOKEN_STATE;
  return $self->{c} = $self->{get_char}->();
} # init
109    
110     sub get_next_token ($) {
111     my $self = shift;
112     if (@{$self->{token}}) {
113     return shift @{$self->{token}};
114     }
115    
116     my $char;
117     my $num; # |{num}|, if any.
118     my $i; # |$i + 1|th character in |unicode| in |escape|.
119 wakaba 1.3 my $q;
120     ## NOTE:
121     ## 0: in |ident|.
122     ## 1: in |URI| outside of |string|.
123     ## 0x0022: in |string1| or |invalid1|.
124     ## 0x0027: in |string2| or |invalid2|.
125 wakaba 1.1
126     A: {
127     if ($self->{state} == BEFORE_TOKEN_STATE) {
128     if ($self->{c} == 0x002D) { # -
129     ## NOTE: |-| in |ident| in |IDENT|
130 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
131 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
132     $self->{c} = $self->{get_char}->();
133     redo A;
134 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
135     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
136     $self->{c} = $self->{get_char}->();
137     if ($self->{c} == 0x002B) { # +
138     $self->{c} = $self->{get_char}->();
139     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
140     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142     $self->{c} == 0x003F) { # ?
143 wakaba 1.12 $self->{t}->{value} = chr $self->{c};
144 wakaba 1.5 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145     $self->{c} = $self->{get_char}->();
146     C: for (2..6) {
147     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
148     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
149     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
150     $self->{c} == 0x003F) { # ?
151     $self->{t}->{value} .= chr $self->{c};
152     $self->{c} = $self->{get_char}->();
153     } else {
154     last C;
155     }
156     } # C
157    
158     if ($self->{c} == 0x002D) { # -
159     $self->{c} = $self->{get_char}->();
160     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
161     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
162     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
163     $self->{t}->{value} .= '-' . chr $self->{c};
164     $self->{c} = $self->{get_char}->();
165     C: for (2..6) {
166     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
167     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
168     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
169     $self->{t}->{value} .= chr $self->{c};
170     $self->{c} = $self->{get_char}->();
171     } else {
172     last C;
173     }
174     } # C
175    
176     #
177     } else {
178     my $token = $self->{t};
179     $self->{t} = {type => IDENT_TOKEN, value => '-'};
180     $self->{state} = BEFORE_NMSTART_STATE;
181     # reprocess
182     return $token;
183     #redo A;
184     }
185     }
186    
187     $self->{state} = BEFORE_TOKEN_STATE;
188     # reprocess
189     return $self->{t};
190     #redo A;
191     } else {
192     unshift @{$self->{token}}, {type => PLUS_TOKEN};
193     $self->{state} = BEFORE_TOKEN_STATE;
194     # reprocess
195     return $self->{t};
196     #redo A;
197     }
198     } else {
199     $self->{state} = NAME_STATE;
200     # reprocess
201     redo A;
202     }
203 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
204     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
205 wakaba 1.1 $self->{c} == 0x005F or # _
206     $self->{c} > 0x007F) { # nonascii
207     ## NOTE: |nmstart| in |ident| in |IDENT|
208 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
209 wakaba 1.1 $self->{state} = NAME_STATE;
210     $self->{c} = $self->{get_char}->();
211     redo A;
212     } elsif ($self->{c} == 0x005C) { # \
213     ## NOTE: |nmstart| in |ident| in |IDENT|
214 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
215 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
216     $self->{c} = $self->{get_char}->();
217     redo A;
218     } elsif ($self->{c} == 0x0040) { # @
219     ## NOTE: |@| in |ATKEYWORD|
220 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
221 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
222 wakaba 1.1 $self->{c} = $self->{get_char}->();
223     redo A;
224 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
225 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
226 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
227 wakaba 1.1 $self->{c} = $self->{get_char}->();
228     redo A;
229     } elsif ($self->{c} == 0x0023) { # #
230     ## NOTE: |#| in |HASH|.
231 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
232 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
233     $self->{c} = $self->{get_char}->();
234     redo A;
235     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
236     ## NOTE: |num|.
237 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
238 wakaba 1.1 $self->{state} = NUMBER_STATE;
239     $self->{c} = $self->{get_char}->();
240     redo A;
241     } elsif ($self->{c} == 0x002E) { # .
242     ## NOTE: |num|.
243 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
244 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
245     $self->{c} = $self->{get_char}->();
246     redo A;
247 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
248     $self->{c} = $self->{get_char}->();
249     if ($self->{c} == 0x002A) { # *
250     C: {
251     $self->{c} = $self->{get_char}->();
252     if ($self->{c} == 0x002A) { # *
253     D: {
254     $self->{c} = $self->{get_char}->();
255     if ($self->{c} == 0x002F) { # /
256     #
257     } elsif ($self->{c} == 0x002A) { # *
258     redo D;
259     } else {
260     redo C;
261     }
262     } # D
263     } elsif ($self->{c} == -1) {
264     # stay in the state
265     # reprocess
266     return {type => COMMENT_INVALID_TOKEN};
267     #redo A;
268     } else {
269     redo C;
270     }
271     } # C
272    
273     # stay in the state.
274     $self->{c} = $self->{get_char}->();
275     redo A;
276     } else {
277     # stay in the state.
278     # reprocess
279 wakaba 1.9 return {type => DELIM_TOKEN, value => '/'};
280 wakaba 1.4 #redo A;
281     }
282 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
283     ## NOTE: |CDO|
284     $self->{c} = $self->{get_char}->();
285     if ($self->{c} == 0x0021) { # !
286     $self->{c} = $self->{get_char}->();
287 wakaba 1.9 if ($self->{c} == 0x002D) { # -
288 wakaba 1.1 $self->{c} = $self->{get_char}->();
289 wakaba 1.9 if ($self->{c} == 0x002D) { # -
290 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
291     $self->{c} = $self->{get_char}->();
292     return {type => CDO_TOKEN};
293     #redo A;
294     } else {
295 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296 wakaba 1.1 ## NOTE: |-| in |ident| in |IDENT|
297 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
298 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
299     #reprocess
300     return {type => DELIM_TOKEN, value => '<'};
301     #redo A;
302     }
303     } else {
304 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
306     #reprocess
307     return {type => DELIM_TOKEN, value => '<'};
308     #redo A;
309     }
310     } else {
311     $self->{state} = BEFORE_TOKEN_STATE;
312     #reprocess
313     return {type => DELIM_TOKEN, value => '<'};
314     #redo A;
315     }
316 wakaba 1.2 } elsif (my $t = {
317 wakaba 1.13 0x0021 => EXCLAMATION_TOKEN, # !
318     0x002D => MINUS_TOKEN, # -
319     0x002E => DOT_TOKEN, # .
320     0x003A => COLON_TOKEN, # :
321     0x003B => SEMICOLON_TOKEN, # ;
322     0x003D => MATCH_TOKEN, # =
323     0x007B => LBRACE_TOKEN, # {
324     0x007D => RBRACE_TOKEN, # }
325     0x0028 => LPAREN_TOKEN, # (
326     0x0029 => RPAREN_TOKEN, # )
327     0x005B => LBRACKET_TOKEN, # [
328     0x005D => RBRACKET_TOKEN, # ]
329 wakaba 1.1 }->{$self->{c}}) {
330     # stay in the state
331     $self->{c} = $self->{get_char}->();
332 wakaba 1.2 return {type => $t};
333 wakaba 1.1 # redo A;
334     } elsif ({
335     0x0020 => 1, # SP
336     0x0009 => 1, # \t
337     0x000D => 1, # \r
338     0x000A => 1, # \n
339     0x000C => 1, # \f
340     }->{$self->{c}}) {
341     W: {
342     $self->{c} = $self->{get_char}->();
343     if ({
344     0x0020 => 1, # SP
345     0x0009 => 1, # \t
346     0x000D => 1, # \r
347     0x000A => 1, # \n
348     0x000C => 1, # \f
349     }->{$self->{c}}) {
350     redo W;
351     } elsif (my $v = {
352     0x002B => PLUS_TOKEN, # +
353     0x003E => GREATER_TOKEN, # >
354     0x002C => COMMA_TOKEN, # ,
355     0x007E => TILDE_TOKEN, # ~
356     }->{$self->{c}}) {
357     # stay in the state
358     $self->{c} = $self->{get_char}->();
359     return {type => $v};
360     #redo A;
361     } else {
362     # stay in the state
363     # reprocess
364     return {type => S_TOKEN};
365     #redo A;
366     }
367     } # W
368     } elsif (my $v = {
369     0x007C => DASHMATCH_TOKEN, # |
370     0x005E => PREFIXMATCH_TOKEN, # ^
371     0x0024 => SUFFIXMATCH_TOKEN, # $
372     0x002A => SUBSTRINGMATCH_TOKEN, # *
373     }->{$self->{c}}) {
374 wakaba 1.2 my $c = $self->{c};
375 wakaba 1.1 $self->{c} = $self->{get_char}->();
376     if ($self->{c} == 0x003D) { # =
377     # stay in the state
378     $self->{c} = $self->{get_char}->();
379     return {type => $v};
380     #redo A;
381 wakaba 1.13 } elsif ($v = {
382     0x002A => STAR_TOKEN, # *
383     0x007C => VBAR_TOKEN, # |
384     }->{$c}) {
385     # stay in the state.
386     # reprocess
387     return {type => $v};
388     #redo A;
389 wakaba 1.1 } else {
390     # stay in the state
391     # reprocess
392 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
393 wakaba 1.1 #redo A;
394     }
395     } elsif ($self->{c} == 0x002B) { # +
396     # stay in the state
397     $self->{c} = $self->{get_char}->();
398     return {type => PLUS_TOKEN};
399     #redo A;
400     } elsif ($self->{c} == 0x003E) { # >
401     # stay in the state
402     $self->{c} = $self->{get_char}->();
403     return {type => GREATER_TOKEN};
404     #redo A;
405     } elsif ($self->{c} == 0x002C) { # ,
406     # stay in the state
407     $self->{c} = $self->{get_char}->();
408     return {type => COMMA_TOKEN};
409     #redo A;
410     } elsif ($self->{c} == 0x007E) { # ~
411     $self->{c} = $self->{get_char}->();
412     if ($self->{c} == 0x003D) { # =
413     # stay in the state
414     $self->{c} = $self->{get_char}->();
415     return {type => INCLUDES_TOKEN};
416     #redo A;
417     } else {
418     # stay in the state
419     # reprocess
420     return {type => TILDE_TOKEN};
421     #redo A;
422     }
423     } elsif ($self->{c} == -1) {
424     # stay in the state
425     $self->{c} = $self->{get_char}->();
426     return {type => EOF_TOKEN};
427     #redo A;
428     } else {
429     # stay in the state
430 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
431 wakaba 1.1 $self->{c} = $self->{get_char}->();
432 wakaba 1.5 return $self->{t};
433 wakaba 1.1 #redo A;
434     }
435     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
436 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
437     ## |FUNCTION|)
438 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
439     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
440 wakaba 1.1 $self->{c} == 0x005F or # _
441     $self->{c} > 0x007F) { # nonascii
442 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
443     $self->{t}->{type} = DIMENSION_TOKEN
444     if $self->{t}->{type} == NUMBER_TOKEN;
445 wakaba 1.1 $self->{state} = NAME_STATE;
446     $self->{c} = $self->{get_char}->();
447     redo A;
448     } elsif ($self->{c} == 0x005C) { # \
449     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
450     $self->{c} = $self->{get_char}->();
451     redo A;
452 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
453     if ($self->{t}->{type} == IDENT_TOKEN) {
454     $self->{c} = $self->{get_char}->();
455     if ($self->{c} == 0x003E) { # >
456     $self->{state} = BEFORE_TOKEN_STATE;
457     $self->{c} = $self->{get_char}->();
458     return {type => CDC_TOKEN};
459     #redo A;
460     } else {
461     ## NOTE: |-|, |-|, $self->{c}
462     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
463     # stay in the state
464     # reconsume
465 wakaba 1.13 return {type => MINUS_TOKEN};
466 wakaba 1.10 #redo A;
467     }
468     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
469 wakaba 1.1 $self->{c} = $self->{get_char}->();
470 wakaba 1.10 if ($self->{c} == 0x003E) { # >
471     unshift @{$self->{token}}, {type => CDC_TOKEN};
472     $self->{t}->{type} = NUMBER_TOKEN;
473     $self->{t}->{value} = '';
474     $self->{state} = BEFORE_TOKEN_STATE;
475     $self->{c} = $self->{get_char}->();
476     return $self->{t};
477     #redo A;
478     } else {
479     ## NOTE: |-|, |-|, $self->{c}
480     my $t = $self->{t};
481     $t->{type} = NUMBER_TOKEN;
482     $t->{value} = '';
483     $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
484 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
485 wakaba 1.10 # stay in the state
486     # reconsume
487     return $t;
488     #redo A;
489     }
490 wakaba 1.1 } else {
491 wakaba 1.10 #
492 wakaba 1.1 }
493     } else {
494 wakaba 1.10 #
495     }
496    
497     if ($self->{t}->{type} == DIMENSION_TOKEN) {
498     ## NOTE: |-| after |NUMBER|.
499 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
500 wakaba 1.10 $self->{state} = BEFORE_TOKEN_STATE;
501     # reprocess
502     $self->{t}->{type} = NUMBER_TOKEN;
503     $self->{t}->{value} = '';
504     return $self->{t};
505     } else {
506     ## NOTE: |-| not followed by |nmstart|.
507     $self->{state} = BEFORE_TOKEN_STATE;
508     # reprocess
509 wakaba 1.13 return {type => MINUS_TOKEN};
510 wakaba 1.1 }
511 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
512     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
513     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
514     $self->{c} == 0x005F or # _
515     $self->{c} > 0x007F) { # nonascii
516 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
517 wakaba 1.3 $self->{state} = NAME_STATE;
518     $self->{c} = $self->{get_char}->();
519     redo A;
520     } elsif ($self->{c} == 0x002D) { # -
521 wakaba 1.5 $self->{t}->{value} .= '-';
522 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
523     $self->{c} = $self->{get_char}->();
524     redo A;
525     } elsif ($self->{c} == 0x005C) { # \
526     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
527     $self->{c} = $self->{get_char}->();
528     redo A;
529     } else {
530     $self->{state} = BEFORE_TOKEN_STATE;
531     # reprocess
532     return {type => DELIM_TOKEN, value => '@'};
533     }
534     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
535     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
536     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
537     $self->{c} == 0x005F or # _
538     $self->{c} > 0x007F) { # nonascii
539 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
540 wakaba 1.3 $self->{state} = NAME_STATE;
541     $self->{c} = $self->{get_char}->();
542     redo A;
543     } elsif ($self->{c} == 0x002D) { # -
544     $self->{c} = $self->{get_char}->();
545     if ($self->{c} == 0x003E) { # >
546 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
547 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
548     $self->{c} = $self->{get_char}->();
549 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
550 wakaba 1.3 #redo A;
551     } else {
552 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
553 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
554 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
555     # reprocess
556     return {type => DELIM_TOKEN, value => '@'};
557     #redo A;
558     }
559     } elsif ($self->{c} == 0x005C) { # \
560     ## TODO: @-\{nl}
561     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
562     $self->{c} = $self->{get_char}->();
563     redo A;
564     } else {
565 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
566 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
567     # reprocess
568     return {type => DELIM_TOKEN, value => '@'};
569     }
570 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
571     if ($self->{c} == 0x002D) { # -
572     ## NOTE: |-| in |ident|.
573 wakaba 1.10 $self->{t}->{hyphen} = 1;
574 wakaba 1.5 $self->{t}->{value} = '-';
575 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
576 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
577     $self->{c} = $self->{get_char}->();
578     redo A;
579 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
580     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
581 wakaba 1.1 $self->{c} == 0x005F or # _
582     $self->{c} > 0x007F) { # nonascii
583     ## NOTE: |nmstart| in |ident|.
584 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
585     $self->{t}->{type} = DIMENSION_TOKEN;
586 wakaba 1.1 $self->{state} = NAME_STATE;
587     $self->{c} = $self->{get_char}->();
588     redo A;
589     } elsif ($self->{c} == 0x005C) { # \
590     ## NOTE: |nmstart| in |ident| in |IDENT|
591 wakaba 1.5 $self->{t}->{value} = '';
592 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
593 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
594     $self->{c} = $self->{get_char}->();
595     redo A;
596     } elsif ($self->{c} == 0x0025) { # %
597 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
598 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
599     $self->{c} = $self->{get_char}->();
600 wakaba 1.5 return $self->{t};
601 wakaba 1.1 #redo A;
602     } else {
603     $self->{state} = BEFORE_TOKEN_STATE;
604     # reprocess
605 wakaba 1.5 return $self->{t};
606 wakaba 1.1 #redo A;
607     }
608     } elsif ($self->{state} == HASH_OPEN_STATE) {
609     ## NOTE: The first |nmchar| in |name| in |HASH|.
610 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
611     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
612     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
613 wakaba 1.1 $self->{c} == 0x002D or # -
614     $self->{c} == 0x005F or # _
615     $self->{c} > 0x007F) { # nonascii
616 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
617 wakaba 1.1 $self->{state} = NAME_STATE;
618     $self->{c} = $self->{get_char}->();
619     redo A;
620     } elsif ($self->{c} == 0x005C) { # \
621     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
622     $self->{c} = $self->{get_char}->();
623     redo A;
624     } else {
625     $self->{state} = BEFORE_TOKEN_STATE;
626 wakaba 1.9 # reprocess
627 wakaba 1.1 return {type => DELIM_TOKEN, value => '#'};
628     #redo A;
629     }
630     } elsif ($self->{state} == NAME_STATE) {
631     ## NOTE: |nmchar| in (|ident| or |name|).
632 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
633     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
634     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
635 wakaba 1.1 $self->{c} == 0x005F or # _
636     $self->{c} == 0x002D or # -
637     $self->{c} > 0x007F) { # nonascii
638 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
639 wakaba 1.1 # stay in the state
640     $self->{c} = $self->{get_char}->();
641     redo A;
642     } elsif ($self->{c} == 0x005C) { # \
643 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
644 wakaba 1.1 $self->{c} = $self->{get_char}->();
645     redo A;
646     } elsif ($self->{c} == 0x0028 and # (
647 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
648     my $func_name = $self->{t}->{value};
649 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
650     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
651 wakaba 1.5 if ($self->{t}->{has_escape}) {
652 wakaba 1.3 ## TODO: warn
653     }
654 wakaba 1.5 $self->{t}->{type}
655 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
656 wakaba 1.5 $self->{t}->{value} = '';
657 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
658     $self->{c} = $self->{get_char}->();
659     redo A;
660     } else {
661 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
662 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
663     $self->{c} = $self->{get_char}->();
664 wakaba 1.5 return $self->{t};
665 wakaba 1.1 #redo A;
666     }
667     } else {
668     $self->{state} = BEFORE_TOKEN_STATE;
669     # reconsume
670 wakaba 1.5 return $self->{t};
671 wakaba 1.1 #redo A;
672     }
673 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
674     while ({
675     0x0020 => 1, # SP
676     0x0009 => 1, # \t
677     0x000D => 1, # \r
678     0x000A => 1, # \n
679     0x000C => 1, # \f
680     }->{$self->{c}}) {
681     $self->{c} = $self->{get_char}->();
682     }
683     if ($self->{c} == -1) {
684 wakaba 1.5 $self->{t}->{type} = {
685 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
686     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
687     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
688     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
689 wakaba 1.5 }->{$self->{t}->{type}};
690 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
691     $self->{c} = $self->{get_char}->();
692 wakaba 1.5 return $self->{t};
693 wakaba 1.3 #redo A;
694     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
695     ## TODO: Should we consider matches of "(" and ")"?
696 wakaba 1.5 $self->{t}->{type} = {
697 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
698     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
699     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
700     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
701 wakaba 1.5 }->{$self->{t}->{type}};
702 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
703     $self->{c} = $self->{get_char}->();
704     redo A;
705     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
706     $self->{state} = STRING_STATE; $q = $self->{c};
707     $self->{c} = $self->{get_char}->();
708     redo A;
709     } elsif ($self->{c} == 0x0029) { # )
710     $self->{state} = BEFORE_TOKEN_STATE;
711     $self->{c} = $self->{get_char}->();
712 wakaba 1.5 return $self->{t};
713 wakaba 1.3 #redo A;
714     } elsif ($self->{c} == 0x005C) { # \
715     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
716     $self->{c} = $self->{get_char}->();
717     redo A;
718     } else {
719 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
720 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
721     $self->{c} = $self->{get_char}->();
722     redo A;
723     }
724     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
725     if ({
726     0x0020 => 1, # SP
727     0x0009 => 1, # \t
728     0x000D => 1, # \r
729     0x000A => 1, # \n
730     0x000C => 1, # \f
731     }->{$self->{c}}) {
732     $self->{state} = URI_AFTER_WSP_STATE;
733     $self->{c} = $self->{get_char}->();
734     redo A;
735     } elsif ($self->{c} == -1) {
736 wakaba 1.5 $self->{t}->{type} = {
737 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
738     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
739     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
740     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
741 wakaba 1.5 }->{$self->{t}->{type}};
742 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
743     $self->{c} = $self->{get_char}->();
744 wakaba 1.5 return $self->{t};
745 wakaba 1.3 #redo A;
746     } elsif ($self->{c} < 0x0020 or {
747     0x0022 => 1, # "
748     0x0027 => 1, # '
749     0x0028 => 1, # (
750     }->{$self->{c}}) { # C0 or (
751     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
752 wakaba 1.5 $self->{t}->{type} = {
753 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
754     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
755     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
756     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
757 wakaba 1.5 }->{$self->{t}->{type}};
758 wakaba 1.3 # stay in the state.
759     $self->{c} = $self->{get_char}->();
760     redo A;
761     } elsif ($self->{c} == 0x0029) { # )
762     $self->{state} = BEFORE_TOKEN_STATE;
763     $self->{c} = $self->{get_char}->();
764 wakaba 1.5 return $self->{t};
765 wakaba 1.3 #redo A;
766     } elsif ($self->{c} == 0x005C) { # \
767     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
768     $self->{c} = $self->{get_char}->();
769     redo A;
770     } else {
771 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
772 wakaba 1.3 # stay in the state.
773     $self->{c} = $self->{get_char}->();
774     redo A;
775     }
776     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
777     if ({
778     0x0020 => 1, # SP
779     0x0009 => 1, # \t
780     0x000D => 1, # \r
781     0x000A => 1, # \n
782     0x000C => 1, # \f
783     }->{$self->{c}}) {
784     # stay in the state.
785     $self->{c} = $self->{get_char}->();
786     redo A;
787     } elsif ($self->{c} == -1) {
788 wakaba 1.5 $self->{t}->{type} = {
789 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
790     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
791     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
792     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
793 wakaba 1.5 }->{$self->{t}->{type}};
794 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
795     $self->{c} = $self->{get_char}->();
796 wakaba 1.5 return $self->{t};
797 wakaba 1.3 #redo A;
798     } elsif ($self->{c} == 0x0029) { # )
799     $self->{state} = BEFORE_TOKEN_STATE;
800     $self->{c} = $self->{get_char}->();
801 wakaba 1.5 return $self->{t};
802 wakaba 1.3 #redo A;
803     } elsif ($self->{c} == 0x005C) { # \
804     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
805     $self->{c} = $self->{get_char}->();
806     redo A;
807     } else {
808     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
809 wakaba 1.5 $self->{t}->{type} = {
810 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
811     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
812     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
813     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
814 wakaba 1.5 }->{$self->{t}->{type}};
815 wakaba 1.3 # stay in the state.
816     $self->{c} = $self->{get_char}->();
817     redo A;
818     }
819 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
820 wakaba 1.5 $self->{t}->{has_escape} = 1;
821 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
822     ## NOTE: second character of |unicode| in |escape|.
823     $char = $self->{c} - 0x0030;
824     $self->{state} = ESCAPE_STATE; $i = 2;
825     $self->{c} = $self->{get_char}->();
826     redo A;
827     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
828     ## NOTE: second character of |unicode| in |escape|.
829     $char = $self->{c} - 0x0041 + 0xA;
830     $self->{state} = ESCAPE_STATE; $i = 2;
831     $self->{c} = $self->{get_char}->();
832     redo A;
833 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
834 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
835 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
836 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
837     $self->{c} = $self->{get_char}->();
838     redo A;
839     } elsif ($self->{c} == 0x000A or # \n
840     $self->{c} == 0x000C) { # \f
841     if ($q == 0) {
842 wakaba 1.7 #
843 wakaba 1.3 } elsif ($q == 1) {
844     ## NOTE: In |escape| in |URI|.
845 wakaba 1.5 $self->{t}->{type} = {
846 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
847     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
848     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
849     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
850 wakaba 1.5 }->{$self->{t}->{type}};
851     $self->{t}->{value} .= chr $self->{c};
852 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
853     $self->{c} = $self->{get_char}->();
854     redo A;
855 wakaba 1.1 } else {
856     ## Note: In |nl| in ... in |string| or |ident|.
857 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
858 wakaba 1.1 $self->{state} = STRING_STATE;
859     $self->{c} = $self->{get_char}->();
860     redo A;
861     }
862     } elsif ($self->{c} == 0x000D) { # \r
863     if ($q == 0) {
864 wakaba 1.7 #
865 wakaba 1.3 } elsif ($q == 1) {
866 wakaba 1.7 ## NOTE: In |escape| in |URI|.
867 wakaba 1.5 $self->{t}->{type} = {
868 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
869     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872 wakaba 1.5 }->{$self->{t}->{type}};
873 wakaba 1.8 $self->{t}->{value} .= "\x0D";
874     $self->{state} = ESCAPE_BEFORE_LF_STATE;
875 wakaba 1.3 $self->{c} = $self->{get_char}->();
876     redo A;
877 wakaba 1.1 } else {
878     ## Note: In |nl| in ... in |string| or |ident|.
879 wakaba 1.8 $self->{t}->{value} .= "\x0D";
880 wakaba 1.1 $self->{state} = ESCAPE_BEFORE_LF_STATE;
881     $self->{c} = $self->{get_char}->();
882     redo A;
883     }
884 wakaba 1.7 } elsif ($self->{c} == -1) {
885     #
886 wakaba 1.1 } else {
887     ## NOTE: second character of |escape|.
888 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
889 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
890     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
891 wakaba 1.1 $self->{c} = $self->{get_char}->();
892     redo A;
893     }
894 wakaba 1.7
895     if ($q == 0) {
896 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
897     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
898     $self->{state} = BEFORE_TOKEN_STATE;
899     # reprocess
900     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
901 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
902 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
903     $self->{t}->{value} = '';
904     return $self->{t};
905     #redo A;
906     } elsif (length $self->{t}->{value}) {
907     $self->{state} = BEFORE_TOKEN_STATE;
908     # reprocess
909     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
910     return $self->{t};
911     #redo A;
912     } else {
913     $self->{state} = BEFORE_TOKEN_STATE;
914     # reprocess
915     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
916     $self->{t}->{type} = NUMBER_TOKEN;
917     $self->{t}->{value} = '';
918     return $self->{t};
919     #redo A;
920     }
921 wakaba 1.7 } else {
922 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
923     $self->{state} = BEFORE_TOKEN_STATE;
924     # reprocess
925     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
926 wakaba 1.13 return {type => MINUS_TOKEN};
927 wakaba 1.10 #redo A;
928     } elsif (length $self->{t}->{value}) {
929     $self->{state} = BEFORE_TOKEN_STATE;
930     # reprocess
931     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
932     return $self->{t};
933     #redo A;
934     } else {
935     $self->{state} = BEFORE_TOKEN_STATE;
936     # reprocess
937     return {type => DELIM_TOKEN, value => '\\'};
938     #redo A;
939     }
940 wakaba 1.7 }
941 wakaba 1.8 } elsif ($q == 1) {
942     $self->{state} = URI_UNQUOTED_STATE;
943 wakaba 1.7 $self->{c} = $self->{get_char}->();
944     redo A;
945 wakaba 1.8 } else {
946     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
947     $self->{t}->{type} = {
948     STRING_TOKEN, INVALID_TOKEN,
949     URI_TOKEN, URI_INVALID_TOKEN,
950     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
951     }->{$self->{t}->{type}} || $self->{t}->{type};
952     $self->{state} = BEFORE_TOKEN_STATE;
953     # reprocess
954     return $self->{t};
955     #redo A;
956 wakaba 1.7 }
957 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
958     ## NOTE: third..seventh character of |unicode| in |escape|.
959     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
960     $char = $char * 0x10 + $self->{c} - 0x0030;
961     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
962     $self->{c} = $self->{get_char}->();
963     redo A;
964     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
965     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
966     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
967     $self->{c} = $self->{get_char}->();
968     redo A;
969 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
970 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
971 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
972     $self->{c} = $self->{get_char}->();
973     redo A;
974     } elsif ($self->{c} == 0x0020 or # SP
975     $self->{c} == 0x000A or # \n
976     $self->{c} == 0x0009 or # \t
977     $self->{c} == 0x000C) { # \f
978 wakaba 1.5 $self->{t}->{value} .= chr $char;
979 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
980     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
981 wakaba 1.1 $self->{c} = $self->{get_char}->();
982     redo A;
983     } elsif ($self->{c} == 0x000D) { # \r
984     $self->{state} = ESCAPE_BEFORE_LF_STATE;
985     $self->{c} = $self->{get_char}->();
986     redo A;
987     } else {
988 wakaba 1.5 $self->{t}->{value} .= chr $char;
989 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
990     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
991 wakaba 1.1 # reconsume
992     redo A;
993     }
994     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
995     ## NOTE: eighth character of |unicode| in |escape|.
996     if ($self->{c} == 0x0020 or # SP
997     $self->{c} == 0x000A or # \n
998     $self->{c} == 0x0009 or # \t
999     $self->{c} == 0x000C) { # \f
1000 wakaba 1.5 $self->{t}->{value} .= chr $char;
1001 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1002     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1003 wakaba 1.1 $self->{c} = $self->{get_char}->();
1004     redo A;
1005     } elsif ($self->{c} == 0x000D) { # \r
1006     $self->{state} = ESCAPE_BEFORE_NL_STATE;
1007     $self->{c} = $self->{get_char}->();
1008     redo A;
1009     } else {
1010 wakaba 1.5 $self->{t}->{value} .= chr $char;
1011 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1012     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1013 wakaba 1.1 # reconsume
1014     redo A;
1015     }
1016     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1017     ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
1018     if ($self->{c} == 0x000A) { # \n
1019 wakaba 1.8 $self->{t}->{value} .= chr $self->{c};
1020 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1021     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1022 wakaba 1.1 $self->{c} = $self->{get_char}->();
1023     redo A;
1024     } else {
1025 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1026     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1027 wakaba 1.8 # reprocess
1028 wakaba 1.1 redo A;
1029     }
1030     } elsif ($self->{state} == STRING_STATE) {
1031     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1032     ## a character in |invalid$Q| in |invalid| in |INVALID|,
1033     ## where |$Q = $q == 0x0022 ? 1 : 2|.
1034 wakaba 1.3 ## Or, in |URI|.
1035 wakaba 1.1 if ($self->{c} == 0x005C) { # \
1036     $self->{state} = ESCAPE_OPEN_STATE;
1037     $self->{c} = $self->{get_char}->();
1038     redo A;
1039     } elsif ($self->{c} == $q) { # " | '
1040 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1041 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1042     $self->{c} = $self->{get_char}->();
1043 wakaba 1.5 return $self->{t};
1044 wakaba 1.3 #redo A;
1045     } else {
1046     $self->{state} = URI_AFTER_WSP_STATE;
1047     $self->{c} = $self->{get_char}->();
1048     redo A;
1049     }
1050 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1051     $self->{c} == 0x000D or # \r
1052     $self->{c} == 0x000C or # \f
1053     $self->{c} == -1) {
1054 wakaba 1.11 $self->{t}->{type} = {
1055     STRING_TOKEN, INVALID_TOKEN,
1056     INVALID_TOKEN, INVALID_TOKEN,
1057     URI_TOKEN, URI_INVALID_TOKEN,
1058     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1059     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1060     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1061     }->{$self->{t}->{type}};
1062 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1063     # reconsume
1064 wakaba 1.5 return $self->{t};
1065 wakaba 1.1 #redo A;
1066     } else {
1067 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1068 wakaba 1.1 # stay in the state
1069     $self->{c} = $self->{get_char}->();
1070     redo A;
1071     }
1072     } elsif ($self->{state} == NUMBER_STATE) {
1073     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1074     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1075 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1076 wakaba 1.1 # stay in the state
1077     $self->{c} = $self->{get_char}->();
1078     redo A;
1079     } elsif ($self->{c} == 0x002E) { # .
1080     $self->{state} = NUMBER_DOT_STATE;
1081     $self->{c} = $self->{get_char}->();
1082     redo A;
1083     } else {
1084 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1085     $self->{t}->{value} = '';
1086 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1087     # reprocess
1088 wakaba 1.2 redo A;
1089 wakaba 1.1 }
1090     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1091     ## NOTE: The character immediately following |.| in |num|.
1092     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1093 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1094 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1095     $self->{c} = $self->{get_char}->();
1096     redo A;
1097     } else {
1098 wakaba 1.13 unshift @{$self->{token}}, {type => DOT_TOKEN};
1099 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1100     $self->{t}->{value} = '';
1101 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1102     # reprocess
1103 wakaba 1.5 return $self->{t};
1104 wakaba 1.1 #redo A;
1105     }
1106     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1107     ## NOTE: The character immediately following |.| at the beginning of |num|.
1108     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1109 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1110 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1111     $self->{c} = $self->{get_char}->();
1112     redo A;
1113     } else {
1114     $self->{state} = BEFORE_TOKEN_STATE;
1115 wakaba 1.9 # reprocess
1116 wakaba 1.13 return {type => DOT_TOKEN};
1117 wakaba 1.1 #redo A;
1118     }
1119     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1120     ## NOTE: |[0-9]| in |num| after |.|.
1121     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1122 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1123 wakaba 1.1 # stay in the state
1124     $self->{c} = $self->{get_char}->();
1125     redo A;
1126     } else {
1127 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1128     $self->{t}->{value} = '';
1129 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1130     # reprocess
1131 wakaba 1.2 redo A;
1132 wakaba 1.1 }
1133     } else {
1134     die "$0: Unknown state |$self->{state}|";
1135     }
1136     } # A
1137     } # get_next_token
1138    
1139     1;
1140 wakaba 1.14 # $Date: 2007/09/08 17:43:41 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24