/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.13 - (hide annotations) (download)
Sat Sep 8 17:43:41 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.12: +41 -21 lines
++ whatpm/t/ChangeLog	8 Sep 2007 17:43:26 -0000
	* css-token-1.test: Test cases for remaining CSS tokens
	are added.

2007-09-09  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 17:43:04 -0000
	* Tokenizer.pm: Delimiters frequently used in CSS
	now have different |type|s than |DELIM_TOKEN|.

2007-09-09  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
4 wakaba 1.2 sub BEFORE_TOKEN_STATE () { 0 }
5     sub BEFORE_NMSTART_STATE () { 1 }
6     sub NAME_STATE () { 2 }
7     sub ESCAPE_OPEN_STATE () { 3 }
8     sub STRING_STATE () { 4 }
9     sub HASH_OPEN_STATE () { 5 }
10     sub NUMBER_STATE () { 6 }
11     sub NUMBER_FRACTION_STATE () { 7 }
12     sub AFTER_NUMBER_STATE () { 8 }
13     sub URI_BEFORE_WSP_STATE () { 9 }
14     sub ESCAPE_STATE () { 10 }
15     sub ESCAPE_BEFORE_LF_STATE () { 11 }
16     sub ESCAPE_BEFORE_NL_STATE () { 12 }
17     sub NUMBER_DOT_STATE () { 13 }
18     sub NUMBER_DOT_NUMBER_STATE () { 14 }
19     sub DELIM_STATE () { 15 }
20 wakaba 1.3 sub URI_UNQUOTED_STATE () { 16 }
21     sub URI_AFTER_WSP_STATE () { 17 }
22     sub AFTER_AT_STATE () { 18 }
23     sub AFTER_AT_HYPHEN_STATE () { 19 }
24 wakaba 1.2
25     sub IDENT_TOKEN () { 1 }
26     sub ATKEYWORD_TOKEN () { 2 }
27     sub HASH_TOKEN () { 3 }
28     sub FUNCTION_TOKEN () { 4 }
29     sub URI_TOKEN () { 5 }
30     sub URI_INVALID_TOKEN () { 6 }
31     sub URI_PREFIX_TOKEN () { 7 }
32     sub URI_PREFIX_INVALID_TOKEN () { 8 }
33     sub STRING_TOKEN () { 9 }
34     sub INVALID_TOKEN () { 10 }
35     sub NUMBER_TOKEN () { 11 }
36     sub DIMENSION_TOKEN () { 12 }
37     sub PERCENTAGE_TOKEN () { 13 }
38     sub UNICODE_RANGE_TOKEN () { 14 }
39     sub DELIM_TOKEN () { 16 }
40     sub PLUS_TOKEN () { 17 }
41     sub GREATER_TOKEN () { 18 }
42     sub COMMA_TOKEN () { 19 }
43     sub TILDE_TOKEN () { 20 }
44     sub DASHMATCH_TOKEN () { 21 }
45     sub PREFIXMATCH_TOKEN () { 22 }
46     sub SUFFIXMATCH_TOKEN () { 23 }
47     sub SUBSTRINGMATCH_TOKEN () { 24 }
48     sub INCLUDES_TOKEN () { 25 }
49     sub SEMICOLON_TOKEN () { 26 }
50     sub LBRACE_TOKEN () { 27 }
51     sub RBRACE_TOKEN () { 28 }
52     sub LPAREN_TOKEN () { 29 }
53     sub RPAREN_TOKEN () { 30 }
54     sub LBRACKET_TOKEN () { 31 }
55     sub RBRACKET_TOKEN () { 32 }
56     sub S_TOKEN () { 33 }
57     sub CDO_TOKEN () { 34 }
58     sub CDC_TOKEN () { 35 }
59     sub COMMENT_TOKEN () { 36 }
60     sub COMMENT_INVALID_TOKEN () { 37 }
61     sub EOF_TOKEN () { 38 }
62 wakaba 1.13 sub MINUS_TOKEN () { 39 }
63     sub STAR_TOKEN () { 40 }
64     sub VBAR_TOKEN () { 41 }
65     sub DOT_TOKEN () { 42 }
66     sub COLON_TOKEN () { 43 }
67     sub MATCH_TOKEN () { 44 }
68     sub EXCLAMATION_TOKEN () { 45 }
69 wakaba 1.2
70     our @TokenName = qw(
71 wakaba 1.3 0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
72 wakaba 1.2 STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
73 wakaba 1.6 0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
74 wakaba 1.2 PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
75     LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
76 wakaba 1.13 COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
77 wakaba 1.2 );
78    
79 wakaba 1.1 sub new ($) {
80 wakaba 1.2 my $self = bless {token => [], get_char => sub { -1 },
81     onerror => sub { }}, shift;
82 wakaba 1.1 return $self;
83     } # new
84    
85     sub init ($) {
86     my $self = shift;
87     $self->{state} = BEFORE_TOKEN_STATE;
88     $self->{c} = $self->{get_char}->();
89 wakaba 1.5 #$self->{t} = {type => token-type, value => value, number => number};
90 wakaba 1.1 } # init
91    
92     sub get_next_token ($) {
93     my $self = shift;
94     if (@{$self->{token}}) {
95     return shift @{$self->{token}};
96     }
97    
98     my $char;
99     my $num; # |{num}|, if any.
100     my $i; # |$i + 1|th character in |unicode| in |escape|.
101 wakaba 1.3 my $q;
102     ## NOTE:
103     ## 0: in |ident|.
104     ## 1: in |URI| outside of |string|.
105     ## 0x0022: in |string1| or |invalid1|.
106     ## 0x0027: in |string2| or |invalid2|.
107 wakaba 1.1
108     A: {
109     if ($self->{state} == BEFORE_TOKEN_STATE) {
110     if ($self->{c} == 0x002D) { # -
111     ## NOTE: |-| in |ident| in |IDENT|
112 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
113 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
114     $self->{c} = $self->{get_char}->();
115     redo A;
116 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
117     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
118     $self->{c} = $self->{get_char}->();
119     if ($self->{c} == 0x002B) { # +
120     $self->{c} = $self->{get_char}->();
121     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
122     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
123     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
124     $self->{c} == 0x003F) { # ?
125 wakaba 1.12 $self->{t}->{value} = chr $self->{c};
126 wakaba 1.5 $self->{t}->{type} = UNICODE_RANGE_TOKEN;
127     $self->{c} = $self->{get_char}->();
128     C: for (2..6) {
129     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
130     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
131     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
132     $self->{c} == 0x003F) { # ?
133     $self->{t}->{value} .= chr $self->{c};
134     $self->{c} = $self->{get_char}->();
135     } else {
136     last C;
137     }
138     } # C
139    
140     if ($self->{c} == 0x002D) { # -
141     $self->{c} = $self->{get_char}->();
142     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
145     $self->{t}->{value} .= '-' . chr $self->{c};
146     $self->{c} = $self->{get_char}->();
147     C: for (2..6) {
148     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
149     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
150     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
151     $self->{t}->{value} .= chr $self->{c};
152     $self->{c} = $self->{get_char}->();
153     } else {
154     last C;
155     }
156     } # C
157    
158     #
159     } else {
160     my $token = $self->{t};
161     $self->{t} = {type => IDENT_TOKEN, value => '-'};
162     $self->{state} = BEFORE_NMSTART_STATE;
163     # reprocess
164     return $token;
165     #redo A;
166     }
167     }
168    
169     $self->{state} = BEFORE_TOKEN_STATE;
170     # reprocess
171     return $self->{t};
172     #redo A;
173     } else {
174     unshift @{$self->{token}}, {type => PLUS_TOKEN};
175     $self->{state} = BEFORE_TOKEN_STATE;
176     # reprocess
177     return $self->{t};
178     #redo A;
179     }
180     } else {
181     $self->{state} = NAME_STATE;
182     # reprocess
183     redo A;
184     }
185 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
186     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
187 wakaba 1.1 $self->{c} == 0x005F or # _
188     $self->{c} > 0x007F) { # nonascii
189     ## NOTE: |nmstart| in |ident| in |IDENT|
190 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
191 wakaba 1.1 $self->{state} = NAME_STATE;
192     $self->{c} = $self->{get_char}->();
193     redo A;
194     } elsif ($self->{c} == 0x005C) { # \
195     ## NOTE: |nmstart| in |ident| in |IDENT|
196 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
197 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
198     $self->{c} = $self->{get_char}->();
199     redo A;
200     } elsif ($self->{c} == 0x0040) { # @
201     ## NOTE: |@| in |ATKEYWORD|
202 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
203 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
204 wakaba 1.1 $self->{c} = $self->{get_char}->();
205     redo A;
206 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
207 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
208 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
209 wakaba 1.1 $self->{c} = $self->{get_char}->();
210     redo A;
211     } elsif ($self->{c} == 0x0023) { # #
212     ## NOTE: |#| in |HASH|.
213 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
214 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
215     $self->{c} = $self->{get_char}->();
216     redo A;
217     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
218     ## NOTE: |num|.
219 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
220 wakaba 1.1 $self->{state} = NUMBER_STATE;
221     $self->{c} = $self->{get_char}->();
222     redo A;
223     } elsif ($self->{c} == 0x002E) { # .
224     ## NOTE: |num|.
225 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
226 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
227     $self->{c} = $self->{get_char}->();
228     redo A;
229 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
230     $self->{c} = $self->{get_char}->();
231     if ($self->{c} == 0x002A) { # *
232     C: {
233     $self->{c} = $self->{get_char}->();
234     if ($self->{c} == 0x002A) { # *
235     D: {
236     $self->{c} = $self->{get_char}->();
237     if ($self->{c} == 0x002F) { # /
238     #
239     } elsif ($self->{c} == 0x002A) { # *
240     redo D;
241     } else {
242     redo C;
243     }
244     } # D
245     } elsif ($self->{c} == -1) {
246     # stay in the state
247     # reprocess
248     return {type => COMMENT_INVALID_TOKEN};
249     #redo A;
250     } else {
251     redo C;
252     }
253     } # C
254    
255     # stay in the state.
256     $self->{c} = $self->{get_char}->();
257     redo A;
258     } else {
259     # stay in the state.
260     # reprocess
261 wakaba 1.9 return {type => DELIM_TOKEN, value => '/'};
262 wakaba 1.4 #redo A;
263     }
264 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
265     ## NOTE: |CDO|
266     $self->{c} = $self->{get_char}->();
267     if ($self->{c} == 0x0021) { # !
268     $self->{c} = $self->{get_char}->();
269 wakaba 1.9 if ($self->{c} == 0x002D) { # -
270 wakaba 1.1 $self->{c} = $self->{get_char}->();
271 wakaba 1.9 if ($self->{c} == 0x002D) { # -
272 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
273     $self->{c} = $self->{get_char}->();
274     return {type => CDO_TOKEN};
275     #redo A;
276     } else {
277 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
278 wakaba 1.1 ## NOTE: |-| in |ident| in |IDENT|
279 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
280 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
281     #reprocess
282     return {type => DELIM_TOKEN, value => '<'};
283     #redo A;
284     }
285     } else {
286 wakaba 1.13 unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
287 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
288     #reprocess
289     return {type => DELIM_TOKEN, value => '<'};
290     #redo A;
291     }
292     } else {
293     $self->{state} = BEFORE_TOKEN_STATE;
294     #reprocess
295     return {type => DELIM_TOKEN, value => '<'};
296     #redo A;
297     }
298 wakaba 1.2 } elsif (my $t = {
299 wakaba 1.13 0x0021 => EXCLAMATION_TOKEN, # !
300     0x002D => MINUS_TOKEN, # -
301     0x002E => DOT_TOKEN, # .
302     0x003A => COLON_TOKEN, # :
303     0x003B => SEMICOLON_TOKEN, # ;
304     0x003D => MATCH_TOKEN, # =
305     0x007B => LBRACE_TOKEN, # {
306     0x007D => RBRACE_TOKEN, # }
307     0x0028 => LPAREN_TOKEN, # (
308     0x0029 => RPAREN_TOKEN, # )
309     0x005B => LBRACKET_TOKEN, # [
310     0x005D => RBRACKET_TOKEN, # ]
311 wakaba 1.1 }->{$self->{c}}) {
312     # stay in the state
313     $self->{c} = $self->{get_char}->();
314 wakaba 1.2 return {type => $t};
315 wakaba 1.1 # redo A;
316     } elsif ({
317     0x0020 => 1, # SP
318     0x0009 => 1, # \t
319     0x000D => 1, # \r
320     0x000A => 1, # \n
321     0x000C => 1, # \f
322     }->{$self->{c}}) {
323     W: {
324     $self->{c} = $self->{get_char}->();
325     if ({
326     0x0020 => 1, # SP
327     0x0009 => 1, # \t
328     0x000D => 1, # \r
329     0x000A => 1, # \n
330     0x000C => 1, # \f
331     }->{$self->{c}}) {
332     redo W;
333     } elsif (my $v = {
334     0x002B => PLUS_TOKEN, # +
335     0x003E => GREATER_TOKEN, # >
336     0x002C => COMMA_TOKEN, # ,
337     0x007E => TILDE_TOKEN, # ~
338     }->{$self->{c}}) {
339     # stay in the state
340     $self->{c} = $self->{get_char}->();
341     return {type => $v};
342     #redo A;
343     } else {
344     # stay in the state
345     # reprocess
346     return {type => S_TOKEN};
347     #redo A;
348     }
349     } # W
350     } elsif (my $v = {
351     0x007C => DASHMATCH_TOKEN, # |
352     0x005E => PREFIXMATCH_TOKEN, # ^
353     0x0024 => SUFFIXMATCH_TOKEN, # $
354     0x002A => SUBSTRINGMATCH_TOKEN, # *
355     }->{$self->{c}}) {
356 wakaba 1.2 my $c = $self->{c};
357 wakaba 1.1 $self->{c} = $self->{get_char}->();
358     if ($self->{c} == 0x003D) { # =
359     # stay in the state
360     $self->{c} = $self->{get_char}->();
361     return {type => $v};
362     #redo A;
363 wakaba 1.13 } elsif ($v = {
364     0x002A => STAR_TOKEN, # *
365     0x007C => VBAR_TOKEN, # |
366     }->{$c}) {
367     # stay in the state.
368     # reprocess
369     return {type => $v};
370     #redo A;
371 wakaba 1.1 } else {
372     # stay in the state
373     # reprocess
374 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
375 wakaba 1.1 #redo A;
376     }
377     } elsif ($self->{c} == 0x002B) { # +
378     # stay in the state
379     $self->{c} = $self->{get_char}->();
380     return {type => PLUS_TOKEN};
381     #redo A;
382     } elsif ($self->{c} == 0x003E) { # >
383     # stay in the state
384     $self->{c} = $self->{get_char}->();
385     return {type => GREATER_TOKEN};
386     #redo A;
387     } elsif ($self->{c} == 0x002C) { # ,
388     # stay in the state
389     $self->{c} = $self->{get_char}->();
390     return {type => COMMA_TOKEN};
391     #redo A;
392     } elsif ($self->{c} == 0x007E) { # ~
393     $self->{c} = $self->{get_char}->();
394     if ($self->{c} == 0x003D) { # =
395     # stay in the state
396     $self->{c} = $self->{get_char}->();
397     return {type => INCLUDES_TOKEN};
398     #redo A;
399     } else {
400     # stay in the state
401     # reprocess
402     return {type => TILDE_TOKEN};
403     #redo A;
404     }
405     } elsif ($self->{c} == -1) {
406     # stay in the state
407     $self->{c} = $self->{get_char}->();
408     return {type => EOF_TOKEN};
409     #redo A;
410     } else {
411     # stay in the state
412 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
413 wakaba 1.1 $self->{c} = $self->{get_char}->();
414 wakaba 1.5 return $self->{t};
415 wakaba 1.1 #redo A;
416     }
417     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
418 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
419     ## |FUNCTION|)
420 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
421     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
422 wakaba 1.1 $self->{c} == 0x005F or # _
423     $self->{c} > 0x007F) { # nonascii
424 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
425     $self->{t}->{type} = DIMENSION_TOKEN
426     if $self->{t}->{type} == NUMBER_TOKEN;
427 wakaba 1.1 $self->{state} = NAME_STATE;
428     $self->{c} = $self->{get_char}->();
429     redo A;
430     } elsif ($self->{c} == 0x005C) { # \
431     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
432     $self->{c} = $self->{get_char}->();
433     redo A;
434 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
435     if ($self->{t}->{type} == IDENT_TOKEN) {
436     $self->{c} = $self->{get_char}->();
437     if ($self->{c} == 0x003E) { # >
438     $self->{state} = BEFORE_TOKEN_STATE;
439     $self->{c} = $self->{get_char}->();
440     return {type => CDC_TOKEN};
441     #redo A;
442     } else {
443     ## NOTE: |-|, |-|, $self->{c}
444     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
445     # stay in the state
446     # reconsume
447 wakaba 1.13 return {type => MINUS_TOKEN};
448 wakaba 1.10 #redo A;
449     }
450     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
451 wakaba 1.1 $self->{c} = $self->{get_char}->();
452 wakaba 1.10 if ($self->{c} == 0x003E) { # >
453     unshift @{$self->{token}}, {type => CDC_TOKEN};
454     $self->{t}->{type} = NUMBER_TOKEN;
455     $self->{t}->{value} = '';
456     $self->{state} = BEFORE_TOKEN_STATE;
457     $self->{c} = $self->{get_char}->();
458     return $self->{t};
459     #redo A;
460     } else {
461     ## NOTE: |-|, |-|, $self->{c}
462     my $t = $self->{t};
463     $t->{type} = NUMBER_TOKEN;
464     $t->{value} = '';
465     $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
466 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
467 wakaba 1.10 # stay in the state
468     # reconsume
469     return $t;
470     #redo A;
471     }
472 wakaba 1.1 } else {
473 wakaba 1.10 #
474 wakaba 1.1 }
475     } else {
476 wakaba 1.10 #
477     }
478    
479     if ($self->{t}->{type} == DIMENSION_TOKEN) {
480     ## NOTE: |-| after |NUMBER|.
481 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
482 wakaba 1.10 $self->{state} = BEFORE_TOKEN_STATE;
483     # reprocess
484     $self->{t}->{type} = NUMBER_TOKEN;
485     $self->{t}->{value} = '';
486     return $self->{t};
487     } else {
488     ## NOTE: |-| not followed by |nmstart|.
489     $self->{state} = BEFORE_TOKEN_STATE;
490     # reprocess
491 wakaba 1.13 return {type => MINUS_TOKEN};
492 wakaba 1.1 }
493 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
494     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
495     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
496     $self->{c} == 0x005F or # _
497     $self->{c} > 0x007F) { # nonascii
498 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
499 wakaba 1.3 $self->{state} = NAME_STATE;
500     $self->{c} = $self->{get_char}->();
501     redo A;
502     } elsif ($self->{c} == 0x002D) { # -
503 wakaba 1.5 $self->{t}->{value} .= '-';
504 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
505     $self->{c} = $self->{get_char}->();
506     redo A;
507     } elsif ($self->{c} == 0x005C) { # \
508     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
509     $self->{c} = $self->{get_char}->();
510     redo A;
511     } else {
512     $self->{state} = BEFORE_TOKEN_STATE;
513     # reprocess
514     return {type => DELIM_TOKEN, value => '@'};
515     }
516     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
517     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
518     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
519     $self->{c} == 0x005F or # _
520     $self->{c} > 0x007F) { # nonascii
521 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
522 wakaba 1.3 $self->{state} = NAME_STATE;
523     $self->{c} = $self->{get_char}->();
524     redo A;
525     } elsif ($self->{c} == 0x002D) { # -
526     $self->{c} = $self->{get_char}->();
527     if ($self->{c} == 0x003E) { # >
528 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
529 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
530     $self->{c} = $self->{get_char}->();
531 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
532 wakaba 1.3 #redo A;
533     } else {
534 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
535 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
536 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
537     # reprocess
538     return {type => DELIM_TOKEN, value => '@'};
539     #redo A;
540     }
541     } elsif ($self->{c} == 0x005C) { # \
542     ## TODO: @-\{nl}
543     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
544     $self->{c} = $self->{get_char}->();
545     redo A;
546     } else {
547 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
548 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
549     # reprocess
550     return {type => DELIM_TOKEN, value => '@'};
551     }
552 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
553     if ($self->{c} == 0x002D) { # -
554     ## NOTE: |-| in |ident|.
555 wakaba 1.10 $self->{t}->{hyphen} = 1;
556 wakaba 1.5 $self->{t}->{value} = '-';
557 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
558 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
559     $self->{c} = $self->{get_char}->();
560     redo A;
561 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
562     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
563 wakaba 1.1 $self->{c} == 0x005F or # _
564     $self->{c} > 0x007F) { # nonascii
565     ## NOTE: |nmstart| in |ident|.
566 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
567     $self->{t}->{type} = DIMENSION_TOKEN;
568 wakaba 1.1 $self->{state} = NAME_STATE;
569     $self->{c} = $self->{get_char}->();
570     redo A;
571     } elsif ($self->{c} == 0x005C) { # \
572     ## NOTE: |nmstart| in |ident| in |IDENT|
573 wakaba 1.5 $self->{t}->{value} = '';
574 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
575 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
576     $self->{c} = $self->{get_char}->();
577     redo A;
578     } elsif ($self->{c} == 0x0025) { # %
579 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
580 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
581     $self->{c} = $self->{get_char}->();
582 wakaba 1.5 return $self->{t};
583 wakaba 1.1 #redo A;
584     } else {
585     $self->{state} = BEFORE_TOKEN_STATE;
586     # reprocess
587 wakaba 1.5 return $self->{t};
588 wakaba 1.1 #redo A;
589     }
590     } elsif ($self->{state} == HASH_OPEN_STATE) {
591     ## NOTE: The first |nmchar| in |name| in |HASH|.
592 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
593     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
594     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
595 wakaba 1.1 $self->{c} == 0x002D or # -
596     $self->{c} == 0x005F or # _
597     $self->{c} > 0x007F) { # nonascii
598 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
599 wakaba 1.1 $self->{state} = NAME_STATE;
600     $self->{c} = $self->{get_char}->();
601     redo A;
602     } elsif ($self->{c} == 0x005C) { # \
603     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
604     $self->{c} = $self->{get_char}->();
605     redo A;
606     } else {
607     $self->{state} = BEFORE_TOKEN_STATE;
608 wakaba 1.9 # reprocess
609 wakaba 1.1 return {type => DELIM_TOKEN, value => '#'};
610     #redo A;
611     }
612     } elsif ($self->{state} == NAME_STATE) {
613     ## NOTE: |nmchar| in (|ident| or |name|).
614 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
615     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
616     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
617 wakaba 1.1 $self->{c} == 0x005F or # _
618     $self->{c} == 0x002D or # -
619     $self->{c} > 0x007F) { # nonascii
620 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
621 wakaba 1.1 # stay in the state
622     $self->{c} = $self->{get_char}->();
623     redo A;
624     } elsif ($self->{c} == 0x005C) { # \
625 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
626 wakaba 1.1 $self->{c} = $self->{get_char}->();
627     redo A;
628     } elsif ($self->{c} == 0x0028 and # (
629 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
630     my $func_name = $self->{t}->{value};
631 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
632     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
633 wakaba 1.5 if ($self->{t}->{has_escape}) {
634 wakaba 1.3 ## TODO: warn
635     }
636 wakaba 1.5 $self->{t}->{type}
637 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
638 wakaba 1.5 $self->{t}->{value} = '';
639 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
640     $self->{c} = $self->{get_char}->();
641     redo A;
642     } else {
643 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
644 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
645     $self->{c} = $self->{get_char}->();
646 wakaba 1.5 return $self->{t};
647 wakaba 1.1 #redo A;
648     }
649     } else {
650     $self->{state} = BEFORE_TOKEN_STATE;
651     # reconsume
652 wakaba 1.5 return $self->{t};
653 wakaba 1.1 #redo A;
654     }
655 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
656     while ({
657     0x0020 => 1, # SP
658     0x0009 => 1, # \t
659     0x000D => 1, # \r
660     0x000A => 1, # \n
661     0x000C => 1, # \f
662     }->{$self->{c}}) {
663     $self->{c} = $self->{get_char}->();
664     }
665     if ($self->{c} == -1) {
666 wakaba 1.5 $self->{t}->{type} = {
667 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
668     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
669     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
670     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
671 wakaba 1.5 }->{$self->{t}->{type}};
672 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
673     $self->{c} = $self->{get_char}->();
674 wakaba 1.5 return $self->{t};
675 wakaba 1.3 #redo A;
676     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
677     ## TODO: Should we consider matches of "(" and ")"?
678 wakaba 1.5 $self->{t}->{type} = {
679 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
680     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
681     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
682     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
683 wakaba 1.5 }->{$self->{t}->{type}};
684 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
685     $self->{c} = $self->{get_char}->();
686     redo A;
687     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
688     $self->{state} = STRING_STATE; $q = $self->{c};
689     $self->{c} = $self->{get_char}->();
690     redo A;
691     } elsif ($self->{c} == 0x0029) { # )
692     $self->{state} = BEFORE_TOKEN_STATE;
693     $self->{c} = $self->{get_char}->();
694 wakaba 1.5 return $self->{t};
695 wakaba 1.3 #redo A;
696     } elsif ($self->{c} == 0x005C) { # \
697     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
698     $self->{c} = $self->{get_char}->();
699     redo A;
700     } else {
701 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
702 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
703     $self->{c} = $self->{get_char}->();
704     redo A;
705     }
706     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
707     if ({
708     0x0020 => 1, # SP
709     0x0009 => 1, # \t
710     0x000D => 1, # \r
711     0x000A => 1, # \n
712     0x000C => 1, # \f
713     }->{$self->{c}}) {
714     $self->{state} = URI_AFTER_WSP_STATE;
715     $self->{c} = $self->{get_char}->();
716     redo A;
717     } elsif ($self->{c} == -1) {
718 wakaba 1.5 $self->{t}->{type} = {
719 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
720     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
721     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
722     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
723 wakaba 1.5 }->{$self->{t}->{type}};
724 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
725     $self->{c} = $self->{get_char}->();
726 wakaba 1.5 return $self->{t};
727 wakaba 1.3 #redo A;
728     } elsif ($self->{c} < 0x0020 or {
729     0x0022 => 1, # "
730     0x0027 => 1, # '
731     0x0028 => 1, # (
732     }->{$self->{c}}) { # C0 or (
733     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
734 wakaba 1.5 $self->{t}->{type} = {
735 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
736     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
737     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
738     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
739 wakaba 1.5 }->{$self->{t}->{type}};
740 wakaba 1.3 # stay in the state.
741     $self->{c} = $self->{get_char}->();
742     redo A;
743     } elsif ($self->{c} == 0x0029) { # )
744     $self->{state} = BEFORE_TOKEN_STATE;
745     $self->{c} = $self->{get_char}->();
746 wakaba 1.5 return $self->{t};
747 wakaba 1.3 #redo A;
748     } elsif ($self->{c} == 0x005C) { # \
749     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
750     $self->{c} = $self->{get_char}->();
751     redo A;
752     } else {
753 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
754 wakaba 1.3 # stay in the state.
755     $self->{c} = $self->{get_char}->();
756     redo A;
757     }
758     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
759     if ({
760     0x0020 => 1, # SP
761     0x0009 => 1, # \t
762     0x000D => 1, # \r
763     0x000A => 1, # \n
764     0x000C => 1, # \f
765     }->{$self->{c}}) {
766     # stay in the state.
767     $self->{c} = $self->{get_char}->();
768     redo A;
769     } elsif ($self->{c} == -1) {
770 wakaba 1.5 $self->{t}->{type} = {
771 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
772     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
773     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
774     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
775 wakaba 1.5 }->{$self->{t}->{type}};
776 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
777     $self->{c} = $self->{get_char}->();
778 wakaba 1.5 return $self->{t};
779 wakaba 1.3 #redo A;
780     } elsif ($self->{c} == 0x0029) { # )
781     $self->{state} = BEFORE_TOKEN_STATE;
782     $self->{c} = $self->{get_char}->();
783 wakaba 1.5 return $self->{t};
784 wakaba 1.3 #redo A;
785     } elsif ($self->{c} == 0x005C) { # \
786     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
787     $self->{c} = $self->{get_char}->();
788     redo A;
789     } else {
790     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
791 wakaba 1.5 $self->{t}->{type} = {
792 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
793     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
794     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
795     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
796 wakaba 1.5 }->{$self->{t}->{type}};
797 wakaba 1.3 # stay in the state.
798     $self->{c} = $self->{get_char}->();
799     redo A;
800     }
801 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
802 wakaba 1.5 $self->{t}->{has_escape} = 1;
803 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
804     ## NOTE: second character of |unicode| in |escape|.
805     $char = $self->{c} - 0x0030;
806     $self->{state} = ESCAPE_STATE; $i = 2;
807     $self->{c} = $self->{get_char}->();
808     redo A;
809     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
810     ## NOTE: second character of |unicode| in |escape|.
811     $char = $self->{c} - 0x0041 + 0xA;
812     $self->{state} = ESCAPE_STATE; $i = 2;
813     $self->{c} = $self->{get_char}->();
814     redo A;
815 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
816 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
817 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
818 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
819     $self->{c} = $self->{get_char}->();
820     redo A;
821     } elsif ($self->{c} == 0x000A or # \n
822     $self->{c} == 0x000C) { # \f
823     if ($q == 0) {
824 wakaba 1.7 #
825 wakaba 1.3 } elsif ($q == 1) {
826     ## NOTE: In |escape| in |URI|.
827 wakaba 1.5 $self->{t}->{type} = {
828 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
829     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
830     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
831     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
832 wakaba 1.5 }->{$self->{t}->{type}};
833     $self->{t}->{value} .= chr $self->{c};
834 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
835     $self->{c} = $self->{get_char}->();
836     redo A;
837 wakaba 1.1 } else {
838     ## Note: In |nl| in ... in |string| or |ident|.
839 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
840 wakaba 1.1 $self->{state} = STRING_STATE;
841     $self->{c} = $self->{get_char}->();
842     redo A;
843     }
844     } elsif ($self->{c} == 0x000D) { # \r
845     if ($q == 0) {
846 wakaba 1.7 #
847 wakaba 1.3 } elsif ($q == 1) {
848 wakaba 1.7 ## NOTE: In |escape| in |URI|.
849 wakaba 1.5 $self->{t}->{type} = {
850 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
851     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
852     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
853     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
854 wakaba 1.5 }->{$self->{t}->{type}};
855 wakaba 1.8 $self->{t}->{value} .= "\x0D";
856     $self->{state} = ESCAPE_BEFORE_LF_STATE;
857 wakaba 1.3 $self->{c} = $self->{get_char}->();
858     redo A;
859 wakaba 1.1 } else {
860     ## Note: In |nl| in ... in |string| or |ident|.
861 wakaba 1.8 $self->{t}->{value} .= "\x0D";
862 wakaba 1.1 $self->{state} = ESCAPE_BEFORE_LF_STATE;
863     $self->{c} = $self->{get_char}->();
864     redo A;
865     }
866 wakaba 1.7 } elsif ($self->{c} == -1) {
867     #
868 wakaba 1.1 } else {
869     ## NOTE: second character of |escape|.
870 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
871 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
872     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
873 wakaba 1.1 $self->{c} = $self->{get_char}->();
874     redo A;
875     }
876 wakaba 1.7
877     if ($q == 0) {
878 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
879     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
880     $self->{state} = BEFORE_TOKEN_STATE;
881     # reprocess
882     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
883 wakaba 1.13 unshift @{$self->{token}}, {type => MINUS_TOKEN};
884 wakaba 1.10 $self->{t}->{type} = NUMBER_TOKEN;
885     $self->{t}->{value} = '';
886     return $self->{t};
887     #redo A;
888     } elsif (length $self->{t}->{value}) {
889     $self->{state} = BEFORE_TOKEN_STATE;
890     # reprocess
891     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
892     return $self->{t};
893     #redo A;
894     } else {
895     $self->{state} = BEFORE_TOKEN_STATE;
896     # reprocess
897     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
898     $self->{t}->{type} = NUMBER_TOKEN;
899     $self->{t}->{value} = '';
900     return $self->{t};
901     #redo A;
902     }
903 wakaba 1.7 } else {
904 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
905     $self->{state} = BEFORE_TOKEN_STATE;
906     # reprocess
907     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908 wakaba 1.13 return {type => MINUS_TOKEN};
909 wakaba 1.10 #redo A;
910     } elsif (length $self->{t}->{value}) {
911     $self->{state} = BEFORE_TOKEN_STATE;
912     # reprocess
913     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914     return $self->{t};
915     #redo A;
916     } else {
917     $self->{state} = BEFORE_TOKEN_STATE;
918     # reprocess
919     return {type => DELIM_TOKEN, value => '\\'};
920     #redo A;
921     }
922 wakaba 1.7 }
923 wakaba 1.8 } elsif ($q == 1) {
924     $self->{state} = URI_UNQUOTED_STATE;
925 wakaba 1.7 $self->{c} = $self->{get_char}->();
926     redo A;
927 wakaba 1.8 } else {
928     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
929     $self->{t}->{type} = {
930     STRING_TOKEN, INVALID_TOKEN,
931     URI_TOKEN, URI_INVALID_TOKEN,
932     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
933     }->{$self->{t}->{type}} || $self->{t}->{type};
934     $self->{state} = BEFORE_TOKEN_STATE;
935     # reprocess
936     return $self->{t};
937     #redo A;
938 wakaba 1.7 }
939 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
940     ## NOTE: third..seventh character of |unicode| in |escape|.
941     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
942     $char = $char * 0x10 + $self->{c} - 0x0030;
943     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
944     $self->{c} = $self->{get_char}->();
945     redo A;
946     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
947     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
948     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
949     $self->{c} = $self->{get_char}->();
950     redo A;
951 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
952 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
953 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
954     $self->{c} = $self->{get_char}->();
955     redo A;
956     } elsif ($self->{c} == 0x0020 or # SP
957     $self->{c} == 0x000A or # \n
958     $self->{c} == 0x0009 or # \t
959     $self->{c} == 0x000C) { # \f
960 wakaba 1.5 $self->{t}->{value} .= chr $char;
961 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
962     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
963 wakaba 1.1 $self->{c} = $self->{get_char}->();
964     redo A;
965     } elsif ($self->{c} == 0x000D) { # \r
966     $self->{state} = ESCAPE_BEFORE_LF_STATE;
967     $self->{c} = $self->{get_char}->();
968     redo A;
969     } else {
970 wakaba 1.5 $self->{t}->{value} .= chr $char;
971 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
972     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
973 wakaba 1.1 # reconsume
974     redo A;
975     }
976     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
977     ## NOTE: eighth character of |unicode| in |escape|.
978     if ($self->{c} == 0x0020 or # SP
979     $self->{c} == 0x000A or # \n
980     $self->{c} == 0x0009 or # \t
981     $self->{c} == 0x000C) { # \f
982 wakaba 1.5 $self->{t}->{value} .= chr $char;
983 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
984     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
985 wakaba 1.1 $self->{c} = $self->{get_char}->();
986     redo A;
987     } elsif ($self->{c} == 0x000D) { # \r
988     $self->{state} = ESCAPE_BEFORE_NL_STATE;
989     $self->{c} = $self->{get_char}->();
990     redo A;
991     } else {
992 wakaba 1.5 $self->{t}->{value} .= chr $char;
993 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
994     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
995 wakaba 1.1 # reconsume
996     redo A;
997     }
998     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
999     ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
1000     if ($self->{c} == 0x000A) { # \n
1001 wakaba 1.8 $self->{t}->{value} .= chr $self->{c};
1002 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1003     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1004 wakaba 1.1 $self->{c} = $self->{get_char}->();
1005     redo A;
1006     } else {
1007 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
1008     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1009 wakaba 1.8 # reprocess
1010 wakaba 1.1 redo A;
1011     }
1012     } elsif ($self->{state} == STRING_STATE) {
1013     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1014     ## a character in |invalid$Q| in |invalid| in |INVALID|,
1015     ## where |$Q = $q == 0x0022 ? 1 : 2|.
1016 wakaba 1.3 ## Or, in |URI|.
1017 wakaba 1.1 if ($self->{c} == 0x005C) { # \
1018     $self->{state} = ESCAPE_OPEN_STATE;
1019     $self->{c} = $self->{get_char}->();
1020     redo A;
1021     } elsif ($self->{c} == $q) { # " | '
1022 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1023 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1024     $self->{c} = $self->{get_char}->();
1025 wakaba 1.5 return $self->{t};
1026 wakaba 1.3 #redo A;
1027     } else {
1028     $self->{state} = URI_AFTER_WSP_STATE;
1029     $self->{c} = $self->{get_char}->();
1030     redo A;
1031     }
1032 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1033     $self->{c} == 0x000D or # \r
1034     $self->{c} == 0x000C or # \f
1035     $self->{c} == -1) {
1036 wakaba 1.11 $self->{t}->{type} = {
1037     STRING_TOKEN, INVALID_TOKEN,
1038     INVALID_TOKEN, INVALID_TOKEN,
1039     URI_TOKEN, URI_INVALID_TOKEN,
1040     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1041     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1042     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1043     }->{$self->{t}->{type}};
1044 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1045     # reconsume
1046 wakaba 1.5 return $self->{t};
1047 wakaba 1.1 #redo A;
1048     } else {
1049 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1050 wakaba 1.1 # stay in the state
1051     $self->{c} = $self->{get_char}->();
1052     redo A;
1053     }
1054     } elsif ($self->{state} == NUMBER_STATE) {
1055     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1056     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1057 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1058 wakaba 1.1 # stay in the state
1059     $self->{c} = $self->{get_char}->();
1060     redo A;
1061     } elsif ($self->{c} == 0x002E) { # .
1062     $self->{state} = NUMBER_DOT_STATE;
1063     $self->{c} = $self->{get_char}->();
1064     redo A;
1065     } else {
1066 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1067     $self->{t}->{value} = '';
1068 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1069     # reprocess
1070 wakaba 1.2 redo A;
1071 wakaba 1.1 }
1072     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1073     ## NOTE: The character immediately following |.| in |num|.
1074     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1075 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1076 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1077     $self->{c} = $self->{get_char}->();
1078     redo A;
1079     } else {
1080 wakaba 1.13 unshift @{$self->{token}}, {type => DOT_TOKEN};
1081 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1082     $self->{t}->{value} = '';
1083 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1084     # reprocess
1085 wakaba 1.5 return $self->{t};
1086 wakaba 1.1 #redo A;
1087     }
1088     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1089     ## NOTE: The character immediately following |.| at the beginning of |num|.
1090     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1091 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1092 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1093     $self->{c} = $self->{get_char}->();
1094     redo A;
1095     } else {
1096     $self->{state} = BEFORE_TOKEN_STATE;
1097 wakaba 1.9 # reprocess
1098 wakaba 1.13 return {type => DOT_TOKEN};
1099 wakaba 1.1 #redo A;
1100     }
1101     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1102     ## NOTE: |[0-9]| in |num| after |.|.
1103     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1104 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1105 wakaba 1.1 # stay in the state
1106     $self->{c} = $self->{get_char}->();
1107     redo A;
1108     } else {
1109 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1110     $self->{t}->{value} = '';
1111 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1112     # reprocess
1113 wakaba 1.2 redo A;
1114 wakaba 1.1 }
1115     } else {
1116     die "$0: Unknown state |$self->{state}|";
1117     }
1118     } # A
1119     } # get_next_token
1120    
1121     1;
1122 wakaba 1.13 # $Date: 2007/09/08 15:43:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24