/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.11 - (hide annotations) (download)
Sat Sep 8 15:20:41 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.10: +9 -2 lines
++ whatpm/t/ChangeLog	8 Sep 2007 15:19:38 -0000
	* css-token-1.test: |URI| and |INVALID| test
	cases are added.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 15:19:19 -0000
	* Tokenizer.pm: |URI| bugs are fixed.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
## Tokenizer states.
##
## The current state is kept in |$self->{state}|; |init| sets it to
## |BEFORE_TOKEN_STATE| and |get_next_token| dispatches on it.  Each
## constant is an inlinable sub with an empty prototype, so it can be
## used as a bareword (|$self->{state} == NAME_STATE|).
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};

## Token types.
##
## These are the values stored in the |type| member of the token hashes
## returned by |get_next_token|.  The numeric value is also the index of
## the token's printable name in |@TokenName|.
##
## NOTE: The value 15 is not assigned to any token type here.
use constant {
  IDENT_TOKEN              => 1,
  ATKEYWORD_TOKEN          => 2,
  HASH_TOKEN               => 3,
  FUNCTION_TOKEN           => 4,
  URI_TOKEN                => 5,
  URI_INVALID_TOKEN        => 6,
  URI_PREFIX_TOKEN         => 7,
  URI_PREFIX_INVALID_TOKEN => 8,
  STRING_TOKEN             => 9,
  INVALID_TOKEN            => 10,
  NUMBER_TOKEN             => 11,
  DIMENSION_TOKEN          => 12,
  PERCENTAGE_TOKEN         => 13,
  UNICODE_RANGE_TOKEN      => 14,
  DELIM_TOKEN              => 16,
  PLUS_TOKEN               => 17,
  GREATER_TOKEN            => 18,
  COMMA_TOKEN              => 19,
  TILDE_TOKEN              => 20,
  DASHMATCH_TOKEN          => 21,
  PREFIXMATCH_TOKEN        => 22,
  SUFFIXMATCH_TOKEN        => 23,
  SUBSTRINGMATCH_TOKEN     => 24,
  INCLUDES_TOKEN           => 25,
  SEMICOLON_TOKEN          => 26,
  LBRACE_TOKEN             => 27,
  RBRACE_TOKEN             => 28,
  LPAREN_TOKEN             => 29,
  RPAREN_TOKEN             => 30,
  LBRACKET_TOKEN           => 31,
  RBRACKET_TOKEN           => 32,
  S_TOKEN                  => 33,
  CDO_TOKEN                => 34,
  CDC_TOKEN                => 35,
  COMMENT_TOKEN            => 36,
  COMMENT_INVALID_TOKEN    => 37,
  EOF_TOKEN                => 38,
};

## Printable names of the token types, indexed by the numeric value of
## the corresponding |*_TOKEN| constant (|$TokenName[IDENT_TOKEN]| is
## |IDENT|, and so on).
##
## NOTE: Indexes 0 and 15 do not correspond to any |*_TOKEN| constant
## declared alongside this table; they hold the placeholder string |0|
## so that the remaining names stay aligned with their constants.
our @TokenName = (
  '0',                                                      #  0 (placeholder)
  qw(IDENT ATKEYWORD HASH FUNCTION),                        #  1 ..  4
  qw(URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID),        #  5 ..  8
  qw(STRING INVALID),                                       #  9 .. 10
  qw(NUMBER DIMENSION PERCENTAGE UNICODE_RANGE),            # 11 .. 14
  '0',                                                      # 15 (placeholder)
  qw(DELIM PLUS GREATER COMMA TILDE),                       # 16 .. 20
  qw(DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH),     # 21 .. 24
  qw(INCLUDES SEMICOLON),                                   # 25 .. 26
  qw(LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET),        # 27 .. 32
  qw(S CDO CDC COMMENT COMMENT_INVALID EOF),                # 33 .. 38
);

## Constructor.
##
## Takes the class name (or an existing tokenizer object) as its only
## argument and returns a new tokenizer object with:
##
##   token    - An empty queue of pending tokens.  |get_next_token|
##              returns tokens from this queue before reading input.
##   get_char - A default callback that always returns -1, the value
##              |get_next_token| reports as end of input.  Presumably
##              the caller replaces it with a real character source
##              before calling |init|.
##   onerror  - A default callback that does nothing.
sub new ($) {
  my $invocant = shift;
  ## NOTE: |bless| dies if its second argument is a reference, so map an
  ## object invocant (|$obj->new|) to its class first.
  my $class = ref ($invocant) || $invocant;
  return bless {
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  }, $class;
} # new

## Prepares the tokenizer for reading a (new) input.
##
## Empties the queue of pending tokens, puts the state machine into
## |BEFORE_TOKEN_STATE| and reads the first character of the input into
## |$self->{c}| by calling the |get_char| callback once.  The callback
## must therefore be set before |init| is called.
##
## As the last statement is the assignment to |$self->{c}|, the method
## returns the first character read.
sub init ($) {
  my $self = shift;
  ## NOTE: |get_next_token| returns queued tokens before it looks at the
  ## input, so tokens left over from an earlier input must not survive
  ## re-initialization.  The array is emptied in place so that the same
  ## array reference stays in |$self->{token}|.
  @{$self->{token}} = ();
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $self->{get_char}->();
  ## NOTE: Shape of the token under construction, which is kept in
  ## |$self->{t}| by |get_next_token|:
  #$self->{t} = {type => token-type, value => value, number => number};
} # init

85     sub get_next_token ($) {
86     my $self = shift;
87     if (@{$self->{token}}) {
88     return shift @{$self->{token}};
89     }
90    
91     my $char;
92     my $num; # |{num}|, if any.
93     my $i; # |$i + 1|th character in |unicode| in |escape|.
94 wakaba 1.3 my $q;
95     ## NOTE:
96     ## 0: in |ident|.
97     ## 1: in |URI| outside of |string|.
98     ## 0x0022: in |string1| or |invalid1|.
99     ## 0x0027: in |string2| or |invalid2|.
100 wakaba 1.1
101     A: {
102     if ($self->{state} == BEFORE_TOKEN_STATE) {
103     if ($self->{c} == 0x002D) { # -
104     ## NOTE: |-| in |ident| in |IDENT|
105 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
106 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
107     $self->{c} = $self->{get_char}->();
108     redo A;
109 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
110     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
111     $self->{c} = $self->{get_char}->();
112     if ($self->{c} == 0x002B) { # +
113     $self->{c} = $self->{get_char}->();
114     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
115     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
116     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
117     $self->{c} == 0x003F) { # ?
118     $self->{t}->{value} .= '+' . chr $self->{c};
119     $self->{t}->{type} = UNICODE_RANGE_TOKEN;
120     $self->{c} = $self->{get_char}->();
121     C: for (2..6) {
122     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
123     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
124     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
125     $self->{c} == 0x003F) { # ?
126     $self->{t}->{value} .= chr $self->{c};
127     $self->{c} = $self->{get_char}->();
128     } else {
129     last C;
130     }
131     } # C
132    
133     if ($self->{c} == 0x002D) { # -
134     $self->{c} = $self->{get_char}->();
135     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
136     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
137     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
138     $self->{t}->{value} .= '-' . chr $self->{c};
139     $self->{c} = $self->{get_char}->();
140     C: for (2..6) {
141     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
142     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
143     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
144     $self->{t}->{value} .= chr $self->{c};
145     $self->{c} = $self->{get_char}->();
146     } else {
147     last C;
148     }
149     } # C
150    
151     #
152     } else {
153     my $token = $self->{t};
154     $self->{t} = {type => IDENT_TOKEN, value => '-'};
155     $self->{state} = BEFORE_NMSTART_STATE;
156     # reprocess
157     return $token;
158     #redo A;
159     }
160     }
161    
162     $self->{state} = BEFORE_TOKEN_STATE;
163     # reprocess
164     return $self->{t};
165     #redo A;
166     } else {
167     unshift @{$self->{token}}, {type => PLUS_TOKEN};
168     $self->{state} = BEFORE_TOKEN_STATE;
169     # reprocess
170     return $self->{t};
171     #redo A;
172     }
173     } else {
174     $self->{state} = NAME_STATE;
175     # reprocess
176     redo A;
177     }
178 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
179     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
180 wakaba 1.1 $self->{c} == 0x005F or # _
181     $self->{c} > 0x007F) { # nonascii
182     ## NOTE: |nmstart| in |ident| in |IDENT|
183 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
184 wakaba 1.1 $self->{state} = NAME_STATE;
185     $self->{c} = $self->{get_char}->();
186     redo A;
187     } elsif ($self->{c} == 0x005C) { # \
188     ## NOTE: |nmstart| in |ident| in |IDENT|
189 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
190 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
191     $self->{c} = $self->{get_char}->();
192     redo A;
193     } elsif ($self->{c} == 0x0040) { # @
194     ## NOTE: |@| in |ATKEYWORD|
195 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
196 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
197 wakaba 1.1 $self->{c} = $self->{get_char}->();
198     redo A;
199 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
200 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
201 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
202 wakaba 1.1 $self->{c} = $self->{get_char}->();
203     redo A;
204     } elsif ($self->{c} == 0x0023) { # #
205     ## NOTE: |#| in |HASH|.
206 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
207 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
208     $self->{c} = $self->{get_char}->();
209     redo A;
210     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
211     ## NOTE: |num|.
212 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
213 wakaba 1.1 $self->{state} = NUMBER_STATE;
214     $self->{c} = $self->{get_char}->();
215     redo A;
216     } elsif ($self->{c} == 0x002E) { # .
217     ## NOTE: |num|.
218 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
219 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
220     $self->{c} = $self->{get_char}->();
221     redo A;
222 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
223     $self->{c} = $self->{get_char}->();
224     if ($self->{c} == 0x002A) { # *
225     C: {
226     $self->{c} = $self->{get_char}->();
227     if ($self->{c} == 0x002A) { # *
228     D: {
229     $self->{c} = $self->{get_char}->();
230     if ($self->{c} == 0x002F) { # /
231     #
232     } elsif ($self->{c} == 0x002A) { # *
233     redo D;
234     } else {
235     redo C;
236     }
237     } # D
238     } elsif ($self->{c} == -1) {
239     # stay in the state
240     # reprocess
241     return {type => COMMENT_INVALID_TOKEN};
242     #redo A;
243     } else {
244     redo C;
245     }
246     } # C
247    
248     # stay in the state.
249     $self->{c} = $self->{get_char}->();
250     redo A;
251     } else {
252     # stay in the state.
253     # reprocess
254 wakaba 1.9 return {type => DELIM_TOKEN, value => '/'};
255 wakaba 1.4 #redo A;
256     }
257 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
258     ## NOTE: |CDO|
259     $self->{c} = $self->{get_char}->();
260     if ($self->{c} == 0x0021) { # !
261     $self->{c} = $self->{get_char}->();
262 wakaba 1.9 if ($self->{c} == 0x002D) { # -
263 wakaba 1.1 $self->{c} = $self->{get_char}->();
264 wakaba 1.9 if ($self->{c} == 0x002D) { # -
265 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
266     $self->{c} = $self->{get_char}->();
267     return {type => CDO_TOKEN};
268     #redo A;
269     } else {
270     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
271     ## NOTE: |-| in |ident| in |IDENT|
272 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
273 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
274     #reprocess
275     return {type => DELIM_TOKEN, value => '<'};
276     #redo A;
277     }
278     } else {
279     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
280     $self->{state} = BEFORE_TOKEN_STATE;
281     #reprocess
282     return {type => DELIM_TOKEN, value => '<'};
283     #redo A;
284     }
285     } else {
286     $self->{state} = BEFORE_TOKEN_STATE;
287     #reprocess
288     return {type => DELIM_TOKEN, value => '<'};
289     #redo A;
290     }
291 wakaba 1.2 } elsif (my $t = {
292     0x003B => SEMICOLON_TOKEN, # ;
293     0x007B => LBRACE_TOKEN, # {
294     0x007D => RBRACE_TOKEN, # }
295     0x0028 => LPAREN_TOKEN, # (
296     0x0029 => RPAREN_TOKEN, # )
297     0x005B => LBRACKET_TOKEN, # [
298     0x005D => RBRACKET_TOKEN, # ]
299 wakaba 1.1 }->{$self->{c}}) {
300     # stay in the state
301     $self->{c} = $self->{get_char}->();
302 wakaba 1.2 return {type => $t};
303 wakaba 1.1 # redo A;
304     } elsif ({
305     0x0020 => 1, # SP
306     0x0009 => 1, # \t
307     0x000D => 1, # \r
308     0x000A => 1, # \n
309     0x000C => 1, # \f
310     }->{$self->{c}}) {
311     W: {
312     $self->{c} = $self->{get_char}->();
313     if ({
314     0x0020 => 1, # SP
315     0x0009 => 1, # \t
316     0x000D => 1, # \r
317     0x000A => 1, # \n
318     0x000C => 1, # \f
319     }->{$self->{c}}) {
320     redo W;
321     } elsif (my $v = {
322     0x002B => PLUS_TOKEN, # +
323     0x003E => GREATER_TOKEN, # >
324     0x002C => COMMA_TOKEN, # ,
325     0x007E => TILDE_TOKEN, # ~
326     }->{$self->{c}}) {
327     # stay in the state
328     $self->{c} = $self->{get_char}->();
329     return {type => $v};
330     #redo A;
331     } else {
332     # stay in the state
333     # reprocess
334     return {type => S_TOKEN};
335     #redo A;
336     }
337     } # W
338     } elsif (my $v = {
339     0x007C => DASHMATCH_TOKEN, # |
340     0x005E => PREFIXMATCH_TOKEN, # ^
341     0x0024 => SUFFIXMATCH_TOKEN, # $
342     0x002A => SUBSTRINGMATCH_TOKEN, # *
343     }->{$self->{c}}) {
344 wakaba 1.2 my $c = $self->{c};
345 wakaba 1.1 $self->{c} = $self->{get_char}->();
346     if ($self->{c} == 0x003D) { # =
347     # stay in the state
348     $self->{c} = $self->{get_char}->();
349     return {type => $v};
350     #redo A;
351     } else {
352     # stay in the state
353     # reprocess
354 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
355 wakaba 1.1 #redo A;
356     }
357     } elsif ($self->{c} == 0x002B) { # +
358     # stay in the state
359     $self->{c} = $self->{get_char}->();
360     return {type => PLUS_TOKEN};
361     #redo A;
362     } elsif ($self->{c} == 0x003E) { # >
363     # stay in the state
364     $self->{c} = $self->{get_char}->();
365     return {type => GREATER_TOKEN};
366     #redo A;
367     } elsif ($self->{c} == 0x002C) { # ,
368     # stay in the state
369     $self->{c} = $self->{get_char}->();
370     return {type => COMMA_TOKEN};
371     #redo A;
372     } elsif ($self->{c} == 0x007E) { # ~
373     $self->{c} = $self->{get_char}->();
374     if ($self->{c} == 0x003D) { # =
375     # stay in the state
376     $self->{c} = $self->{get_char}->();
377     return {type => INCLUDES_TOKEN};
378     #redo A;
379     } else {
380     # stay in the state
381     # reprocess
382     return {type => TILDE_TOKEN};
383     #redo A;
384     }
385     } elsif ($self->{c} == -1) {
386     # stay in the state
387     $self->{c} = $self->{get_char}->();
388     return {type => EOF_TOKEN};
389     #redo A;
390     } else {
391     # stay in the state
392 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
393 wakaba 1.1 $self->{c} = $self->{get_char}->();
394 wakaba 1.5 return $self->{t};
395 wakaba 1.1 #redo A;
396     }
397     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
398 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
399     ## |FUNCTION|)
400 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
401     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
402 wakaba 1.1 $self->{c} == 0x005F or # _
403     $self->{c} > 0x007F) { # nonascii
404 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
405     $self->{t}->{type} = DIMENSION_TOKEN
406     if $self->{t}->{type} == NUMBER_TOKEN;
407 wakaba 1.1 $self->{state} = NAME_STATE;
408     $self->{c} = $self->{get_char}->();
409     redo A;
410     } elsif ($self->{c} == 0x005C) { # \
411     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
412     $self->{c} = $self->{get_char}->();
413     redo A;
414 wakaba 1.10 } elsif ($self->{c} == 0x002D) { # -
415     if ($self->{t}->{type} == IDENT_TOKEN) {
416     $self->{c} = $self->{get_char}->();
417     if ($self->{c} == 0x003E) { # >
418     $self->{state} = BEFORE_TOKEN_STATE;
419     $self->{c} = $self->{get_char}->();
420     return {type => CDC_TOKEN};
421     #redo A;
422     } else {
423     ## NOTE: |-|, |-|, $self->{c}
424     #$self->{t} = {type => IDENT_TOKEN, value => '-'};
425     # stay in the state
426     # reconsume
427     return {type => DELIM_TOKEN, value => '-'};
428     #redo A;
429     }
430     } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
431 wakaba 1.1 $self->{c} = $self->{get_char}->();
432 wakaba 1.10 if ($self->{c} == 0x003E) { # >
433     unshift @{$self->{token}}, {type => CDC_TOKEN};
434     $self->{t}->{type} = NUMBER_TOKEN;
435     $self->{t}->{value} = '';
436     $self->{state} = BEFORE_TOKEN_STATE;
437     $self->{c} = $self->{get_char}->();
438     return $self->{t};
439     #redo A;
440     } else {
441     ## NOTE: |-|, |-|, $self->{c}
442     my $t = $self->{t};
443     $t->{type} = NUMBER_TOKEN;
444     $t->{value} = '';
445     $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
446     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
447     # stay in the state
448     # reconsume
449     return $t;
450     #redo A;
451     }
452 wakaba 1.1 } else {
453 wakaba 1.10 #
454 wakaba 1.1 }
455     } else {
456 wakaba 1.10 #
457     }
458    
459     if ($self->{t}->{type} == DIMENSION_TOKEN) {
460     ## NOTE: |-| after |NUMBER|.
461     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
462     $self->{state} = BEFORE_TOKEN_STATE;
463     # reprocess
464     $self->{t}->{type} = NUMBER_TOKEN;
465     $self->{t}->{value} = '';
466     return $self->{t};
467     } else {
468     ## NOTE: |-| not followed by |nmstart|.
469     $self->{state} = BEFORE_TOKEN_STATE;
470     # reprocess
471     return {type => DELIM_TOKEN, value => '-'};
472 wakaba 1.1 }
473 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
474     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
475     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
476     $self->{c} == 0x005F or # _
477     $self->{c} > 0x007F) { # nonascii
478 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
479 wakaba 1.3 $self->{state} = NAME_STATE;
480     $self->{c} = $self->{get_char}->();
481     redo A;
482     } elsif ($self->{c} == 0x002D) { # -
483 wakaba 1.5 $self->{t}->{value} .= '-';
484 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
485     $self->{c} = $self->{get_char}->();
486     redo A;
487     } elsif ($self->{c} == 0x005C) { # \
488     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
489     $self->{c} = $self->{get_char}->();
490     redo A;
491     } else {
492     $self->{state} = BEFORE_TOKEN_STATE;
493     # reprocess
494     return {type => DELIM_TOKEN, value => '@'};
495     }
496     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
497     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
498     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
499     $self->{c} == 0x005F or # _
500     $self->{c} > 0x007F) { # nonascii
501 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
502 wakaba 1.3 $self->{state} = NAME_STATE;
503     $self->{c} = $self->{get_char}->();
504     redo A;
505     } elsif ($self->{c} == 0x002D) { # -
506     $self->{c} = $self->{get_char}->();
507     if ($self->{c} == 0x003E) { # >
508 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
509 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
510     $self->{c} = $self->{get_char}->();
511 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
512 wakaba 1.3 #redo A;
513     } else {
514     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
515 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
516 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
517     # reprocess
518     return {type => DELIM_TOKEN, value => '@'};
519     #redo A;
520     }
521     } elsif ($self->{c} == 0x005C) { # \
522     ## TODO: @-\{nl}
523     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
524     $self->{c} = $self->{get_char}->();
525     redo A;
526     } else {
527     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
528     $self->{state} = BEFORE_TOKEN_STATE;
529     # reprocess
530     return {type => DELIM_TOKEN, value => '@'};
531     }
532 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
533     if ($self->{c} == 0x002D) { # -
534     ## NOTE: |-| in |ident|.
535 wakaba 1.10 $self->{t}->{hyphen} = 1;
536 wakaba 1.5 $self->{t}->{value} = '-';
537 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
538 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
539     $self->{c} = $self->{get_char}->();
540     redo A;
541 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
542     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
543 wakaba 1.1 $self->{c} == 0x005F or # _
544     $self->{c} > 0x007F) { # nonascii
545     ## NOTE: |nmstart| in |ident|.
546 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
547     $self->{t}->{type} = DIMENSION_TOKEN;
548 wakaba 1.1 $self->{state} = NAME_STATE;
549     $self->{c} = $self->{get_char}->();
550     redo A;
551     } elsif ($self->{c} == 0x005C) { # \
552     ## NOTE: |nmstart| in |ident| in |IDENT|
553 wakaba 1.5 $self->{t}->{value} = '';
554 wakaba 1.10 $self->{t}->{type} = DIMENSION_TOKEN;
555 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
556     $self->{c} = $self->{get_char}->();
557     redo A;
558     } elsif ($self->{c} == 0x0025) { # %
559 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
560 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
561     $self->{c} = $self->{get_char}->();
562 wakaba 1.5 return $self->{t};
563 wakaba 1.1 #redo A;
564     } else {
565     $self->{state} = BEFORE_TOKEN_STATE;
566     # reprocess
567 wakaba 1.5 return $self->{t};
568 wakaba 1.1 #redo A;
569     }
570     } elsif ($self->{state} == HASH_OPEN_STATE) {
571     ## NOTE: The first |nmchar| in |name| in |HASH|.
572 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
573     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
574     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
575 wakaba 1.1 $self->{c} == 0x002D or # -
576     $self->{c} == 0x005F or # _
577     $self->{c} > 0x007F) { # nonascii
578 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
579 wakaba 1.1 $self->{state} = NAME_STATE;
580     $self->{c} = $self->{get_char}->();
581     redo A;
582     } elsif ($self->{c} == 0x005C) { # \
583     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
584     $self->{c} = $self->{get_char}->();
585     redo A;
586     } else {
587     $self->{state} = BEFORE_TOKEN_STATE;
588 wakaba 1.9 # reprocess
589 wakaba 1.1 return {type => DELIM_TOKEN, value => '#'};
590     #redo A;
591     }
592     } elsif ($self->{state} == NAME_STATE) {
593     ## NOTE: |nmchar| in (|ident| or |name|).
594 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
595     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
596     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
597 wakaba 1.1 $self->{c} == 0x005F or # _
598     $self->{c} == 0x002D or # -
599     $self->{c} > 0x007F) { # nonascii
600 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
601 wakaba 1.1 # stay in the state
602     $self->{c} = $self->{get_char}->();
603     redo A;
604     } elsif ($self->{c} == 0x005C) { # \
605 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
606 wakaba 1.1 $self->{c} = $self->{get_char}->();
607     redo A;
608     } elsif ($self->{c} == 0x0028 and # (
609 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
610     my $func_name = $self->{t}->{value};
611 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
612     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
613 wakaba 1.5 if ($self->{t}->{has_escape}) {
614 wakaba 1.3 ## TODO: warn
615     }
616 wakaba 1.5 $self->{t}->{type}
617 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
618 wakaba 1.5 $self->{t}->{value} = '';
619 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
620     $self->{c} = $self->{get_char}->();
621     redo A;
622     } else {
623 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
624 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
625     $self->{c} = $self->{get_char}->();
626 wakaba 1.5 return $self->{t};
627 wakaba 1.1 #redo A;
628     }
629     } else {
630     $self->{state} = BEFORE_TOKEN_STATE;
631     # reconsume
632 wakaba 1.5 return $self->{t};
633 wakaba 1.1 #redo A;
634     }
635 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
636     while ({
637     0x0020 => 1, # SP
638     0x0009 => 1, # \t
639     0x000D => 1, # \r
640     0x000A => 1, # \n
641     0x000C => 1, # \f
642     }->{$self->{c}}) {
643     $self->{c} = $self->{get_char}->();
644     }
645     if ($self->{c} == -1) {
646 wakaba 1.5 $self->{t}->{type} = {
647 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
648     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
649     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
650     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
651 wakaba 1.5 }->{$self->{t}->{type}};
652 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
653     $self->{c} = $self->{get_char}->();
654 wakaba 1.5 return $self->{t};
655 wakaba 1.3 #redo A;
656     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
657     ## TODO: Should we consider matches of "(" and ")"?
658 wakaba 1.5 $self->{t}->{type} = {
659 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
660     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
661     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
662     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
663 wakaba 1.5 }->{$self->{t}->{type}};
664 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
665     $self->{c} = $self->{get_char}->();
666     redo A;
667     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
668     $self->{state} = STRING_STATE; $q = $self->{c};
669     $self->{c} = $self->{get_char}->();
670     redo A;
671     } elsif ($self->{c} == 0x0029) { # )
672     $self->{state} = BEFORE_TOKEN_STATE;
673     $self->{c} = $self->{get_char}->();
674 wakaba 1.5 return $self->{t};
675 wakaba 1.3 #redo A;
676     } elsif ($self->{c} == 0x005C) { # \
677     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
678     $self->{c} = $self->{get_char}->();
679     redo A;
680     } else {
681 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
682 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
683     $self->{c} = $self->{get_char}->();
684     redo A;
685     }
686     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
687     if ({
688     0x0020 => 1, # SP
689     0x0009 => 1, # \t
690     0x000D => 1, # \r
691     0x000A => 1, # \n
692     0x000C => 1, # \f
693     }->{$self->{c}}) {
694     $self->{state} = URI_AFTER_WSP_STATE;
695     $self->{c} = $self->{get_char}->();
696     redo A;
697     } elsif ($self->{c} == -1) {
698 wakaba 1.5 $self->{t}->{type} = {
699 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
700     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
701     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
702     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
703 wakaba 1.5 }->{$self->{t}->{type}};
704 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
705     $self->{c} = $self->{get_char}->();
706 wakaba 1.5 return $self->{t};
707 wakaba 1.3 #redo A;
708     } elsif ($self->{c} < 0x0020 or {
709     0x0022 => 1, # "
710     0x0027 => 1, # '
711     0x0028 => 1, # (
712     }->{$self->{c}}) { # C0 or (
713     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
714 wakaba 1.5 $self->{t}->{type} = {
715 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
716     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
717     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
718     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
719 wakaba 1.5 }->{$self->{t}->{type}};
720 wakaba 1.3 # stay in the state.
721     $self->{c} = $self->{get_char}->();
722     redo A;
723     } elsif ($self->{c} == 0x0029) { # )
724     $self->{state} = BEFORE_TOKEN_STATE;
725     $self->{c} = $self->{get_char}->();
726 wakaba 1.5 return $self->{t};
727 wakaba 1.3 #redo A;
728     } elsif ($self->{c} == 0x005C) { # \
729     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
730     $self->{c} = $self->{get_char}->();
731     redo A;
732     } else {
733 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
734 wakaba 1.3 # stay in the state.
735     $self->{c} = $self->{get_char}->();
736     redo A;
737     }
738     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
739     if ({
740     0x0020 => 1, # SP
741     0x0009 => 1, # \t
742     0x000D => 1, # \r
743     0x000A => 1, # \n
744     0x000C => 1, # \f
745     }->{$self->{c}}) {
746     # stay in the state.
747     $self->{c} = $self->{get_char}->();
748     redo A;
749     } elsif ($self->{c} == -1) {
750 wakaba 1.5 $self->{t}->{type} = {
751 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
752     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
753     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
754     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
755 wakaba 1.5 }->{$self->{t}->{type}};
756 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
757     $self->{c} = $self->{get_char}->();
758 wakaba 1.5 return $self->{t};
759 wakaba 1.3 #redo A;
760     } elsif ($self->{c} == 0x0029) { # )
761     $self->{state} = BEFORE_TOKEN_STATE;
762     $self->{c} = $self->{get_char}->();
763 wakaba 1.5 return $self->{t};
764 wakaba 1.3 #redo A;
765     } elsif ($self->{c} == 0x005C) { # \
766     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
767     $self->{c} = $self->{get_char}->();
768     redo A;
769     } else {
770     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
771 wakaba 1.5 $self->{t}->{type} = {
772 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
773     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
774     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
775     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
776 wakaba 1.5 }->{$self->{t}->{type}};
777 wakaba 1.3 # stay in the state.
778     $self->{c} = $self->{get_char}->();
779     redo A;
780     }
781 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
782 wakaba 1.5 $self->{t}->{has_escape} = 1;
783 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
784     ## NOTE: second character of |unicode| in |escape|.
785     $char = $self->{c} - 0x0030;
786     $self->{state} = ESCAPE_STATE; $i = 2;
787     $self->{c} = $self->{get_char}->();
788     redo A;
789     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
790     ## NOTE: second character of |unicode| in |escape|.
791     $char = $self->{c} - 0x0041 + 0xA;
792     $self->{state} = ESCAPE_STATE; $i = 2;
793     $self->{c} = $self->{get_char}->();
794     redo A;
795 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
796 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
797 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
798 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
799     $self->{c} = $self->{get_char}->();
800     redo A;
801     } elsif ($self->{c} == 0x000A or # \n
802     $self->{c} == 0x000C) { # \f
803     if ($q == 0) {
804 wakaba 1.7 #
805 wakaba 1.3 } elsif ($q == 1) {
806     ## NOTE: In |escape| in |URI|.
807 wakaba 1.5 $self->{t}->{type} = {
808 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
809     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
810     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
811     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
812 wakaba 1.5 }->{$self->{t}->{type}};
813     $self->{t}->{value} .= chr $self->{c};
814 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
815     $self->{c} = $self->{get_char}->();
816     redo A;
817 wakaba 1.1 } else {
818     ## Note: In |nl| in ... in |string| or |ident|.
819 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
820 wakaba 1.1 $self->{state} = STRING_STATE;
821     $self->{c} = $self->{get_char}->();
822     redo A;
823     }
824     } elsif ($self->{c} == 0x000D) { # \r
825     if ($q == 0) {
826 wakaba 1.7 #
827 wakaba 1.3 } elsif ($q == 1) {
828 wakaba 1.7 ## NOTE: In |escape| in |URI|.
829 wakaba 1.5 $self->{t}->{type} = {
830 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
831     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
832     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
833     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
834 wakaba 1.5 }->{$self->{t}->{type}};
835 wakaba 1.8 $self->{t}->{value} .= "\x0D";
836     $self->{state} = ESCAPE_BEFORE_LF_STATE;
837 wakaba 1.3 $self->{c} = $self->{get_char}->();
838     redo A;
839 wakaba 1.1 } else {
840     ## Note: In |nl| in ... in |string| or |ident|.
841 wakaba 1.8 $self->{t}->{value} .= "\x0D";
842 wakaba 1.1 $self->{state} = ESCAPE_BEFORE_LF_STATE;
843     $self->{c} = $self->{get_char}->();
844     redo A;
845     }
846 wakaba 1.7 } elsif ($self->{c} == -1) {
847     #
848 wakaba 1.1 } else {
849     ## NOTE: second character of |escape|.
850 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
851 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
852     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
853 wakaba 1.1 $self->{c} = $self->{get_char}->();
854     redo A;
855     }
856 wakaba 1.7
857     if ($q == 0) {
858 wakaba 1.10 if ($self->{t}->{type} == DIMENSION_TOKEN) {
859     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
860     $self->{state} = BEFORE_TOKEN_STATE;
861     # reprocess
862     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
863     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
864     $self->{t}->{type} = NUMBER_TOKEN;
865     $self->{t}->{value} = '';
866     return $self->{t};
867     #redo A;
868     } elsif (length $self->{t}->{value}) {
869     $self->{state} = BEFORE_TOKEN_STATE;
870     # reprocess
871     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
872     return $self->{t};
873     #redo A;
874     } else {
875     $self->{state} = BEFORE_TOKEN_STATE;
876     # reprocess
877     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
878     $self->{t}->{type} = NUMBER_TOKEN;
879     $self->{t}->{value} = '';
880     return $self->{t};
881     #redo A;
882     }
883 wakaba 1.7 } else {
884 wakaba 1.10 if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
885     $self->{state} = BEFORE_TOKEN_STATE;
886     # reprocess
887     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
888     return {type => DELIM_TOKEN, value => '-'};
889     #redo A;
890     } elsif (length $self->{t}->{value}) {
891     $self->{state} = BEFORE_TOKEN_STATE;
892     # reprocess
893     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
894     return $self->{t};
895     #redo A;
896     } else {
897     $self->{state} = BEFORE_TOKEN_STATE;
898     # reprocess
899     return {type => DELIM_TOKEN, value => '\\'};
900     #redo A;
901     }
902 wakaba 1.7 }
903 wakaba 1.8 } elsif ($q == 1) {
904     $self->{state} = URI_UNQUOTED_STATE;
905 wakaba 1.7 $self->{c} = $self->{get_char}->();
906     redo A;
907 wakaba 1.8 } else {
908     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
909     $self->{t}->{type} = {
910     STRING_TOKEN, INVALID_TOKEN,
911     URI_TOKEN, URI_INVALID_TOKEN,
912     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
913     }->{$self->{t}->{type}} || $self->{t}->{type};
914     $self->{state} = BEFORE_TOKEN_STATE;
915     # reprocess
916     return $self->{t};
917     #redo A;
918 wakaba 1.7 }
919 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
920     ## NOTE: third..seventh character of |unicode| in |escape|.
921     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
922     $char = $char * 0x10 + $self->{c} - 0x0030;
923     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
924     $self->{c} = $self->{get_char}->();
925     redo A;
926     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
927     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
928     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
929     $self->{c} = $self->{get_char}->();
930     redo A;
931 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
932 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
933 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
934     $self->{c} = $self->{get_char}->();
935     redo A;
936     } elsif ($self->{c} == 0x0020 or # SP
937     $self->{c} == 0x000A or # \n
938     $self->{c} == 0x0009 or # \t
939     $self->{c} == 0x000C) { # \f
940 wakaba 1.5 $self->{t}->{value} .= chr $char;
941 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
942     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
943 wakaba 1.1 $self->{c} = $self->{get_char}->();
944     redo A;
945     } elsif ($self->{c} == 0x000D) { # \r
946     $self->{state} = ESCAPE_BEFORE_LF_STATE;
947     $self->{c} = $self->{get_char}->();
948     redo A;
949     } else {
950 wakaba 1.5 $self->{t}->{value} .= chr $char;
951 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
952     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
953 wakaba 1.1 # reconsume
954     redo A;
955     }
956     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
957     ## NOTE: eighth character of |unicode| in |escape|.
958     if ($self->{c} == 0x0020 or # SP
959     $self->{c} == 0x000A or # \n
960     $self->{c} == 0x0009 or # \t
961     $self->{c} == 0x000C) { # \f
962 wakaba 1.5 $self->{t}->{value} .= chr $char;
963 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
964     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
965 wakaba 1.1 $self->{c} = $self->{get_char}->();
966     redo A;
967     } elsif ($self->{c} == 0x000D) { # \r
968     $self->{state} = ESCAPE_BEFORE_NL_STATE;
969     $self->{c} = $self->{get_char}->();
970     redo A;
971     } else {
972 wakaba 1.5 $self->{t}->{value} .= chr $char;
973 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
974     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
975 wakaba 1.1 # reconsume
976     redo A;
977     }
978     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
979     ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
980     if ($self->{c} == 0x000A) { # \n
981 wakaba 1.8 $self->{t}->{value} .= chr $self->{c};
982 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
983     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
984 wakaba 1.1 $self->{c} = $self->{get_char}->();
985     redo A;
986     } else {
987 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
988     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989 wakaba 1.8 # reprocess
990 wakaba 1.1 redo A;
991     }
992     } elsif ($self->{state} == STRING_STATE) {
993     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
994     ## a character in |invalid$Q| in |invalid| in |INVALID|,
995     ## where |$Q = $q == 0x0022 ? 1 : 2|.
996 wakaba 1.3 ## Or, in |URI|.
997 wakaba 1.1 if ($self->{c} == 0x005C) { # \
998     $self->{state} = ESCAPE_OPEN_STATE;
999     $self->{c} = $self->{get_char}->();
1000     redo A;
1001     } elsif ($self->{c} == $q) { # " | '
1002 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
1003 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
1004     $self->{c} = $self->{get_char}->();
1005 wakaba 1.5 return $self->{t};
1006 wakaba 1.3 #redo A;
1007     } else {
1008     $self->{state} = URI_AFTER_WSP_STATE;
1009     $self->{c} = $self->{get_char}->();
1010     redo A;
1011     }
1012 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
1013     $self->{c} == 0x000D or # \r
1014     $self->{c} == 0x000C or # \f
1015     $self->{c} == -1) {
1016 wakaba 1.11 $self->{t}->{type} = {
1017     STRING_TOKEN, INVALID_TOKEN,
1018     INVALID_TOKEN, INVALID_TOKEN,
1019     URI_TOKEN, URI_INVALID_TOKEN,
1020     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1021     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1022     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1023     }->{$self->{t}->{type}};
1024 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1025     # reconsume
1026 wakaba 1.5 return $self->{t};
1027 wakaba 1.1 #redo A;
1028     } else {
1029 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1030 wakaba 1.1 # stay in the state
1031     $self->{c} = $self->{get_char}->();
1032     redo A;
1033     }
1034     } elsif ($self->{state} == NUMBER_STATE) {
1035     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1036     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1037 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1038 wakaba 1.1 # stay in the state
1039     $self->{c} = $self->{get_char}->();
1040     redo A;
1041     } elsif ($self->{c} == 0x002E) { # .
1042     $self->{state} = NUMBER_DOT_STATE;
1043     $self->{c} = $self->{get_char}->();
1044     redo A;
1045     } else {
1046 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1047     $self->{t}->{value} = '';
1048 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1049     # reprocess
1050 wakaba 1.2 redo A;
1051 wakaba 1.1 }
1052     } elsif ($self->{state} == NUMBER_DOT_STATE) {
1053     ## NOTE: The character immediately following |.| in |num|.
1054     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1055 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1056 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1057     $self->{c} = $self->{get_char}->();
1058     redo A;
1059     } else {
1060 wakaba 1.9 unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
1061 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1062     $self->{t}->{value} = '';
1063 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
1064     # reprocess
1065 wakaba 1.5 return $self->{t};
1066 wakaba 1.1 #redo A;
1067     }
1068     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1069     ## NOTE: The character immediately following |.| at the beginning of |num|.
1070     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1071 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
1072 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
1073     $self->{c} = $self->{get_char}->();
1074     redo A;
1075     } else {
1076     $self->{state} = BEFORE_TOKEN_STATE;
1077 wakaba 1.9 # reprocess
1078 wakaba 1.1 return {type => DELIM_TOKEN, value => '.'};
1079     #redo A;
1080     }
1081     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1082     ## NOTE: |[0-9]| in |num| after |.|.
1083     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1084 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1085 wakaba 1.1 # stay in the state
1086     $self->{c} = $self->{get_char}->();
1087     redo A;
1088     } else {
1089 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1090     $self->{t}->{value} = '';
1091 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1092     # reprocess
1093 wakaba 1.2 redo A;
1094 wakaba 1.1 }
1095     } else {
1096     die "$0: Unknown state |$self->{state}|";
1097     }
1098     } # A
1099     } # get_next_token
1100    
1101     1;
1102 wakaba 1.11 # $Date: 2007/09/08 13:43:58 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24