/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.7 - (hide annotations) (download)
Sat Sep 8 10:21:04 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.6: +32 -18 lines
++ whatpm/t/ChangeLog	8 Sep 2007 10:20:46 -0000
	* css-token-1.test: New tests are added.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 10:20:33 -0000
	* Tokenizer.pm: Bug fixes.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
4 wakaba 1.2 sub BEFORE_TOKEN_STATE () { 0 }
5     sub BEFORE_NMSTART_STATE () { 1 }
6     sub NAME_STATE () { 2 }
7     sub ESCAPE_OPEN_STATE () { 3 }
8     sub STRING_STATE () { 4 }
9     sub HASH_OPEN_STATE () { 5 }
10     sub NUMBER_STATE () { 6 }
11     sub NUMBER_FRACTION_STATE () { 7 }
12     sub AFTER_NUMBER_STATE () { 8 }
13     sub URI_BEFORE_WSP_STATE () { 9 }
14     sub ESCAPE_STATE () { 10 }
15     sub ESCAPE_BEFORE_LF_STATE () { 11 }
16     sub ESCAPE_BEFORE_NL_STATE () { 12 }
17     sub NUMBER_DOT_STATE () { 13 }
18     sub NUMBER_DOT_NUMBER_STATE () { 14 }
19     sub DELIM_STATE () { 15 }
20 wakaba 1.3 sub URI_UNQUOTED_STATE () { 16 }
21     sub URI_AFTER_WSP_STATE () { 17 }
22     sub AFTER_AT_STATE () { 18 }
23     sub AFTER_AT_HYPHEN_STATE () { 19 }
24 wakaba 1.2
25     sub IDENT_TOKEN () { 1 }
26     sub ATKEYWORD_TOKEN () { 2 }
27     sub HASH_TOKEN () { 3 }
28     sub FUNCTION_TOKEN () { 4 }
29     sub URI_TOKEN () { 5 }
30     sub URI_INVALID_TOKEN () { 6 }
31     sub URI_PREFIX_TOKEN () { 7 }
32     sub URI_PREFIX_INVALID_TOKEN () { 8 }
33     sub STRING_TOKEN () { 9 }
34     sub INVALID_TOKEN () { 10 }
35     sub NUMBER_TOKEN () { 11 }
36     sub DIMENSION_TOKEN () { 12 }
37     sub PERCENTAGE_TOKEN () { 13 }
38     sub UNICODE_RANGE_TOKEN () { 14 }
39     sub DELIM_TOKEN () { 16 }
40     sub PLUS_TOKEN () { 17 }
41     sub GREATER_TOKEN () { 18 }
42     sub COMMA_TOKEN () { 19 }
43     sub TILDE_TOKEN () { 20 }
44     sub DASHMATCH_TOKEN () { 21 }
45     sub PREFIXMATCH_TOKEN () { 22 }
46     sub SUFFIXMATCH_TOKEN () { 23 }
47     sub SUBSTRINGMATCH_TOKEN () { 24 }
48     sub INCLUDES_TOKEN () { 25 }
49     sub SEMICOLON_TOKEN () { 26 }
50     sub LBRACE_TOKEN () { 27 }
51     sub RBRACE_TOKEN () { 28 }
52     sub LPAREN_TOKEN () { 29 }
53     sub RPAREN_TOKEN () { 30 }
54     sub LBRACKET_TOKEN () { 31 }
55     sub RBRACKET_TOKEN () { 32 }
56     sub S_TOKEN () { 33 }
57     sub CDO_TOKEN () { 34 }
58     sub CDC_TOKEN () { 35 }
59     sub COMMENT_TOKEN () { 36 }
60     sub COMMENT_INVALID_TOKEN () { 37 }
61     sub EOF_TOKEN () { 38 }
62    
63     our @TokenName = qw(
64 wakaba 1.3 0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
65 wakaba 1.2 STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
66 wakaba 1.6 0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
67 wakaba 1.2 PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
68     LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
69     COMMENT_INVALID EOF
70     );
71    
72 wakaba 1.1 sub new ($) {
73 wakaba 1.2 my $self = bless {token => [], get_char => sub { -1 },
74     onerror => sub { }}, shift;
75 wakaba 1.1 return $self;
76     } # new
77    
78     sub init ($) {
79     my $self = shift;
80     $self->{state} = BEFORE_TOKEN_STATE;
81     $self->{c} = $self->{get_char}->();
82 wakaba 1.5 #$self->{t} = {type => token-type, value => value, number => number};
83 wakaba 1.1 } # init
84    
85     sub get_next_token ($) {
86     my $self = shift;
87     if (@{$self->{token}}) {
88     return shift @{$self->{token}};
89     }
90    
91     my $char;
92     my $num; # |{num}|, if any.
93     my $i; # |$i + 1|th character in |unicode| in |escape|.
94 wakaba 1.3 my $q;
95     ## NOTE:
96     ## 0: in |ident|.
97     ## 1: in |URI| outside of |string|.
98     ## 0x0022: in |string1| or |invalid1|.
99     ## 0x0027: in |string2| or |invalid2|.
100 wakaba 1.1
101     A: {
102     if ($self->{state} == BEFORE_TOKEN_STATE) {
103     if ($self->{c} == 0x002D) { # -
104     ## NOTE: |-| in |ident| in |IDENT|
105 wakaba 1.7 $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
106 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
107     $self->{c} = $self->{get_char}->();
108     redo A;
109 wakaba 1.5 } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
110     $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
111     $self->{c} = $self->{get_char}->();
112     if ($self->{c} == 0x002B) { # +
113     $self->{c} = $self->{get_char}->();
114     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
115     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
116     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
117     $self->{c} == 0x003F) { # ?
118     $self->{t}->{value} .= '+' . chr $self->{c};
119     $self->{t}->{type} = UNICODE_RANGE_TOKEN;
120     $self->{c} = $self->{get_char}->();
121     C: for (2..6) {
122     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
123     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
124     (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
125     $self->{c} == 0x003F) { # ?
126     $self->{t}->{value} .= chr $self->{c};
127     $self->{c} = $self->{get_char}->();
128     } else {
129     last C;
130     }
131     } # C
132    
133     if ($self->{c} == 0x002D) { # -
134     $self->{c} = $self->{get_char}->();
135     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
136     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
137     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
138     $self->{t}->{value} .= '-' . chr $self->{c};
139     $self->{c} = $self->{get_char}->();
140     C: for (2..6) {
141     if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
142     (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
143     (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
144     $self->{t}->{value} .= chr $self->{c};
145     $self->{c} = $self->{get_char}->();
146     } else {
147     last C;
148     }
149     } # C
150    
151     #
152     } else {
153     my $token = $self->{t};
154     $self->{t} = {type => IDENT_TOKEN, value => '-'};
155     $self->{state} = BEFORE_NMSTART_STATE;
156     # reprocess
157     return $token;
158     #redo A;
159     }
160     }
161    
162     $self->{state} = BEFORE_TOKEN_STATE;
163     # reprocess
164     return $self->{t};
165     #redo A;
166     } else {
167     unshift @{$self->{token}}, {type => PLUS_TOKEN};
168     $self->{state} = BEFORE_TOKEN_STATE;
169     # reprocess
170     return $self->{t};
171     #redo A;
172     }
173     } else {
174     $self->{state} = NAME_STATE;
175     # reprocess
176     redo A;
177     }
178 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
179     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
180 wakaba 1.1 $self->{c} == 0x005F or # _
181     $self->{c} > 0x007F) { # nonascii
182     ## NOTE: |nmstart| in |ident| in |IDENT|
183 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
184 wakaba 1.1 $self->{state} = NAME_STATE;
185     $self->{c} = $self->{get_char}->();
186     redo A;
187     } elsif ($self->{c} == 0x005C) { # \
188     ## NOTE: |nmstart| in |ident| in |IDENT|
189 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => ''};
190 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
191     $self->{c} = $self->{get_char}->();
192     redo A;
193     } elsif ($self->{c} == 0x0040) { # @
194     ## NOTE: |@| in |ATKEYWORD|
195 wakaba 1.5 $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
196 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
197 wakaba 1.1 $self->{c} = $self->{get_char}->();
198     redo A;
199 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
200 wakaba 1.5 $self->{t} = {type => STRING_TOKEN, value => ''};
201 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
202 wakaba 1.1 $self->{c} = $self->{get_char}->();
203     redo A;
204     } elsif ($self->{c} == 0x0023) { # #
205     ## NOTE: |#| in |HASH|.
206 wakaba 1.5 $self->{t} = {type => HASH_TOKEN, value => ''};
207 wakaba 1.1 $self->{state} = HASH_OPEN_STATE;
208     $self->{c} = $self->{get_char}->();
209     redo A;
210     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
211     ## NOTE: |num|.
212 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
213 wakaba 1.1 $self->{state} = NUMBER_STATE;
214     $self->{c} = $self->{get_char}->();
215     redo A;
216     } elsif ($self->{c} == 0x002E) { # .
217     ## NOTE: |num|.
218 wakaba 1.5 $self->{t} = {type => NUMBER_TOKEN, value => '0'};
219 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
220     $self->{c} = $self->{get_char}->();
221     redo A;
222 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
223     $self->{c} = $self->{get_char}->();
224     if ($self->{c} == 0x002A) { # *
225     C: {
226     $self->{c} = $self->{get_char}->();
227     if ($self->{c} == 0x002A) { # *
228     D: {
229     $self->{c} = $self->{get_char}->();
230     if ($self->{c} == 0x002F) { # /
231     #
232     } elsif ($self->{c} == 0x002A) { # *
233     redo D;
234     } else {
235     redo C;
236     }
237     } # D
238     } elsif ($self->{c} == -1) {
239     # stay in the state
240     # reprocess
241     return {type => COMMENT_INVALID_TOKEN};
242     #redo A;
243     } else {
244     redo C;
245     }
246     } # C
247    
248     # stay in the state.
249     $self->{c} = $self->{get_char}->();
250     redo A;
251     } else {
252     # stay in the state.
253     # reprocess
254     return {type => DELIM_STATE, value => '/'};
255     #redo A;
256     }
257 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
258     ## NOTE: |CDO|
259     $self->{c} = $self->{get_char}->();
260     if ($self->{c} == 0x0021) { # !
261     $self->{c} = $self->{get_char}->();
262     if ($self->{c} == 0x002C) { # -
263     $self->{c} = $self->{get_char}->();
264     if ($self->{c} == 0x002C) { # -
265     $self->{state} = BEFORE_TOKEN_STATE;
266     $self->{c} = $self->{get_char}->();
267     return {type => CDO_TOKEN};
268     #redo A;
269     } else {
270     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
271     ## NOTE: |-| in |ident| in |IDENT|
272 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
273 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
274     #reprocess
275     return {type => DELIM_TOKEN, value => '<'};
276     #redo A;
277     }
278     } else {
279     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
280     $self->{state} = BEFORE_TOKEN_STATE;
281     #reprocess
282     return {type => DELIM_TOKEN, value => '<'};
283     #redo A;
284     }
285     } else {
286     $self->{state} = BEFORE_TOKEN_STATE;
287     #reprocess
288     return {type => DELIM_TOKEN, value => '<'};
289     #redo A;
290     }
291 wakaba 1.2 } elsif (my $t = {
292     0x003B => SEMICOLON_TOKEN, # ;
293     0x007B => LBRACE_TOKEN, # {
294     0x007D => RBRACE_TOKEN, # }
295     0x0028 => LPAREN_TOKEN, # (
296     0x0029 => RPAREN_TOKEN, # )
297     0x005B => LBRACKET_TOKEN, # [
298     0x005D => RBRACKET_TOKEN, # ]
299 wakaba 1.1 }->{$self->{c}}) {
300     # stay in the state
301     $self->{c} = $self->{get_char}->();
302 wakaba 1.2 return {type => $t};
303 wakaba 1.1 # redo A;
304     } elsif ({
305     0x0020 => 1, # SP
306     0x0009 => 1, # \t
307     0x000D => 1, # \r
308     0x000A => 1, # \n
309     0x000C => 1, # \f
310     }->{$self->{c}}) {
311     W: {
312     $self->{c} = $self->{get_char}->();
313     if ({
314     0x0020 => 1, # SP
315     0x0009 => 1, # \t
316     0x000D => 1, # \r
317     0x000A => 1, # \n
318     0x000C => 1, # \f
319     }->{$self->{c}}) {
320     redo W;
321     } elsif (my $v = {
322     0x002B => PLUS_TOKEN, # +
323     0x003E => GREATER_TOKEN, # >
324     0x002C => COMMA_TOKEN, # ,
325     0x007E => TILDE_TOKEN, # ~
326     }->{$self->{c}}) {
327     # stay in the state
328     $self->{c} = $self->{get_char}->();
329     return {type => $v};
330     #redo A;
331     } else {
332     # stay in the state
333     # reprocess
334     return {type => S_TOKEN};
335     #redo A;
336     }
337     } # W
338     } elsif (my $v = {
339     0x007C => DASHMATCH_TOKEN, # |
340     0x005E => PREFIXMATCH_TOKEN, # ^
341     0x0024 => SUFFIXMATCH_TOKEN, # $
342     0x002A => SUBSTRINGMATCH_TOKEN, # *
343     }->{$self->{c}}) {
344 wakaba 1.2 my $c = $self->{c};
345 wakaba 1.1 $self->{c} = $self->{get_char}->();
346     if ($self->{c} == 0x003D) { # =
347     # stay in the state
348     $self->{c} = $self->{get_char}->();
349     return {type => $v};
350     #redo A;
351     } else {
352     # stay in the state
353     # reprocess
354 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
355 wakaba 1.1 #redo A;
356     }
357     } elsif ($self->{c} == 0x002B) { # +
358     # stay in the state
359     $self->{c} = $self->{get_char}->();
360     return {type => PLUS_TOKEN};
361     #redo A;
362     } elsif ($self->{c} == 0x003E) { # >
363     # stay in the state
364     $self->{c} = $self->{get_char}->();
365     return {type => GREATER_TOKEN};
366     #redo A;
367     } elsif ($self->{c} == 0x002C) { # ,
368     # stay in the state
369     $self->{c} = $self->{get_char}->();
370     return {type => COMMA_TOKEN};
371     #redo A;
372     } elsif ($self->{c} == 0x007E) { # ~
373     $self->{c} = $self->{get_char}->();
374     if ($self->{c} == 0x003D) { # =
375     # stay in the state
376     $self->{c} = $self->{get_char}->();
377     return {type => INCLUDES_TOKEN};
378     #redo A;
379     } else {
380     # stay in the state
381     # reprocess
382     return {type => TILDE_TOKEN};
383     #redo A;
384     }
385     } elsif ($self->{c} == -1) {
386     # stay in the state
387     $self->{c} = $self->{get_char}->();
388     return {type => EOF_TOKEN};
389     #redo A;
390     } else {
391     # stay in the state
392 wakaba 1.5 $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
393 wakaba 1.1 $self->{c} = $self->{get_char}->();
394 wakaba 1.5 return $self->{t};
395 wakaba 1.1 #redo A;
396     }
397     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
398 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
399     ## |FUNCTION|)
400 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
401     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
402 wakaba 1.1 $self->{c} == 0x005F or # _
403     $self->{c} > 0x007F) { # nonascii
404 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
405     $self->{t}->{type} = DIMENSION_TOKEN
406     if $self->{t}->{type} == NUMBER_TOKEN;
407 wakaba 1.1 $self->{state} = NAME_STATE;
408     $self->{c} = $self->{get_char}->();
409     redo A;
410     } elsif ($self->{c} == 0x005C) { # \
411 wakaba 1.2 ## TODO: 12-\X, 12-\{nl}
412 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
413     $self->{c} = $self->{get_char}->();
414     redo A;
415     } elsif ($self->{c} == 0x002D and # -
416 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) {
417 wakaba 1.1 $self->{c} = $self->{get_char}->();
418     if ($self->{c} == 0x003E) { # >
419     $self->{state} = BEFORE_TOKEN_STATE;
420     $self->{c} = $self->{get_char}->();
421     return {type => CDC_TOKEN};
422     #redo A;
423     } else {
424     ## NOTE: |-|, |-|, $self->{c}
425 wakaba 1.5 #$self->{t} = {type => IDENT_TOKEN, value => '-'};
426 wakaba 1.1 # stay in the state
427     # reconsume
428     return {type => DELIM_TOKEN, value => '-'};
429     #redo A;
430     }
431     } else {
432 wakaba 1.5 if ($self->{t}->{type} == NUMBER_TOKEN) {
433 wakaba 1.2 ## NOTE: |-| after |NUMBER|.
434     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
435     $self->{state} = BEFORE_TOKEN_STATE;
436 wakaba 1.7 # reprocess
437 wakaba 1.5 $self->{t}->{value} = $self->{t}->{number};
438     delete $self->{t}->{number};
439     return $self->{t};
440 wakaba 1.1 } else {
441     ## NOTE: |-| not followed by |nmstart|.
442     $self->{state} = BEFORE_TOKEN_STATE;
443 wakaba 1.7 # reprocess
444 wakaba 1.1 return {type => DELIM_TOKEN, value => '-'};
445     }
446     }
447 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
448     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
449     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
450     $self->{c} == 0x005F or # _
451     $self->{c} > 0x007F) { # nonascii
452 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
453 wakaba 1.3 $self->{state} = NAME_STATE;
454     $self->{c} = $self->{get_char}->();
455     redo A;
456     } elsif ($self->{c} == 0x002D) { # -
457 wakaba 1.5 $self->{t}->{value} .= '-';
458 wakaba 1.3 $self->{state} = AFTER_AT_HYPHEN_STATE;
459     $self->{c} = $self->{get_char}->();
460     redo A;
461     } elsif ($self->{c} == 0x005C) { # \
462     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
463     $self->{c} = $self->{get_char}->();
464     redo A;
465     } else {
466     $self->{state} = BEFORE_TOKEN_STATE;
467     # reprocess
468     return {type => DELIM_TOKEN, value => '@'};
469     }
470     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
471     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
472     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
473     $self->{c} == 0x005F or # _
474     $self->{c} > 0x007F) { # nonascii
475 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
476 wakaba 1.3 $self->{state} = NAME_STATE;
477     $self->{c} = $self->{get_char}->();
478     redo A;
479     } elsif ($self->{c} == 0x002D) { # -
480     $self->{c} = $self->{get_char}->();
481     if ($self->{c} == 0x003E) { # >
482 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
483 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
484     $self->{c} = $self->{get_char}->();
485 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
486 wakaba 1.3 #redo A;
487     } else {
488     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
489 wakaba 1.5 $self->{t} = {type => IDENT_TOKEN, value => '-'};
490 wakaba 1.3 $self->{state} = BEFORE_NMSTART_STATE;
491     # reprocess
492     return {type => DELIM_TOKEN, value => '@'};
493     #redo A;
494     }
495     } elsif ($self->{c} == 0x005C) { # \
496     ## TODO: @-\{nl}
497     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
498     $self->{c} = $self->{get_char}->();
499     redo A;
500     } else {
501     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
502     $self->{state} = BEFORE_TOKEN_STATE;
503     # reprocess
504     return {type => DELIM_TOKEN, value => '@'};
505     }
506 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
507     if ($self->{c} == 0x002D) { # -
508     ## NOTE: |-| in |ident|.
509 wakaba 1.5 $self->{t}->{value} = '-';
510 wakaba 1.1 $self->{state} = BEFORE_NMSTART_STATE;
511     $self->{c} = $self->{get_char}->();
512     redo A;
513 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515 wakaba 1.1 $self->{c} == 0x005F or # _
516     $self->{c} > 0x007F) { # nonascii
517     ## NOTE: |nmstart| in |ident|.
518 wakaba 1.5 $self->{t}->{value} = chr $self->{c};
519     $self->{t}->{type} = DIMENSION_TOKEN;
520 wakaba 1.1 $self->{state} = NAME_STATE;
521     $self->{c} = $self->{get_char}->();
522     redo A;
523     } elsif ($self->{c} == 0x005C) { # \
524     ## NOTE: |nmstart| in |ident| in |IDENT|
525 wakaba 1.5 $self->{t}->{value} = '';
526 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
527     $self->{c} = $self->{get_char}->();
528     redo A;
529     } elsif ($self->{c} == 0x0025) { # %
530 wakaba 1.5 $self->{t}->{type} = PERCENTAGE_TOKEN;
531 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
532     $self->{c} = $self->{get_char}->();
533 wakaba 1.5 return $self->{t};
534 wakaba 1.1 #redo A;
535     } else {
536     $self->{state} = BEFORE_TOKEN_STATE;
537     # reprocess
538 wakaba 1.5 return $self->{t};
539 wakaba 1.1 #redo A;
540     }
541     } elsif ($self->{state} == HASH_OPEN_STATE) {
542     ## NOTE: The first |nmchar| in |name| in |HASH|.
543 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
544     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
545     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
546 wakaba 1.1 $self->{c} == 0x002D or # -
547     $self->{c} == 0x005F or # _
548     $self->{c} > 0x007F) { # nonascii
549 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
550 wakaba 1.1 $self->{state} = NAME_STATE;
551     $self->{c} = $self->{get_char}->();
552     redo A;
553     } elsif ($self->{c} == 0x005C) { # \
554     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
555     $self->{c} = $self->{get_char}->();
556     redo A;
557     } else {
558     $self->{state} = BEFORE_TOKEN_STATE;
559     $self->{c} = $self->{get_char}->();
560     return {type => DELIM_TOKEN, value => '#'};
561     #redo A;
562     }
563     } elsif ($self->{state} == NAME_STATE) {
564     ## NOTE: |nmchar| in (|ident| or |name|).
565 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
566     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
567     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
568 wakaba 1.1 $self->{c} == 0x005F or # _
569     $self->{c} == 0x002D or # -
570     $self->{c} > 0x007F) { # nonascii
571 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
572 wakaba 1.1 # stay in the state
573     $self->{c} = $self->{get_char}->();
574     redo A;
575     } elsif ($self->{c} == 0x005C) { # \
576 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
577 wakaba 1.1 $self->{c} = $self->{get_char}->();
578     redo A;
579     } elsif ($self->{c} == 0x0028 and # (
580 wakaba 1.5 $self->{t}->{type} == IDENT_TOKEN) { # (
581     my $func_name = $self->{t}->{value};
582 wakaba 1.3 $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
583     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
584 wakaba 1.5 if ($self->{t}->{has_escape}) {
585 wakaba 1.3 ## TODO: warn
586     }
587 wakaba 1.5 $self->{t}->{type}
588 wakaba 1.3 = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
589 wakaba 1.5 $self->{t}->{value} = '';
590 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
591     $self->{c} = $self->{get_char}->();
592     redo A;
593     } else {
594 wakaba 1.5 $self->{t}->{type} = FUNCTION_TOKEN;
595 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
596     $self->{c} = $self->{get_char}->();
597 wakaba 1.5 return $self->{t};
598 wakaba 1.1 #redo A;
599     }
600     } else {
601     $self->{state} = BEFORE_TOKEN_STATE;
602     # reconsume
603 wakaba 1.5 return $self->{t};
604 wakaba 1.1 #redo A;
605     }
606 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
607     while ({
608     0x0020 => 1, # SP
609     0x0009 => 1, # \t
610     0x000D => 1, # \r
611     0x000A => 1, # \n
612     0x000C => 1, # \f
613     }->{$self->{c}}) {
614     $self->{c} = $self->{get_char}->();
615     }
616     if ($self->{c} == -1) {
617 wakaba 1.5 $self->{t}->{type} = {
618 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
619     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
620     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
621     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
622 wakaba 1.5 }->{$self->{t}->{type}};
623 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
624     $self->{c} = $self->{get_char}->();
625 wakaba 1.5 return $self->{t};
626 wakaba 1.3 #redo A;
627     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
628     ## TODO: Should we consider matches of "(" and ")"?
629 wakaba 1.5 $self->{t}->{type} = {
630 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
631     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
632     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
633     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
634 wakaba 1.5 }->{$self->{t}->{type}};
635 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
636     $self->{c} = $self->{get_char}->();
637     redo A;
638     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
639     $self->{state} = STRING_STATE; $q = $self->{c};
640     $self->{c} = $self->{get_char}->();
641     redo A;
642     } elsif ($self->{c} == 0x0029) { # )
643     $self->{state} = BEFORE_TOKEN_STATE;
644     $self->{c} = $self->{get_char}->();
645 wakaba 1.5 return $self->{t};
646 wakaba 1.3 #redo A;
647     } elsif ($self->{c} == 0x005C) { # \
648     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
649     $self->{c} = $self->{get_char}->();
650     redo A;
651     } else {
652 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
653 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
654     $self->{c} = $self->{get_char}->();
655     redo A;
656     }
657     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
658     if ({
659     0x0020 => 1, # SP
660     0x0009 => 1, # \t
661     0x000D => 1, # \r
662     0x000A => 1, # \n
663     0x000C => 1, # \f
664     }->{$self->{c}}) {
665     $self->{state} = URI_AFTER_WSP_STATE;
666     $self->{c} = $self->{get_char}->();
667     redo A;
668     } elsif ($self->{c} == -1) {
669 wakaba 1.5 $self->{t}->{type} = {
670 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
671     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
672     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
673     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
674 wakaba 1.5 }->{$self->{t}->{type}};
675 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
676     $self->{c} = $self->{get_char}->();
677 wakaba 1.5 return $self->{t};
678 wakaba 1.3 #redo A;
679     } elsif ($self->{c} < 0x0020 or {
680     0x0022 => 1, # "
681     0x0027 => 1, # '
682     0x0028 => 1, # (
683     }->{$self->{c}}) { # C0 or (
684     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
685 wakaba 1.5 $self->{t}->{type} = {
686 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
687     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690 wakaba 1.5 }->{$self->{t}->{type}};
691 wakaba 1.3 # stay in the state.
692     $self->{c} = $self->{get_char}->();
693     redo A;
694     } elsif ($self->{c} == 0x0029) { # )
695     $self->{state} = BEFORE_TOKEN_STATE;
696     $self->{c} = $self->{get_char}->();
697 wakaba 1.5 return $self->{t};
698 wakaba 1.3 #redo A;
699     } elsif ($self->{c} == 0x005C) { # \
700     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
701     $self->{c} = $self->{get_char}->();
702     redo A;
703     } else {
704 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
705 wakaba 1.3 # stay in the state.
706     $self->{c} = $self->{get_char}->();
707     redo A;
708     }
709     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
710     if ({
711     0x0020 => 1, # SP
712     0x0009 => 1, # \t
713     0x000D => 1, # \r
714     0x000A => 1, # \n
715     0x000C => 1, # \f
716     }->{$self->{c}}) {
717     # stay in the state.
718     $self->{c} = $self->{get_char}->();
719     redo A;
720     } elsif ($self->{c} == -1) {
721 wakaba 1.5 $self->{t}->{type} = {
722 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
723     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
724     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
725     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
726 wakaba 1.5 }->{$self->{t}->{type}};
727 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
728     $self->{c} = $self->{get_char}->();
729 wakaba 1.5 return $self->{t};
730 wakaba 1.3 #redo A;
731     } elsif ($self->{c} == 0x0029) { # )
732     $self->{state} = BEFORE_TOKEN_STATE;
733     $self->{c} = $self->{get_char}->();
734 wakaba 1.5 return $self->{t};
735 wakaba 1.3 #redo A;
736     } elsif ($self->{c} == 0x005C) { # \
737     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
738     $self->{c} = $self->{get_char}->();
739     redo A;
740     } else {
741     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
742 wakaba 1.5 $self->{t}->{type} = {
743 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
744     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
745     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
746     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
747 wakaba 1.5 }->{$self->{t}->{type}};
748 wakaba 1.3 # stay in the state.
749     $self->{c} = $self->{get_char}->();
750     redo A;
751     }
752 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
753 wakaba 1.5 $self->{t}->{has_escape} = 1;
754 wakaba 1.1 if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
755     ## NOTE: second character of |unicode| in |escape|.
756     $char = $self->{c} - 0x0030;
757     $self->{state} = ESCAPE_STATE; $i = 2;
758     $self->{c} = $self->{get_char}->();
759     redo A;
760     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
761     ## NOTE: second character of |unicode| in |escape|.
762     $char = $self->{c} - 0x0041 + 0xA;
763     $self->{state} = ESCAPE_STATE; $i = 2;
764     $self->{c} = $self->{get_char}->();
765     redo A;
766 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
767 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
768 wakaba 1.7 $char = $self->{c} - 0x0061 + 0xA;
769 wakaba 1.1 $self->{state} = ESCAPE_STATE; $i = 2;
770     $self->{c} = $self->{get_char}->();
771     redo A;
772     } elsif ($self->{c} == 0x000A or # \n
773     $self->{c} == 0x000C) { # \f
774     if ($q == 0) {
775 wakaba 1.7 #
776 wakaba 1.3 } elsif ($q == 1) {
777     ## NOTE: In |escape| in |URI|.
778 wakaba 1.5 $self->{t}->{type} = {
779 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
780     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
781     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
782     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
783 wakaba 1.5 }->{$self->{t}->{type}};
784     $self->{t}->{value} .= chr $self->{c};
785 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
786     $self->{c} = $self->{get_char}->();
787     redo A;
788 wakaba 1.1 } else {
789     ## Note: In |nl| in ... in |string| or |ident|.
790 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
791 wakaba 1.1 $self->{state} = STRING_STATE;
792     $self->{c} = $self->{get_char}->();
793     redo A;
794     }
795     } elsif ($self->{c} == 0x000D) { # \r
796     if ($q == 0) {
797 wakaba 1.7 #
798 wakaba 1.3 } elsif ($q == 1) {
799 wakaba 1.7 ## NOTE: In |escape| in |URI|.
800 wakaba 1.5 $self->{t}->{type} = {
801 wakaba 1.3 URI_TOKEN, URI_INVALID_TOKEN,
802     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
803     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
804     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
805 wakaba 1.5 }->{$self->{t}->{type}};
806     $self->{t}->{value} .= "\x0D\x0A";
807 wakaba 1.3 $self->{state} = URI_UNQUOTED_STATE;
808     $self->{c} = $self->{get_char}->();
809     redo A;
810 wakaba 1.1 } else {
811     ## Note: In |nl| in ... in |string| or |ident|.
812 wakaba 1.5 $self->{t}->{value} .= "\x0D\x0A";
813 wakaba 1.1 $self->{state} = ESCAPE_BEFORE_LF_STATE;
814     $self->{c} = $self->{get_char}->();
815     redo A;
816     }
817 wakaba 1.7 } elsif ($self->{c} == -1) {
818     #
819 wakaba 1.1 } else {
820     ## NOTE: second character of |escape|.
821 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
822 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
823     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
824 wakaba 1.1 $self->{c} = $self->{get_char}->();
825     redo A;
826     }
827 wakaba 1.7
828     if ($q == 0) {
829     $self->{state} = BEFORE_TOKEN_STATE;
830     # reprocess
831     if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
832     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
833     return {type => DELIM_TOKEN, value => '-'};
834     #redo A;
835     } elsif (length $self->{t}->{value}) {
836     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
837     return $self->{t};
838     #redo A;
839     } else {
840     return {type => DELIM_TOKEN, value => '\\'};
841     #redo A;
842     }
843     } else {
844     $self->{state} = $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
845     $self->{c} = $self->{get_char}->();
846     redo A;
847     }
848 wakaba 1.1 } elsif ($self->{state} == ESCAPE_STATE) {
849     ## NOTE: third..seventh character of |unicode| in |escape|.
850     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
851     $char = $char * 0x10 + $self->{c} - 0x0030;
852     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
853     $self->{c} = $self->{get_char}->();
854     redo A;
855     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
856     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
857     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
858     $self->{c} = $self->{get_char}->();
859     redo A;
860 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
861 wakaba 1.7 $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
862 wakaba 1.1 $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
863     $self->{c} = $self->{get_char}->();
864     redo A;
865     } elsif ($self->{c} == 0x0020 or # SP
866     $self->{c} == 0x000A or # \n
867     $self->{c} == 0x0009 or # \t
868     $self->{c} == 0x000C) { # \f
869 wakaba 1.5 $self->{t}->{value} .= chr $char;
870 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
871     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
872 wakaba 1.1 $self->{c} = $self->{get_char}->();
873     redo A;
874     } elsif ($self->{c} == 0x000D) { # \r
875     $self->{state} = ESCAPE_BEFORE_LF_STATE;
876     $self->{c} = $self->{get_char}->();
877     redo A;
878     } else {
879 wakaba 1.5 $self->{t}->{value} .= chr $char;
880 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
881     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
882 wakaba 1.1 # reconsume
883     redo A;
884     }
885     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
886     ## NOTE: eighth character of |unicode| in |escape|.
887     if ($self->{c} == 0x0020 or # SP
888     $self->{c} == 0x000A or # \n
889     $self->{c} == 0x0009 or # \t
890     $self->{c} == 0x000C) { # \f
891 wakaba 1.5 $self->{t}->{value} .= chr $char;
892 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
893     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
894 wakaba 1.1 $self->{c} = $self->{get_char}->();
895     redo A;
896     } elsif ($self->{c} == 0x000D) { # \r
897     $self->{state} = ESCAPE_BEFORE_NL_STATE;
898     $self->{c} = $self->{get_char}->();
899     redo A;
900     } else {
901 wakaba 1.5 $self->{t}->{value} .= chr $char;
902 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
903     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
904 wakaba 1.1 # reconsume
905     redo A;
906     }
907     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
908     ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
909     if ($self->{c} == 0x000A) { # \n
910 wakaba 1.5 $self->{t}->{value} .= chr $char;
911 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
912     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
913 wakaba 1.1 $self->{c} = $self->{get_char}->();
914     redo A;
915     } else {
916 wakaba 1.5 $self->{t}->{value} .= chr $char;
917 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
918     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
919 wakaba 1.1 # reconsume
920     redo A;
921     }
922     } elsif ($self->{state} == STRING_STATE) {
923     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
924     ## a character in |invalid$Q| in |invalid| in |INVALID|,
925     ## where |$Q = $q == 0x0022 ? 1 : 2|.
926 wakaba 1.3 ## Or, in |URI|.
927 wakaba 1.1 if ($self->{c} == 0x005C) { # \
928     $self->{state} = ESCAPE_OPEN_STATE;
929     $self->{c} = $self->{get_char}->();
930     redo A;
931     } elsif ($self->{c} == $q) { # " | '
932 wakaba 1.5 if ($self->{t}->{type} == STRING_TOKEN) {
933 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
934     $self->{c} = $self->{get_char}->();
935 wakaba 1.5 return $self->{t};
936 wakaba 1.3 #redo A;
937     } else {
938     $self->{state} = URI_AFTER_WSP_STATE;
939     $self->{c} = $self->{get_char}->();
940     redo A;
941     }
942 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
943     $self->{c} == 0x000D or # \r
944     $self->{c} == 0x000C or # \f
945     $self->{c} == -1) {
946 wakaba 1.5 $self->{t}->{type} = INVALID_TOKEN;
947 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
948     # reconsume
949 wakaba 1.5 return $self->{t};
950 wakaba 1.1 #redo A;
951     } else {
952 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
953 wakaba 1.1 # stay in the state
954     $self->{c} = $self->{get_char}->();
955     redo A;
956     }
957     } elsif ($self->{state} == NUMBER_STATE) {
958     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
959     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
960 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
961 wakaba 1.1 # stay in the state
962     $self->{c} = $self->{get_char}->();
963     redo A;
964     } elsif ($self->{c} == 0x002E) { # .
965     $self->{state} = NUMBER_DOT_STATE;
966     $self->{c} = $self->{get_char}->();
967     redo A;
968     } else {
969 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
970     $self->{t}->{value} = '';
971 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
972     # reprocess
973 wakaba 1.2 redo A;
974 wakaba 1.1 }
975     } elsif ($self->{state} == NUMBER_DOT_STATE) {
976     ## NOTE: The character immediately following |.| in |num|.
977     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
978 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
979 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
980     $self->{c} = $self->{get_char}->();
981     redo A;
982     } else {
983     unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};
984 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
985     $self->{t}->{value} = '';
986 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
987     # reprocess
988 wakaba 1.5 return $self->{t};
989 wakaba 1.1 #redo A;
990     }
991     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
992     ## NOTE: The character immediately following |.| at the beginning of |num|.
993     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
994 wakaba 1.5 $self->{t}->{value} .= '.' . chr $self->{c};
995 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
996     $self->{c} = $self->{get_char}->();
997     redo A;
998     } else {
999     $self->{state} = BEFORE_TOKEN_STATE;
1000     $self->{c} = $self->{get_char}->();
1001     return {type => DELIM_TOKEN, value => '.'};
1002     #redo A;
1003     }
1004     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1005     ## NOTE: |[0-9]| in |num| after |.|.
1006     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1007 wakaba 1.5 $self->{t}->{value} .= chr $self->{c};
1008 wakaba 1.1 # stay in the state
1009     $self->{c} = $self->{get_char}->();
1010     redo A;
1011     } else {
1012 wakaba 1.5 $self->{t}->{number} = $self->{t}->{value};
1013     $self->{t}->{value} = '';
1014 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
1015     # reprocess
1016 wakaba 1.2 redo A;
1017 wakaba 1.1 }
1018     } else {
1019     die "$0: Unknown state |$self->{state}|";
1020     }
1021     } # A
1022     } # get_next_token
1023    
1024     1;
1025 wakaba 1.7 # $Date: 2007/09/08 05:57:05 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24