/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log


Revision 1.1 - (hide annotations) (download)
Fri Aug 17 11:53:52 2007 UTC (17 years, 2 months ago) by wakaba
Branch: MAIN
++ whatpm/t/ChangeLog	17 Aug 2007 07:08:23 -0000
	* content-model-2.dat: New tests for |base|
	following URI or hyperlink are added.

2007-08-17  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	17 Aug 2007 07:44:01 -0000
	* CSS/: New directory.

2007-08-17  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/CSS/ChangeLog	17 Aug 2007 11:53:38 -0000
2007-08-17  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm: New module.

	* ChangeLog: New file.

++ whatpm/Whatpm/ContentChecker/ChangeLog	17 Aug 2007 07:08:56 -0000
	* HTML.pm: Raise new errors if |base| is following
	URI attributes or hyperlink attributes.

2007-08-17  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::CSS::Tokenizer;
use strict;

## Tokenizer states and token types referenced by the subroutines below.
## Token types start at 1 so that none of them is false and none of them
## is numerically equal to a single-character punctuation type such as
## |;| (which numifies to 0).
use constant {
  BEFORE_TOKEN_STATE => 0,
  BEFORE_NMSTART_STATE => 1,
  NAME_STATE => 2,
  ESCAPE_OPEN_STATE => 3,
  ESCAPE_STATE => 4,
  ESCAPE_BEFORE_NL_STATE => 5,
  ESCAPE_BEFORE_LF_STATE => 6,
  STRING_STATE => 7,
  HASH_OPEN_STATE => 8,
  NUMBER_STATE => 9,
  NUMBER_DOT_STATE => 10,
  NUMBER_FRACTION_STATE => 11,
  NUMBER_DOT_NUMBER_STATE => 12,
  AFTER_NUMBER_STATE => 13,
  URI_BEFORE_WSP_STATE => 14,
};
use constant {
  IDENT_TOKEN => 1,
  ATKEYWORD_TOKEN => 2,
  HASH_TOKEN => 3,
  FUNCTION_TOKEN => 4,
  URI_TOKEN => 5,
  STRING_TOKEN => 6,
  INVALID_TOKEN => 7,
  NUMBER_TOKEN => 8,
  PERCENTAGE_TOKEN => 9,
  DELIM_TOKEN => 10,
  S_TOKEN => 11,
  CDO_TOKEN => 12,
  CDC_TOKEN => 13,
  PLUS_TOKEN => 14,
  GREATER_TOKEN => 15,
  COMMA_TOKEN => 16,
  TILDE_TOKEN => 17,
  INCLUDES_TOKEN => 18,
  DASHMATCH_TOKEN => 19,
  PREFIXMATCH_TOKEN => 20,
  SUFFIXMATCH_TOKEN => 21,
  SUBSTRINGMATCH_TOKEN => 22,
  EOF_TOKEN => 23,
};
3    
## Constructor.  Takes the class name and returns a new tokenizer object
## whose queue of already-scanned tokens (|token|) is empty.
sub new ($) {
  my ($class) = @_;
  my %field = (token => []);
  return bless \%field, $class;
} # new
8    
## Prepares the tokenizer for scanning: enters the initial state and reads
## the first input character through the |get_char| callback stored in
## the object.  The value of the character just read is the result.
sub init ($) {
  my ($tokenizer) = @_;
  $tokenizer->{state} = BEFORE_TOKEN_STATE;
  my $read_char = $tokenizer->{get_char};
  $tokenizer->{c} = $read_char->();
} # init
14    
## ------ Private helpers of |get_next_token| ------
##
## Every helper that takes a character takes it as a code point; -1
## stands for the end of the input and never matches any class.

## True for |nmstart| without escapes: |[_a-zA-Z]| or non-ASCII.
sub _is_nmstart {
  my $c = shift;
  return ((0x0041 <= $c && $c <= 0x005A) || # A..Z
          (0x0061 <= $c && $c <= 0x007A) || # a..z
          $c == 0x005F ||                   # _
          $c > 0x007F);                     # nonascii
} # _is_nmstart

## True for |[0-9]|.
sub _is_digit {
  my $c = shift;
  return (0x0030 <= $c && $c <= 0x0039);
} # _is_digit

## True for |nmchar| without escapes: |nmstart|, |[0-9]| or |-|.
sub _is_nmchar {
  my $c = shift;
  return (_is_nmstart ($c) || _is_digit ($c) || $c == 0x002D);
} # _is_nmchar

## True for the white space characters SP, \t, \r, \n and \f.
sub _is_wsp {
  my $c = shift;
  return ($c == 0x0020 || $c == 0x0009 || $c == 0x000D ||
          $c == 0x000A || $c == 0x000C);
} # _is_wsp

## Numeric value (0..15) of a hexadecimal digit, or undef if the
## character is not a hexadecimal digit.
sub _hex_digit_value {
  my $c = shift;
  return $c - 0x0030 if 0x0030 <= $c && $c <= 0x0039;       # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c && $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c && $c <= 0x0066; # a..f
  return;
} # _hex_digit_value

## Creates a |DELIM| token for the given one-character string.
sub _delim_token {
  my $value = shift;
  return {type => DELIM_TOKEN, value => $value};
} # _delim_token

## Returns the list of tokens to emit when the |ident| or |name| that
## the token under construction was waiting for does not begin after
## all.  The characters already consumed become |DELIM| tokens; a number
## is emitted without unit.
sub _unstarted_name_tokens {
  my $token = shift;
  my $hyphen = $token->{value} eq '-';
  my @result;
  if ($token->{type} == NUMBER_TOKEN) {
    $token->{value} = '';
    push @result, $token;
  } elsif ($token->{type} == ATKEYWORD_TOKEN) {
    push @result, _delim_token ('@');
  } elsif ($token->{type} == HASH_TOKEN) {
    push @result, _delim_token ('#');
  }
  push @result, _delim_token ('-') if $hyphen;
  return @result;
} # _unstarted_name_tokens

## Characters that form a token on their own; the token type is the
## character itself.
my %PunctuationChar = map { $_ => 1 } (
  0x003B, # ;
  0x007B, # {
  0x007D, # }
  0x0028, # (
  0x0029, # )
  0x005B, # [
  0x005D, # ]
);

## Tokens that absorb the white space preceding them.
my %CombinatorType = (
  0x002B => PLUS_TOKEN, # +
  0x003E => GREATER_TOKEN, # >
  0x002C => COMMA_TOKEN, # ,
  0x007E => TILDE_TOKEN, # ~
);

## Characters that form an attribute match token when followed by |=|.
my %MatchType = (
  0x007C => DASHMATCH_TOKEN, # |
  0x005E => PREFIXMATCH_TOKEN, # ^
  0x0024 => SUFFIXMATCH_TOKEN, # $
  0x002A => SUBSTRINGMATCH_TOKEN, # *
);

## Returns the next token of the input as a hash reference.
##
## The object must have been prepared such that |state| is the current
## state, |c| is the next input character (a code point, or -1 at the end
## of the input) and |get_char| is a code reference returning the
## character after that.
##
## Every token has a |type|.  For |;|, |{|, |}|, |(|, |)|, |[| and |]|
## the type is the character itself; otherwise it is one of the
## |*_TOKEN| constants.  |IDENT|, |ATKEYWORD|, |HASH|, |FUNCTION|,
## |STRING|, |INVALID| and |DELIM| tokens carry their text in |value|
## (escapes resolved; |has_escape| is set if there was one).  |NUMBER|
## and |PERCENTAGE| tokens carry the digits in |number|; a |NUMBER| token
## followed by a unit carries the unit in |value| (there is no separate
## dimension token type).
##
## A single call might recognize more than one token; the extra tokens
## are kept in the |token| queue and returned by the following calls.
## Whenever the subroutine returns, the state is |BEFORE_TOKEN_STATE|.
##
## Dies if |state| is not a known state.
##
## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|
sub get_next_token ($) {
  my $self = shift;
  my $queue = $self->{token} ||= [];
  return shift @$queue if @$queue;

  my $current_token; # The token under construction.
  my $char;    # Code point collected from |unicode| in |escape|, if any.
  my $i;       # The next character is the |$i + 1|th of |escape|.
  my $q = 0;   # 0 in |ident| or |name|; the quote character in |string|.
  my $in_name = 0; # Whether the name had begun before the current escape.

  ## Consumes the current character; returns the new current character.
  my $advance = sub { return $self->{c} = $self->{get_char}->() };
  ## Appends tokens to the output queue.
  my $emit = sub { push @$queue, @_ };
  ## Ends an escape: appends the collected character, if any, and
  ## returns to the state in which the escape occurred.
  my $finish_escape = sub {
    $current_token->{value} .= chr $char if defined $char;
    $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
  };

  A: {
    my $state = $self->{state};
    my $c = $self->{c};

    if ($state == BEFORE_TOKEN_STATE) {
      if ($c == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|, or the first |-| of |CDC|.
        $current_token = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $advance->();
        redo A;
      } elsif (_is_nmstart ($c)) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => chr $c};
        $self->{state} = NAME_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x005C) { # \
        ## NOTE: |escape| in |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0; $in_name = 0;
        $advance->();
        redo A;
      } elsif ($c == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $current_token = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = BEFORE_NMSTART_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x0022 or $c == 0x0027) { # " | '
        ## NOTE: The opening quote of |string| in |STRING| or of
        ## |invalid| in |INVALID|.  The quote is remembered such that
        ## only the same character closes the string.
        $current_token = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = $c;
        $advance->();
        redo A;
      } elsif ($c == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $current_token = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $advance->();
        redo A;
      } elsif (_is_digit ($c)) { # 0..9
        ## NOTE: |num|.
        $current_token = {type => NUMBER_TOKEN, value => chr $c};
        $self->{state} = NUMBER_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x002E) { # .
        ## NOTE: |num|, if followed by a digit.
        $current_token = {type => NUMBER_TOKEN, value => '.'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x003C) { # <
        ## NOTE: |CDO|, or |<| as |DELIM|.  In any case we stay in the
        ## state and the first character that does not match is
        ## reprocessed.
        if ($advance->() != 0x0021) { # !
          $emit->(_delim_token ('<'));
          last A;
        }
        if ($advance->() != 0x002D) { # -
          $emit->(_delim_token ('<'), _delim_token ('!'));
          last A;
        }
        if ($advance->() != 0x002D) { # -
          $emit->(_delim_token ('<'), _delim_token ('!'));
          ## NOTE: The |-| already consumed might begin an |IDENT|.
          $current_token = {type => IDENT_TOKEN, value => '-'};
          $self->{state} = BEFORE_NMSTART_STATE;
          redo A;
        }
        $advance->();
        $emit->({type => CDO_TOKEN});
        last A;
      } elsif ($PunctuationChar{$c}) {
        ## NOTE: The type has to be taken before the character is
        ## consumed.
        my $type = chr $c;
        # stay in the state
        $advance->();
        $emit->({type => $type});
        last A;
      } elsif (_is_wsp ($c)) {
        1 while _is_wsp ($advance->());
        my $type = $CombinatorType{$self->{c}};
        if (defined $type) {
          ## NOTE: |S| followed by |+|, |>|, |,| or |~| is part of that
          ## token.
          $advance->();
          $emit->({type => $type});
        } else {
          # reprocess
          $emit->({type => S_TOKEN});
        }
        # stay in the state
        last A;
      } elsif (defined (my $match_type = $MatchType{$c})) {
        # stay in the state
        if ($advance->() == 0x003D) { # =
          $advance->();
          $emit->({type => $match_type});
        } else {
          # reprocess
          $emit->(_delim_token (chr $c));
        }
        last A;
      } elsif (defined (my $type = $CombinatorType{$c}) and
               $c != 0x007E) { # + | > | ,
        # stay in the state
        $advance->();
        $emit->({type => $type});
        last A;
      } elsif ($c == 0x007E) { # ~
        # stay in the state
        if ($advance->() == 0x003D) { # =
          $advance->();
          $emit->({type => INCLUDES_TOKEN});
        } else {
          # reprocess
          $emit->({type => TILDE_TOKEN});
        }
        last A;
      } elsif ($c == -1) {
        # stay in the state
        $advance->();
        $emit->({type => EOF_TOKEN});
        last A;
      } else {
        # stay in the state
        $advance->();
        $emit->(_delim_token (chr $c));
        last A;
      }
    } elsif ($state == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|) or in
      ## the unit following |num|.  The value of the token is empty or
      ## |-| here.
      if (_is_nmstart ($c)) {
        $current_token->{value} .= chr $c;
        $self->{state} = NAME_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0; $in_name = 0;
        $advance->();
        redo A;
      } elsif ($c == 0x002D and # -
               $current_token->{type} == IDENT_TOKEN) {
        if ($advance->() == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $advance->();
          $emit->({type => CDC_TOKEN});
          last A;
        } else {
          ## NOTE: |-|, |-|, $self->{c}.  The first |-| is a |DELIM|
          ## and the second |-| might begin an |IDENT|.
          $emit->(_delim_token ('-'));
          # stay in the state
          # reprocess
          redo A;
        }
      } elsif ($c == 0x002D and # -
               $current_token->{type} == ATKEYWORD_TOKEN and
               $current_token->{value} eq '') {
        ## NOTE: |-| in |ident| in |ATKEYWORD|.
        $current_token->{value} = '-';
        # stay in the state
        $advance->();
        redo A;
      } elsif ($current_token->{type} == ATKEYWORD_TOKEN and
               $current_token->{value} eq '-') {
        ## NOTE: |@-| not followed by |nmstart|.  |@| is a |DELIM| and
        ## the |-| is handled as if it were at the beginning of a token.
        $emit->(_delim_token ('@'));
        $current_token = {type => IDENT_TOKEN, value => '-'};
        # stay in the state
        # reprocess
        redo A;
      } else {
        ## NOTE: |-| after |num|, |@| or |-| not followed by |nmstart|.
        $emit->(_unstarted_name_tokens ($current_token));
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        last A;
      }
    } elsif ($state == AFTER_NUMBER_STATE) {
      ## NOTE: |$current_token->{number}| is the number and
      ## |$current_token->{value}| is empty here.
      if ($c == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $current_token->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $advance->();
        redo A;
      } elsif (_is_nmstart ($c)) {
        ## NOTE: |nmstart| in |ident|.
        $current_token->{value} = chr $c;
        $self->{state} = NAME_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x005C) { # \
        ## NOTE: |escape| in |nmstart| in |ident|.
        $current_token->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0; $in_name = 0;
        $advance->();
        redo A;
      } elsif ($c == 0x0025) { # %
        $current_token->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $advance->();
        $emit->($current_token);
        last A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        $emit->($current_token);
        last A;
      }
    } elsif ($state == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar ($c)) {
        $current_token->{value} .= chr $c;
        $self->{state} = NAME_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0; $in_name = 0;
        $advance->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        $emit->(_delim_token ('#'));
        last A;
      }
    } elsif ($state == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar ($c)) {
        $current_token->{value} .= chr $c;
        # stay in the state
        $advance->();
        redo A;
      } elsif ($c == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0; $in_name = 1;
        $advance->();
        redo A;
      } elsif ($c == 0x0028 and # (
               $current_token->{type} == IDENT_TOKEN) {
        ## NOTE: This version of the tokenizer does not support the |URI|
        ## token type, so that |url(| is a |FUNCTION| like any other.
        ## Note that browsers disagree in how to tokenize |url| function.
        $current_token->{type} = FUNCTION_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $advance->();
        $emit->($current_token);
        last A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        $emit->($current_token);
        last A;
      }
    } elsif ($state == ESCAPE_OPEN_STATE) {
      ## NOTE: The character immediately following |\|.
      my $at_newline = ($c == 0x000A || $c == 0x000C || $c == 0x000D);
      if ($q == 0 and ($at_newline or $c == -1)) {
        ## NOTE: Not an |escape| in |ident| or |name|: the token ends
        ## before the |\|, which is a |DELIM|.
        $emit->($in_name ? $current_token
                         : _unstarted_name_tokens ($current_token));
        $emit->(_delim_token ('\\'));
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        last A;
      } elsif ($c == -1) {
        ## NOTE: |\| at the end of the input in |string|; the string
        ## state makes the token |INVALID|.
        $self->{state} = STRING_STATE;
        # reprocess
        redo A;
      }

      $current_token->{has_escape} = 1;
      if (defined (my $digit = _hex_digit_value ($c))) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $advance->();
        redo A;
      } elsif ($c == 0x000A or # \n
               $c == 0x000C) { # \f
        ## NOTE: In |nl| in ... in |string|.
        $current_token->{value} .= chr $c;
        $self->{state} = STRING_STATE;
        $advance->();
        redo A;
      } elsif ($c == 0x000D) { # \r
        ## NOTE: In |nl| in ... in |string|.  There is no character
        ## to append once the optional |\n| has been consumed.
        $current_token->{value} .= "\x0D\x0A";
        $char = undef;
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $advance->();
        redo A;
      } else {
        ## NOTE: second character of |escape|.
        $current_token->{value} .= chr $c;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $advance->();
        redo A;
      }
    } elsif ($state == ESCAPE_STATE or
             $state == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|
      ## (|ESCAPE_STATE|), or the eighth character, which cannot be a
      ## digit anymore (|ESCAPE_BEFORE_NL_STATE|).
      my $digit = $state == ESCAPE_STATE ? _hex_digit_value ($c) : undef;
      if (defined $digit) {
        $char = $char * 0x10 + $digit;
        $self->{state} = ESCAPE_BEFORE_NL_STATE if ++$i == 7;
        $advance->();
        redo A;
      } elsif ($c == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $advance->();
        redo A;
      } else {
        $finish_escape->();
        if ($c == 0x0020 or # SP
            $c == 0x000A or # \n
            $c == 0x0009 or # \t
            $c == 0x000C) { # \f
          ## NOTE: The white space terminating |unicode| is part of it.
          $advance->();
        } # otherwise, reprocess
        redo A;
      }
    } elsif ($state == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape| or in |nl|.
      $finish_escape->();
      $advance->() if $c == 0x000A; # \n; otherwise, reprocess
      redo A;
    } elsif ($state == STRING_STATE) {
      ## NOTE: A character in |string| in |STRING|, or a character in
      ## |invalid| in |INVALID|, where |$q| is the opening quote.
      if ($c == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $advance->();
        redo A;
      } elsif ($c == $q) { # " | '
        $self->{state} = BEFORE_TOKEN_STATE;
        $advance->();
        $emit->($current_token);
        last A;
      } elsif ($c == 0x000A or # \n
               $c == 0x000D or # \r
               $c == 0x000C or # \f
               $c == -1) {
        $current_token->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        $emit->($current_token);
        last A;
      } else {
        $current_token->{value} .= chr $c;
        # stay in the state
        $advance->();
        redo A;
      }
    } elsif ($state == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (_is_digit ($c)) {
        $current_token->{value} .= chr $c;
        # stay in the state
        $advance->();
        redo A;
      } elsif ($c == 0x002E) { # .
        ## NOTE: The |.| is appended only if a digit follows.
        $self->{state} = NUMBER_DOT_STATE;
        $advance->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($state == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (_is_digit ($c)) {
        $current_token->{value} .= '.' . chr $c;
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $advance->();
        redo A;
      } else {
        ## NOTE: The |.| is not part of the number.
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $emit->($current_token, _delim_token ('.'));
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        last A;
      }
    } elsif ($state == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning
      ## of |num|.
      if (_is_digit ($c)) {
        $current_token->{value} .= chr $c;
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $advance->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        $emit->(_delim_token ('.'));
        last A;
      }
    } elsif ($state == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (_is_digit ($c)) {
        $current_token->{value} .= chr $c;
        # stay in the state
        $advance->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A

  return shift @$queue;
} # get_next_token
606    
607     1;
608     # $Date:$

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24