/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Sat Sep 8 01:31:44 2007 UTC (17 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +122 -45 lines
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 01:31:14 -0000
2007-09-08  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm: First working version.

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
## Tokenizer states.  |$self->{state}| always holds one of these values;
## |get_next_token| dispatches on it.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
};
20    
## Token types.  The |type| member of every token returned by
## |get_next_token| holds one of these values; the same numbers index
## |@TokenName|.
use constant {
  IDENT_TOKEN                 => 1,
  ATKEYWORD_TOKEN             => 2,
  HASH_TOKEN                  => 3,
  FUNCTION_TOKEN              => 4,
  URI_TOKEN                   => 5,
  URI_INVALID_TOKEN           => 6,
  URI_PREFIX_TOKEN            => 7,
  URI_PREFIX_INVALID_TOKEN    => 8,
  STRING_TOKEN                => 9,
  INVALID_TOKEN               => 10,
  NUMBER_TOKEN                => 11,
  DIMENSION_TOKEN             => 12,
  PERCENTAGE_TOKEN            => 13,
  UNICODE_RANGE_TOKEN         => 14,
  UNICODE_RANGE_INVALID_TOKEN => 15,
  DELIM_TOKEN                 => 16,
  PLUS_TOKEN                  => 17,
  GREATER_TOKEN               => 18,
  COMMA_TOKEN                 => 19,
  TILDE_TOKEN                 => 20,
  DASHMATCH_TOKEN             => 21,
  PREFIXMATCH_TOKEN           => 22,
  SUFFIXMATCH_TOKEN           => 23,
  SUBSTRINGMATCH_TOKEN        => 24,
  INCLUDES_TOKEN              => 25,
  SEMICOLON_TOKEN             => 26,
  LBRACE_TOKEN                => 27,
  RBRACE_TOKEN                => 28,
  LPAREN_TOKEN                => 29,
  RPAREN_TOKEN                => 30,
  LBRACKET_TOKEN              => 31,
  RBRACKET_TOKEN              => 32,
  S_TOKEN                     => 33,
  CDO_TOKEN                   => 34,
  CDC_TOKEN                   => 35,
  COMMENT_TOKEN               => 36,
  COMMENT_INVALID_TOKEN       => 37,
  EOF_TOKEN                   => 38,
};
59    
## Printable names of the token types, indexed by the numeric value of
## the corresponding |*_TOKEN| constant.  Token type numbers start at 1,
## so index 0 only holds the placeholder |0|.
our @TokenName = qw(
  0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
  PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
  COMMENT_INVALID EOF
);
68    
## Creates a tokenizer object.
##
## The object starts with an empty |token| queue, a |get_char| callback
## that always reports |-1| (the value |get_next_token| treats as the
## end of the input) and an |onerror| callback that does nothing.
## Callers are expected to replace |get_char| before calling |init|.
sub new ($) {
  my $class = shift;
  my %member = (
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  );
  return bless \%member, $class;
} # new
74    
## Prepares the tokenizer for a new input: puts the state machine into
## |BEFORE_TOKEN_STATE| and reads the first character through the
## |get_char| callback into |$self->{c}|.  The character read is also
## the return value.
sub init ($) {
  my ($self) = @_;
  $self->{state} = BEFORE_TOKEN_STATE;
  return $self->{c} = $self->{get_char}->();
} # init
80    
## Returns the next token of the input as a hash reference.
##
## Every token has a |type| member holding one of the |*_TOKEN|
## constants.  Depending on the type it may also carry |value| (name,
## string content, unit or delimiter character), |number| (the digits
## of a |num|) and |has_escape| (set once a backslash has been seen
## while the token was being built).
##
## Characters are obtained from |$self->{get_char}|, which returns a
## code point, or |-1| at the end of the input.  The character currently
## being examined is kept in |$self->{c}| and the state of the state
## machine in |$self->{state}|, so |init| has to be called first.
## Tokens produced ahead of time are queued in |$self->{token}|.
##
## Corrections relative to the previous revision of this method:
##
##   - |<!--| is recognized (the hyphens were compared with U+002C).
##   - Hex digits |a|..|f| in escapes are converted with |+ 0xA|.
##   - |\r| after a six-digit escape enters |ESCAPE_BEFORE_LF_STATE|.
##   - |.| after a number is queued as |DELIM_TOKEN|, not |DELIM_STATE|.
##   - Strings are closed by the quote that opened them (|$q| now holds
##     its code point).
##   - |url(| consumes only the parenthesis, not the character after it.
##   - A |#|, |.|, |-| or |@| that is not followed by a name no longer
##     swallows the character after it.
##   - |num| followed by |-| uses the reconsuming branch that was
##     unreachable before.
##   - The token under construction survives in |$self->{current_token}|
##     when the method returns while staying in |BEFORE_NMSTART_STATE|.
##   - A |\|, |\r| line continuation in a string no longer appends
##     |chr undef|.
sub get_next_token ($) {
  my $self = shift;

  ## Tokens queued by an earlier call are handed out first.
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  ## The token under construction.  It is normally local to one call;
  ## the two places that return in the middle of a token park it in
  ## |$self->{current_token}|, from where it is picked up again here.
  my $current_token = delete $self->{current_token};
  my $char; # Code point collected from the hex digits of |unicode|.
  my $i; # Position in |unicode|: 2 after the first digit, +1 per digit.
  my $q = 0; # 0 in |ident|; code point of the opening quote in a string.

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
               (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
               $self->{c} == 0x005F or # _
               $self->{c} > 0x007F) { # nonascii
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $current_token = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022) { # "
        ## NOTE: |"| in |string1| in |string| in |STRING|, or
        ## |"| in |invalid1| in |invalid| in |INVALID|.
        $current_token = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = 0x0022;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0027) { # '
        ## NOTE: |'| in |string2| in |string| in |STRING|, or
        ## |'| in |invalid2| in |invalid| in |INVALID|.
        $current_token = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = 0x0027;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $current_token = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
        ## NOTE: |num|.
        $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.  A leading |0| is supplied for |.5|.
        $current_token = {type => NUMBER_TOKEN, value => '0'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO|
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
            } else {
              ## NOTE: |<!-| not followed by |-|.  |<| is returned now,
              ## |!| is queued, and the |-| is kept as the beginning of
              ## an |IDENT| for the call after that.
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|
              $self->{current_token} = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              # reprocess
              return {type => DELIM_TOKEN, value => '<'};
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return {type => DELIM_TOKEN, value => '<'};
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '<'};
        }
      } elsif (my $t = {
                 0x003B => SEMICOLON_TOKEN, # ;
                 0x007B => LBRACE_TOKEN, # {
                 0x007D => RBRACE_TOKEN, # }
                 0x0028 => LPAREN_TOKEN, # (
                 0x0029 => RPAREN_TOKEN, # )
                 0x005B => LBRACKET_TOKEN, # [
                 0x005D => RBRACKET_TOKEN, # ]
               }->{$self->{c}}) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $t};
      } elsif ({
                 0x0020 => 1, # SP
                 0x0009 => 1, # \t
                 0x000D => 1, # \r
                 0x000A => 1, # \n
                 0x000C => 1, # \f
               }->{$self->{c}}) {
        ## NOTE: A run of white space is a single |S|, unless it is
        ## followed by |+|, |>|, |,| or |~|; then the white space is
        ## dropped and only that token is returned.
        W: {
          $self->{c} = $self->{get_char}->();
          if ({
                0x0020 => 1, # SP
                0x0009 => 1, # \t
                0x000D => 1, # \r
                0x000A => 1, # \n
                0x000C => 1, # \f
              }->{$self->{c}}) {
            redo W;
          } elsif (my $v = {
                     0x002B => PLUS_TOKEN, # +
                     0x003E => GREATER_TOKEN, # >
                     0x002C => COMMA_TOKEN, # ,
                     0x007E => TILDE_TOKEN, # ~
                   }->{$self->{c}}) {
            ## NOTE(review): |~| is returned as |TILDE| here without
            ## looking for a following |=|, so |~=| preceded by white
            ## space is not reported as |INCLUDES|.
            # stay in the state
            $self->{c} = $self->{get_char}->();
            return {type => $v};
          } else {
            # stay in the state
            # reprocess
            return {type => S_TOKEN};
          }
        } # W
      } elsif (my $v = {
                 0x007C => DASHMATCH_TOKEN, # |
                 0x005E => PREFIXMATCH_TOKEN, # ^
                 0x0024 => SUFFIXMATCH_TOKEN, # $
                 0x002A => SUBSTRINGMATCH_TOKEN, # *
               }->{$self->{c}}) {
        ## NOTE: These are match operators only when followed by |=|;
        ## otherwise the character itself is a |DELIM|.
        my $c = $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $v};
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => chr $c};
        }
      } elsif ($self->{c} == 0x002B) { # +
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => PLUS_TOKEN};
      } elsif ($self->{c} == 0x003E) { # >
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => GREATER_TOKEN};
      } elsif ($self->{c} == 0x002C) { # ,
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => COMMA_TOKEN};
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
        }
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
      } else {
        # stay in the state
        $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $current_token;
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or |ATKEYWORD|)
      if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
          (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
          $self->{c} == 0x005F or # _
          $self->{c} > 0x007F) { # nonascii
        $current_token->{value} .= chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN
            if $current_token->{type} == NUMBER_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: 12-\X, 12-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $current_token->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
        } else {
          ## NOTE: |-|, |-|, $self->{c}.  The first |-| is returned as a
          ## |DELIM|; the second one remains the beginning of an |IDENT|
          ## and is kept for the next call.
          $self->{current_token} = $current_token;
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
        }
      } else {
        if ($current_token->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |NUMBER|, not followed by |nmstart|.
          ## NOTE(review): unlike the other places that return a
          ## |NUMBER|, this one moves the digits from |number| to
          ## |value|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          $current_token->{value} = $current_token->{number};
          delete $current_token->{number};
          return $current_token;
        } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {
          ## NOTE: |@| not followed by |nmstart|.
          ## TODO: |@-| followed by |nmstart| should start an |ATKEYWORD|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          return {type => DELIM_TOKEN, value => '@'};
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      ## NOTE: The digits are in |number| at this point; |value| is
      ## reused for the unit of a |DIMENSION|.
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $current_token->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
               (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
               $self->{c} == 0x005F or # _
               $self->{c} > 0x007F) { # nonascii
        ## NOTE: |nmstart| in |ident|.
        $current_token->{value} = chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        ## NOTE(review): the type stays |NUMBER| on this path even when
        ## the escape goes on to contribute to |value|.
        $current_token->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $current_token->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
          (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
          (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
          $self->{c} == 0x002D or # -
          $self->{c} == 0x005F or # _
          $self->{c} > 0x007F) { # nonascii
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: |#| not followed by |nmchar|.
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return {type => DELIM_TOKEN, value => '#'};
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
          (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
          (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
          $self->{c} == 0x005F or # _
          $self->{c} == 0x002D or # -
          $self->{c} > 0x007F) { # nonascii
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; # $q is 0 in this state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $current_token->{type} == IDENT_TOKEN) {
        ## NOTE: This version of the tokenizer does not support the |URI|
        ## token type, so |url(| is reported as a |FUNCTION| like any
        ## other name followed by |(|.  Note that browsers disagree in
        ## how to tokenize |url| function.
        $current_token->{type} = FUNCTION_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      ## NOTE: The character immediately following |\|.
      ## TODO: The end of the input (|-1|) is not treated specially here;
      ## it ends up in the last branch.
      $current_token->{has_escape} = 1;
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
        ## NOTE: second character of |unicode| in |escape|.
        $char = $self->{c} - 0x0030;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
        ## NOTE: second character of |unicode| in |escape|.
        $char = $self->{c} - 0x0041 + 0xA;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
        ## NOTE: second character of |unicode| in |escape|.
        $char = $self->{c} - 0x0061 + 0xA;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.  |\| cannot escape a
          ## newline in a name: the name ends and |\| becomes a |DELIM|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          # reconsume
          return $current_token;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          # reconsume
          return $current_token;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= "\x0D\x0A";
          ## No |unicode| is pending, so |ESCAPE_BEFORE_LF_STATE| must
          ## not append a character of its own.
          undef $char;
          $self->{state} = ESCAPE_BEFORE_LF_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        $current_token->{value} .= chr $self->{c};
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
        $char = $char * 0x10 + $self->{c} - 0x0030;
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
        $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
        $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        ## NOTE: One white space character after |unicode| is part of
        ## the escape.
        $current_token->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: eighth character of |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $current_token->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|, or after |\|,
      ## |\r| in a string.  |$char| is undefined in the latter case.
      if ($self->{c} == 0x000A) { # \n
        $current_token->{value} .= chr $char if defined $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char if defined $char;
        $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      if ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        ## NOTE: Unclosed string.
        $current_token->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
      } else {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: The |.| does not belong to the number.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## NOTE: |.| not followed by a digit.
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return {type => DELIM_TOKEN, value => '.'};
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A

  ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|

} # get_next_token
683    
684     1;
685 wakaba 1.2 # $Date: 2007/08/17 11:53:52 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24