/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (hide annotations) (download)
Sat Sep 8 02:58:24 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.3: +38 -2 lines
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 02:58:20 -0000
	* Tokenizer.pm: |COMMENT| is implemented.
	A bug in the treatment of |@-->| is fixed.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
## Tokenizer states.  |$self->{state}| always holds one of these
## values; |get_next_token| dispatches on it.  Each name becomes an
## inlinable constant function with an empty prototype.
use constant {
  BEFORE_TOKEN_STATE      => 0,
  BEFORE_NMSTART_STATE    => 1,
  NAME_STATE              => 2,
  ESCAPE_OPEN_STATE       => 3,
  STRING_STATE            => 4,
  HASH_OPEN_STATE         => 5,
  NUMBER_STATE            => 6,
  NUMBER_FRACTION_STATE   => 7,
  AFTER_NUMBER_STATE      => 8,
  URI_BEFORE_WSP_STATE    => 9,
  ESCAPE_STATE            => 10,
  ESCAPE_BEFORE_LF_STATE  => 11,
  ESCAPE_BEFORE_NL_STATE  => 12,
  NUMBER_DOT_STATE        => 13,
  NUMBER_DOT_NUMBER_STATE => 14,
  DELIM_STATE             => 15,
  URI_UNQUOTED_STATE      => 16,
  URI_AFTER_WSP_STATE     => 17,
  AFTER_AT_STATE          => 18,
  AFTER_AT_HYPHEN_STATE   => 19,
};
24 wakaba 1.2
## Token type codes.  Every token hash produced by |get_next_token|
## carries one of these values in its |type| member.  Numbering starts
## at 1; each name becomes an inlinable constant function with an
## empty prototype.
use constant {
  IDENT_TOKEN                 => 1,
  ATKEYWORD_TOKEN             => 2,
  HASH_TOKEN                  => 3,
  FUNCTION_TOKEN              => 4,
  URI_TOKEN                   => 5,
  URI_INVALID_TOKEN           => 6,
  URI_PREFIX_TOKEN            => 7,
  URI_PREFIX_INVALID_TOKEN    => 8,
  STRING_TOKEN                => 9,
  INVALID_TOKEN               => 10,
  NUMBER_TOKEN                => 11,
  DIMENSION_TOKEN             => 12,
  PERCENTAGE_TOKEN            => 13,
  UNICODE_RANGE_TOKEN         => 14,
  UNICODE_RANGE_INVALID_TOKEN => 15,
  DELIM_TOKEN                 => 16,
  PLUS_TOKEN                  => 17,
  GREATER_TOKEN               => 18,
  COMMA_TOKEN                 => 19,
  TILDE_TOKEN                 => 20,
  DASHMATCH_TOKEN             => 21,
  PREFIXMATCH_TOKEN           => 22,
  SUFFIXMATCH_TOKEN           => 23,
  SUBSTRINGMATCH_TOKEN        => 24,
  INCLUDES_TOKEN              => 25,
  SEMICOLON_TOKEN             => 26,
  LBRACE_TOKEN                => 27,
  RBRACE_TOKEN                => 28,
  LPAREN_TOKEN                => 29,
  RPAREN_TOKEN                => 30,
  LBRACKET_TOKEN              => 31,
  RBRACKET_TOKEN              => 32,
  S_TOKEN                     => 33,
  CDO_TOKEN                   => 34,
  CDC_TOKEN                   => 35,
  COMMENT_TOKEN               => 36,
  COMMENT_INVALID_TOKEN       => 37,
  EOF_TOKEN                   => 38,
};
63    
## Printable names of the token types: |$TokenName[N]| is the name of
## the token whose type code is N.  Type codes start at 1, so slot 0
## only holds the placeholder string |0|.
our @TokenName = (
  '0',
  qw(IDENT ATKEYWORD HASH FUNCTION),
  qw(URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID),
  qw(STRING INVALID),
  qw(NUMBER DIMENSION PERCENTAGE),
  qw(UNICODE_RANGE UNICODE_RANGE_INVALID),
  qw(DELIM PLUS GREATER COMMA TILDE),
  qw(DASHMATCH PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES),
  qw(SEMICOLON LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET),
  qw(S CDO CDC),
  qw(COMMENT COMMENT_INVALID),
  qw(EOF),
);
72    
## Constructor.  Returns a new tokenizer object blessed into the
## invocant class, with these initial members:
##   token    - empty queue of already-produced tokens; |get_next_token|
##              drains it before reading further input.
##   get_char - callback returning the next character code; the default
##              always returns -1, which |get_next_token| reports as
##              |EOF_TOKEN|.
##   onerror  - error callback; the default does nothing.
sub new ($) {
  my $class = shift;
  my %member = (
    token    => [],
    get_char => sub { -1 },
    onerror  => sub { },
  );
  return bless \%member, $class;
} # new
78    
## Prepares the tokenizer for reading: puts the state machine into
## |BEFORE_TOKEN_STATE| and then fetches the first character from the
## |get_char| callback into |$self->{c}|.  The queue in
## |$self->{token}| is left untouched.  The value of the final
## assignment (the fetched character) is the implicit return value.
sub init ($) {
  my ($self) = @_;
  my $next_char = $self->{get_char};
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $next_char->();
} # init
84    
85     sub get_next_token ($) {
86     my $self = shift;
87     if (@{$self->{token}}) {
88     return shift @{$self->{token}};
89     }
90    
91     my $current_token;
92     my $char;
93     my $num; # |{num}|, if any.
94     my $i; # |$i + 1|th character in |unicode| in |escape|.
95 wakaba 1.3 my $q;
96     ## NOTE:
97     ## 0: in |ident|.
98     ## 1: in |URI| outside of |string|.
99     ## 0x0022: in |string1| or |invalid1|.
100     ## 0x0027: in |string2| or |invalid2|.
101 wakaba 1.1
102     A: {
103     if ($self->{state} == BEFORE_TOKEN_STATE) {
104     if ($self->{c} == 0x002D) { # -
105     ## NOTE: |-| in |ident| in |IDENT|
106     $current_token = {type => IDENT_TOKEN, value => '-'};
107     $self->{state} = BEFORE_NMSTART_STATE;
108     $self->{c} = $self->{get_char}->();
109     redo A;
110 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
111     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
112 wakaba 1.1 $self->{c} == 0x005F or # _
113     $self->{c} > 0x007F) { # nonascii
114     ## NOTE: |nmstart| in |ident| in |IDENT|
115 wakaba 1.2 $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};
116 wakaba 1.1 $self->{state} = NAME_STATE;
117     $self->{c} = $self->{get_char}->();
118     redo A;
119     } elsif ($self->{c} == 0x005C) { # \
120     ## NOTE: |nmstart| in |ident| in |IDENT|
121     $current_token = {type => IDENT_TOKEN, value => ''};
122     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
123     $self->{c} = $self->{get_char}->();
124     redo A;
125     } elsif ($self->{c} == 0x0040) { # @
126     ## NOTE: |@| in |ATKEYWORD|
127     $current_token = {type => ATKEYWORD_TOKEN, value => ''};
128 wakaba 1.3 $self->{state} = AFTER_AT_STATE;
129 wakaba 1.1 $self->{c} = $self->{get_char}->();
130     redo A;
131 wakaba 1.3 } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
132 wakaba 1.1 $current_token = {type => STRING_TOKEN, value => ''};
133 wakaba 1.3 $self->{state} = STRING_STATE; $q = $self->{c};
134 wakaba 1.1 $self->{c} = $self->{get_char}->();
135     redo A;
136     } elsif ($self->{c} == 0x0023) { # #
137     ## NOTE: |#| in |HASH|.
138     $current_token = {type => HASH_TOKEN, value => ''};
139     $self->{state} = HASH_OPEN_STATE;
140     $self->{c} = $self->{get_char}->();
141     redo A;
142     } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
143     ## NOTE: |num|.
144     $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};
145     $self->{state} = NUMBER_STATE;
146     $self->{c} = $self->{get_char}->();
147     redo A;
148     } elsif ($self->{c} == 0x002E) { # .
149     ## NOTE: |num|.
150 wakaba 1.2 $current_token = {type => NUMBER_TOKEN, value => '0'};
151 wakaba 1.1 $self->{state} = NUMBER_FRACTION_STATE;
152     $self->{c} = $self->{get_char}->();
153     redo A;
154 wakaba 1.4 } elsif ($self->{c} == 0x002F) { # /
155     $self->{c} = $self->{get_char}->();
156     if ($self->{c} == 0x002A) { # *
157     C: {
158     $self->{c} = $self->{get_char}->();
159     if ($self->{c} == 0x002A) { # *
160     D: {
161     $self->{c} = $self->{get_char}->();
162     if ($self->{c} == 0x002F) { # /
163     #
164     } elsif ($self->{c} == 0x002A) { # *
165     redo D;
166     } else {
167     redo C;
168     }
169     } # D
170     } elsif ($self->{c} == -1) {
171     # stay in the state
172     # reprocess
173     return {type => COMMENT_INVALID_TOKEN};
174     #redo A;
175     } else {
176     redo C;
177     }
178     } # C
179    
180     # stay in the state.
181     $self->{c} = $self->{get_char}->();
182     redo A;
183     } else {
184     # stay in the state.
185     # reprocess
186     return {type => DELIM_STATE, value => '/'};
187     #redo A;
188     }
189 wakaba 1.1 } elsif ($self->{c} == 0x003C) { # <
190     ## NOTE: |CDO|
191     $self->{c} = $self->{get_char}->();
192     if ($self->{c} == 0x0021) { # !
193     $self->{c} = $self->{get_char}->();
194     if ($self->{c} == 0x002C) { # -
195     $self->{c} = $self->{get_char}->();
196     if ($self->{c} == 0x002C) { # -
197     $self->{state} = BEFORE_TOKEN_STATE;
198     $self->{c} = $self->{get_char}->();
199     return {type => CDO_TOKEN};
200     #redo A;
201     } else {
202     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
203     ## NOTE: |-| in |ident| in |IDENT|
204     $current_token = {type => IDENT_TOKEN, value => '-'};
205     $self->{state} = BEFORE_NMSTART_STATE;
206     #reprocess
207     return {type => DELIM_TOKEN, value => '<'};
208     #redo A;
209     }
210     } else {
211     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
212     $self->{state} = BEFORE_TOKEN_STATE;
213     #reprocess
214     return {type => DELIM_TOKEN, value => '<'};
215     #redo A;
216     }
217     } else {
218     $self->{state} = BEFORE_TOKEN_STATE;
219     #reprocess
220     return {type => DELIM_TOKEN, value => '<'};
221     #redo A;
222     }
223 wakaba 1.2 } elsif (my $t = {
224     0x003B => SEMICOLON_TOKEN, # ;
225     0x007B => LBRACE_TOKEN, # {
226     0x007D => RBRACE_TOKEN, # }
227     0x0028 => LPAREN_TOKEN, # (
228     0x0029 => RPAREN_TOKEN, # )
229     0x005B => LBRACKET_TOKEN, # [
230     0x005D => RBRACKET_TOKEN, # ]
231 wakaba 1.1 }->{$self->{c}}) {
232     # stay in the state
233     $self->{c} = $self->{get_char}->();
234 wakaba 1.2 return {type => $t};
235 wakaba 1.1 # redo A;
236     } elsif ({
237     0x0020 => 1, # SP
238     0x0009 => 1, # \t
239     0x000D => 1, # \r
240     0x000A => 1, # \n
241     0x000C => 1, # \f
242     }->{$self->{c}}) {
243     W: {
244     $self->{c} = $self->{get_char}->();
245     if ({
246     0x0020 => 1, # SP
247     0x0009 => 1, # \t
248     0x000D => 1, # \r
249     0x000A => 1, # \n
250     0x000C => 1, # \f
251     }->{$self->{c}}) {
252     redo W;
253     } elsif (my $v = {
254     0x002B => PLUS_TOKEN, # +
255     0x003E => GREATER_TOKEN, # >
256     0x002C => COMMA_TOKEN, # ,
257     0x007E => TILDE_TOKEN, # ~
258     }->{$self->{c}}) {
259     # stay in the state
260     $self->{c} = $self->{get_char}->();
261     return {type => $v};
262     #redo A;
263     } else {
264     # stay in the state
265     # reprocess
266     return {type => S_TOKEN};
267     #redo A;
268     }
269     } # W
270     } elsif (my $v = {
271     0x007C => DASHMATCH_TOKEN, # |
272     0x005E => PREFIXMATCH_TOKEN, # ^
273     0x0024 => SUFFIXMATCH_TOKEN, # $
274     0x002A => SUBSTRINGMATCH_TOKEN, # *
275     }->{$self->{c}}) {
276 wakaba 1.2 my $c = $self->{c};
277 wakaba 1.1 $self->{c} = $self->{get_char}->();
278     if ($self->{c} == 0x003D) { # =
279     # stay in the state
280     $self->{c} = $self->{get_char}->();
281     return {type => $v};
282     #redo A;
283     } else {
284     # stay in the state
285     # reprocess
286 wakaba 1.2 return {type => DELIM_TOKEN, value => chr $c};
287 wakaba 1.1 #redo A;
288     }
289     } elsif ($self->{c} == 0x002B) { # +
290     # stay in the state
291     $self->{c} = $self->{get_char}->();
292     return {type => PLUS_TOKEN};
293     #redo A;
294     } elsif ($self->{c} == 0x003E) { # >
295     # stay in the state
296     $self->{c} = $self->{get_char}->();
297     return {type => GREATER_TOKEN};
298     #redo A;
299     } elsif ($self->{c} == 0x002C) { # ,
300     # stay in the state
301     $self->{c} = $self->{get_char}->();
302     return {type => COMMA_TOKEN};
303     #redo A;
304     } elsif ($self->{c} == 0x007E) { # ~
305     $self->{c} = $self->{get_char}->();
306     if ($self->{c} == 0x003D) { # =
307     # stay in the state
308     $self->{c} = $self->{get_char}->();
309     return {type => INCLUDES_TOKEN};
310     #redo A;
311     } else {
312     # stay in the state
313     # reprocess
314     return {type => TILDE_TOKEN};
315     #redo A;
316     }
317     } elsif ($self->{c} == -1) {
318     # stay in the state
319     $self->{c} = $self->{get_char}->();
320     return {type => EOF_TOKEN};
321     #redo A;
322     } else {
323     # stay in the state
324     $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};
325     $self->{c} = $self->{get_char}->();
326     return $current_token;
327     #redo A;
328     }
329     } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
330 wakaba 1.3 ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
331     ## |FUNCTION|)
332 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
333     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
334 wakaba 1.1 $self->{c} == 0x005F or # _
335     $self->{c} > 0x007F) { # nonascii
336 wakaba 1.2 $current_token->{value} .= chr $self->{c};
337     $current_token->{type} = DIMENSION_TOKEN
338     if $current_token->{type} == NUMBER_TOKEN;
339 wakaba 1.1 $self->{state} = NAME_STATE;
340     $self->{c} = $self->{get_char}->();
341     redo A;
342     } elsif ($self->{c} == 0x005C) { # \
343 wakaba 1.2 ## TODO: 12-\X, 12-\{nl}
344 wakaba 1.1 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
345     $self->{c} = $self->{get_char}->();
346     redo A;
347     } elsif ($self->{c} == 0x002D and # -
348     $current_token->{type} == IDENT_TOKEN) {
349     $self->{c} = $self->{get_char}->();
350     if ($self->{c} == 0x003E) { # >
351     $self->{state} = BEFORE_TOKEN_STATE;
352     $self->{c} = $self->{get_char}->();
353     return {type => CDC_TOKEN};
354     #redo A;
355     } else {
356     ## NOTE: |-|, |-|, $self->{c}
357     #$current_token = {type => IDENT_TOKEN, value => '-'};
358     # stay in the state
359     # reconsume
360     return {type => DELIM_TOKEN, value => '-'};
361     #redo A;
362     }
363     } else {
364     if ($current_token->{type} == NUMBER_TOKEN) {
365 wakaba 1.2 ## NOTE: |-| after |NUMBER|.
366     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
367     $self->{state} = BEFORE_TOKEN_STATE;
368     # reconsume
369     $current_token->{value} = $current_token->{number};
370     delete $current_token->{number};
371     return $current_token;
372 wakaba 1.1 } else {
373     ## NOTE: |-| not followed by |nmstart|.
374     $self->{state} = BEFORE_TOKEN_STATE;
375     $self->{c} = $self->{get_char}->();
376     return {type => DELIM_TOKEN, value => '-'};
377     }
378     }
379 wakaba 1.3 } elsif ($self->{state} == AFTER_AT_STATE) {
380     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
381     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
382     $self->{c} == 0x005F or # _
383     $self->{c} > 0x007F) { # nonascii
384     $current_token->{value} .= chr $self->{c};
385     $self->{state} = NAME_STATE;
386     $self->{c} = $self->{get_char}->();
387     redo A;
388     } elsif ($self->{c} == 0x002D) { # -
389     $current_token->{value} .= '-';
390     $self->{state} = AFTER_AT_HYPHEN_STATE;
391     $self->{c} = $self->{get_char}->();
392     redo A;
393     } elsif ($self->{c} == 0x005C) { # \
394     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
395     $self->{c} = $self->{get_char}->();
396     redo A;
397     } else {
398     $self->{state} = BEFORE_TOKEN_STATE;
399     # reprocess
400     return {type => DELIM_TOKEN, value => '@'};
401     }
402     } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
403     if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
404     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
405     $self->{c} == 0x005F or # _
406     $self->{c} > 0x007F) { # nonascii
407     $current_token->{value} .= chr $self->{c};
408     $self->{state} = NAME_STATE;
409     $self->{c} = $self->{get_char}->();
410     redo A;
411     } elsif ($self->{c} == 0x002D) { # -
412     $self->{c} = $self->{get_char}->();
413     if ($self->{c} == 0x003E) { # >
414 wakaba 1.4 unshift @{$self->{token}}, {type => CDC_TOKEN};
415 wakaba 1.3 $self->{state} = BEFORE_TOKEN_STATE;
416     $self->{c} = $self->{get_char}->();
417 wakaba 1.4 return {type => DELIM_TOKEN, value => '@'};
418 wakaba 1.3 #redo A;
419     } else {
420     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
421     $current_token = {type => IDENT_TOKEN, value => '-'};
422     $self->{state} = BEFORE_NMSTART_STATE;
423     # reprocess
424     return {type => DELIM_TOKEN, value => '@'};
425     #redo A;
426     }
427     } elsif ($self->{c} == 0x005C) { # \
428     ## TODO: @-\{nl}
429     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
430     $self->{c} = $self->{get_char}->();
431     redo A;
432     } else {
433     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
434     $self->{state} = BEFORE_TOKEN_STATE;
435     # reprocess
436     return {type => DELIM_TOKEN, value => '@'};
437     }
438 wakaba 1.1 } elsif ($self->{state} == AFTER_NUMBER_STATE) {
439     if ($self->{c} == 0x002D) { # -
440     ## NOTE: |-| in |ident|.
441     $current_token->{value} = '-';
442     $self->{state} = BEFORE_NMSTART_STATE;
443     $self->{c} = $self->{get_char}->();
444     redo A;
445 wakaba 1.2 } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
446     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
447 wakaba 1.1 $self->{c} == 0x005F or # _
448     $self->{c} > 0x007F) { # nonascii
449     ## NOTE: |nmstart| in |ident|.
450 wakaba 1.2 $current_token->{value} = chr $self->{c};
451     $current_token->{type} = DIMENSION_TOKEN;
452 wakaba 1.1 $self->{state} = NAME_STATE;
453     $self->{c} = $self->{get_char}->();
454     redo A;
455     } elsif ($self->{c} == 0x005C) { # \
456     ## NOTE: |nmstart| in |ident| in |IDENT|
457     $current_token->{value} = '';
458     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
459     $self->{c} = $self->{get_char}->();
460     redo A;
461     } elsif ($self->{c} == 0x0025) { # %
462     $current_token->{type} = PERCENTAGE_TOKEN;
463     $self->{state} = BEFORE_TOKEN_STATE;
464     $self->{c} = $self->{get_char}->();
465     return $current_token;
466     #redo A;
467     } else {
468     $self->{state} = BEFORE_TOKEN_STATE;
469     # reprocess
470     return $current_token;
471     #redo A;
472     }
473     } elsif ($self->{state} == HASH_OPEN_STATE) {
474     ## NOTE: The first |nmchar| in |name| in |HASH|.
475 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
476     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
477     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
478 wakaba 1.1 $self->{c} == 0x002D or # -
479     $self->{c} == 0x005F or # _
480     $self->{c} > 0x007F) { # nonascii
481 wakaba 1.2 $current_token->{value} .= chr $self->{c};
482 wakaba 1.1 $self->{state} = NAME_STATE;
483     $self->{c} = $self->{get_char}->();
484     redo A;
485     } elsif ($self->{c} == 0x005C) { # \
486     $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
487     $self->{c} = $self->{get_char}->();
488     redo A;
489     } else {
490     $self->{state} = BEFORE_TOKEN_STATE;
491     $self->{c} = $self->{get_char}->();
492     return {type => DELIM_TOKEN, value => '#'};
493     #redo A;
494     }
495     } elsif ($self->{state} == NAME_STATE) {
496     ## NOTE: |nmchar| in (|ident| or |name|).
497 wakaba 1.2 if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
498     (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
499     (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
500 wakaba 1.1 $self->{c} == 0x005F or # _
501     $self->{c} == 0x002D or # -
502     $self->{c} > 0x007F) { # nonascii
503 wakaba 1.2 $current_token->{value} .= chr $self->{c};
504 wakaba 1.1 # stay in the state
505     $self->{c} = $self->{get_char}->();
506     redo A;
507     } elsif ($self->{c} == 0x005C) { # \
508 wakaba 1.3 $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
509 wakaba 1.1 $self->{c} = $self->{get_char}->();
510     redo A;
511     } elsif ($self->{c} == 0x0028 and # (
512     $current_token->{type} == IDENT_TOKEN) { # (
513 wakaba 1.3 my $func_name = $current_token->{value};
514     $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
515     if ($func_name eq 'url' or $func_name eq 'url-prefix') {
516     if ($current_token->{has_escape}) {
517     ## TODO: warn
518     }
519     $current_token->{type}
520     = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
521     $current_token->{value} = '';
522 wakaba 1.1 $self->{state} = URI_BEFORE_WSP_STATE;
523     $self->{c} = $self->{get_char}->();
524     redo A;
525     } else {
526     $current_token->{type} = FUNCTION_TOKEN;
527     $self->{state} = BEFORE_TOKEN_STATE;
528     $self->{c} = $self->{get_char}->();
529     return $current_token;
530     #redo A;
531     }
532     } else {
533     $self->{state} = BEFORE_TOKEN_STATE;
534     # reconsume
535     return $current_token;
536     #redo A;
537     }
538 wakaba 1.3 } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
539     while ({
540     0x0020 => 1, # SP
541     0x0009 => 1, # \t
542     0x000D => 1, # \r
543     0x000A => 1, # \n
544     0x000C => 1, # \f
545     }->{$self->{c}}) {
546     $self->{c} = $self->{get_char}->();
547     }
548     if ($self->{c} == -1) {
549     $current_token->{type} = {
550     URI_TOKEN, URI_INVALID_TOKEN,
551     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
552     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
553     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
554     }->{$current_token->{type}};
555     $self->{state} = BEFORE_TOKEN_STATE;
556     $self->{c} = $self->{get_char}->();
557     return $current_token;
558     #redo A;
559     } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
560     ## TODO: Should we consider matches of "(" and ")"?
561     $current_token->{type} = {
562     URI_TOKEN, URI_INVALID_TOKEN,
563     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
564     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
565     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
566     }->{$current_token->{type}};
567     $self->{state} = URI_UNQUOTED_STATE;
568     $self->{c} = $self->{get_char}->();
569     redo A;
570     } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
571     $self->{state} = STRING_STATE; $q = $self->{c};
572     $self->{c} = $self->{get_char}->();
573     redo A;
574     } elsif ($self->{c} == 0x0029) { # )
575     $self->{state} = BEFORE_TOKEN_STATE;
576     $self->{c} = $self->{get_char}->();
577     return $current_token;
578     #redo A;
579     } elsif ($self->{c} == 0x005C) { # \
580     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
581     $self->{c} = $self->{get_char}->();
582     redo A;
583     } else {
584     $current_token->{value} .= chr $self->{c};
585     $self->{state} = URI_UNQUOTED_STATE;
586     $self->{c} = $self->{get_char}->();
587     redo A;
588     }
589     } elsif ($self->{state} == URI_UNQUOTED_STATE) {
590     if ({
591     0x0020 => 1, # SP
592     0x0009 => 1, # \t
593     0x000D => 1, # \r
594     0x000A => 1, # \n
595     0x000C => 1, # \f
596     }->{$self->{c}}) {
597     $self->{state} = URI_AFTER_WSP_STATE;
598     $self->{c} = $self->{get_char}->();
599     redo A;
600     } elsif ($self->{c} == -1) {
601     $current_token->{type} = {
602     URI_TOKEN, URI_INVALID_TOKEN,
603     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
604     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
605     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
606     }->{$current_token->{type}};
607     $self->{state} = BEFORE_TOKEN_STATE;
608     $self->{c} = $self->{get_char}->();
609     return $current_token;
610     #redo A;
611     } elsif ($self->{c} < 0x0020 or {
612     0x0022 => 1, # "
613     0x0027 => 1, # '
614     0x0028 => 1, # (
615     }->{$self->{c}}) { # C0 or (
616     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
617     $current_token->{type} = {
618     URI_TOKEN, URI_INVALID_TOKEN,
619     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
620     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
621     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
622     }->{$current_token->{type}};
623     # stay in the state.
624     $self->{c} = $self->{get_char}->();
625     redo A;
626     } elsif ($self->{c} == 0x0029) { # )
627     $self->{state} = BEFORE_TOKEN_STATE;
628     $self->{c} = $self->{get_char}->();
629     return $current_token;
630     #redo A;
631     } elsif ($self->{c} == 0x005C) { # \
632     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
633     $self->{c} = $self->{get_char}->();
634     redo A;
635     } else {
636     $current_token->{value} .= chr $self->{c};
637     # stay in the state.
638     $self->{c} = $self->{get_char}->();
639     redo A;
640     }
641     } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
642     if ({
643     0x0020 => 1, # SP
644     0x0009 => 1, # \t
645     0x000D => 1, # \r
646     0x000A => 1, # \n
647     0x000C => 1, # \f
648     }->{$self->{c}}) {
649     # stay in the state.
650     $self->{c} = $self->{get_char}->();
651     redo A;
652     } elsif ($self->{c} == -1) {
653     $current_token->{type} = {
654     URI_TOKEN, URI_INVALID_TOKEN,
655     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
656     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
657     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
658     }->{$current_token->{type}};
659     $self->{state} = BEFORE_TOKEN_STATE;
660     $self->{c} = $self->{get_char}->();
661     return $current_token;
662     #redo A;
663     } elsif ($self->{c} == 0x0029) { # )
664     $self->{state} = BEFORE_TOKEN_STATE;
665     $self->{c} = $self->{get_char}->();
666     return $current_token;
667     #redo A;
668     } elsif ($self->{c} == 0x005C) { # \
669     $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
670     $self->{c} = $self->{get_char}->();
671     redo A;
672     } else {
673     ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
674     $current_token->{type} = {
675     URI_TOKEN, URI_INVALID_TOKEN,
676     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
677     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
678     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
679     }->{$current_token->{type}};
680     # stay in the state.
681     $self->{c} = $self->{get_char}->();
682     redo A;
683     }
684 wakaba 1.1 } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
685     $current_token->{has_escape} = 1;
686     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
687     ## NOTE: second character of |unicode| in |escape|.
688     $char = $self->{c} - 0x0030;
689     $self->{state} = ESCAPE_STATE; $i = 2;
690     $self->{c} = $self->{get_char}->();
691     redo A;
692     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
693     ## NOTE: second character of |unicode| in |escape|.
694     $char = $self->{c} - 0x0041 + 0xA;
695     $self->{state} = ESCAPE_STATE; $i = 2;
696     $self->{c} = $self->{get_char}->();
697     redo A;
698 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
699 wakaba 1.1 ## NOTE: second character of |unicode| in |escape|.
700     $char = $self->{c} - 0x0061 - 0xA;
701     $self->{state} = ESCAPE_STATE; $i = 2;
702     $self->{c} = $self->{get_char}->();
703     redo A;
704     } elsif ($self->{c} == 0x000A or # \n
705     $self->{c} == 0x000C) { # \f
706     if ($q == 0) {
707     ## NOTE: In |escape| in ... in |ident|.
708     $self->{state} = BEFORE_TOKEN_STATE;
709     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
710     return $current_token;
711     # reconsume
712     #redo A;
713 wakaba 1.3 } elsif ($q == 1) {
714     ## NOTE: In |escape| in |URI|.
715     $current_token->{type} = {
716     URI_TOKEN, URI_INVALID_TOKEN,
717     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
718     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
719     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
720     }->{$current_token->{type}};
721     $current_token->{value} .= chr $self->{c};
722     $self->{state} = URI_UNQUOTED_STATE;
723     $self->{c} = $self->{get_char}->();
724     redo A;
725 wakaba 1.1 } else {
726     ## Note: In |nl| in ... in |string| or |ident|.
727     $current_token->{value} .= chr $self->{c};
728     $self->{state} = STRING_STATE;
729     $self->{c} = $self->{get_char}->();
730     redo A;
731     }
732     } elsif ($self->{c} == 0x000D) { # \r
733     if ($q == 0) {
734     ## NOTE: In |escape| in ... in |ident|.
735     $self->{state} = BEFORE_TOKEN_STATE;
736     unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
737     return $current_token;
738     # reconsume
739     #redo A;
740 wakaba 1.3 } elsif ($q == 1) {
741     $current_token->{type} = {
742     URI_TOKEN, URI_INVALID_TOKEN,
743     URI_INVALID_TOKEN, URI_INVALID_TOKEN,
744     URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
745     URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
746     }->{$current_token->{type}};
747     $current_token->{value} .= "\x0D\x0A";
748     $self->{state} = URI_UNQUOTED_STATE;
749     $self->{c} = $self->{get_char}->();
750     redo A;
751 wakaba 1.1 } else {
752     ## Note: In |nl| in ... in |string| or |ident|.
753     $current_token->{value} .= "\x0D\x0A";
754     $self->{state} = ESCAPE_BEFORE_LF_STATE;
755     $self->{c} = $self->{get_char}->();
756     redo A;
757     }
758     } else {
759     ## NOTE: second character of |escape|.
760     $current_token->{value} .= chr $self->{c};
761 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
762     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
763 wakaba 1.1 $self->{c} = $self->{get_char}->();
764     redo A;
765     }
766     } elsif ($self->{state} == ESCAPE_STATE) {
767     ## NOTE: third..seventh character of |unicode| in |escape|.
768     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
769     $char = $char * 0x10 + $self->{c} - 0x0030;
770     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
771     $self->{c} = $self->{get_char}->();
772     redo A;
773     } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
774     $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
775     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
776     $self->{c} = $self->{get_char}->();
777     redo A;
778 wakaba 1.2 } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
779 wakaba 1.1 $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;
780     $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
781     $self->{c} = $self->{get_char}->();
782     redo A;
783     } elsif ($self->{c} == 0x0020 or # SP
784     $self->{c} == 0x000A or # \n
785     $self->{c} == 0x0009 or # \t
786     $self->{c} == 0x000C) { # \f
787     $current_token->{value} .= chr $char;
788 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
789     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
790 wakaba 1.1 $self->{c} = $self->{get_char}->();
791     redo A;
792     } elsif ($self->{c} == 0x000D) { # \r
793     $self->{state} = ESCAPE_BEFORE_LF_STATE;
794     $self->{c} = $self->{get_char}->();
795     redo A;
796     } else {
797     $current_token->{value} .= chr $char;
798 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
799     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
800 wakaba 1.1 # reconsume
801     redo A;
802     }
803     } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
804     ## NOTE: eightth character of |unicode| in |escape|.
805     if ($self->{c} == 0x0020 or # SP
806     $self->{c} == 0x000A or # \n
807     $self->{c} == 0x0009 or # \t
808     $self->{c} == 0x000C) { # \f
809     $current_token->{value} .= chr $char;
810 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
811     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
812 wakaba 1.1 $self->{c} = $self->{get_char}->();
813     redo A;
814     } elsif ($self->{c} == 0x000D) { # \r
815     $self->{state} = ESCAPE_BEFORE_NL_STATE;
816     $self->{c} = $self->{get_char}->();
817     redo A;
818     } else {
819     $current_token->{value} .= chr $char;
820 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
821     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
822 wakaba 1.1 # reconsume
823     redo A;
824     }
825     } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
826     ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
827     if ($self->{c} == 0x000A) { # \n
828     $current_token->{value} .= chr $char;
829 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
830     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
831 wakaba 1.1 $self->{c} = $self->{get_char}->();
832     redo A;
833     } else {
834     $current_token->{value} .= chr $char;
835 wakaba 1.3 $self->{state} = $q == 0 ? NAME_STATE :
836     $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
837 wakaba 1.1 # reconsume
838     redo A;
839     }
840     } elsif ($self->{state} == STRING_STATE) {
841     ## NOTE: A character in |string$Q| in |string| in |STRING|, or
842     ## a character in |invalid$Q| in |invalid| in |INVALID|,
843     ## where |$Q = $q == 0x0022 ? 1 : 2|.
844 wakaba 1.3 ## Or, in |URI|.
845 wakaba 1.1 if ($self->{c} == 0x005C) { # \
846     $self->{state} = ESCAPE_OPEN_STATE;
847     $self->{c} = $self->{get_char}->();
848     redo A;
849     } elsif ($self->{c} == $q) { # " | '
850 wakaba 1.3 if ($current_token->{type} == STRING_TOKEN) {
851     $self->{state} = BEFORE_TOKEN_STATE;
852     $self->{c} = $self->{get_char}->();
853     return $current_token;
854     #redo A;
855     } else {
856     $self->{state} = URI_AFTER_WSP_STATE;
857     $self->{c} = $self->{get_char}->();
858     redo A;
859     }
860 wakaba 1.1 } elsif ($self->{c} == 0x000A or # \n
861     $self->{c} == 0x000D or # \r
862     $self->{c} == 0x000C or # \f
863     $self->{c} == -1) {
864     $current_token->{type} = INVALID_TOKEN;
865     $self->{state} = BEFORE_TOKEN_STATE;
866     # reconsume
867     return $current_token;
868     #redo A;
869     } else {
870     $current_token->{value} .= chr $self->{c};
871     # stay in the state
872     $self->{c} = $self->{get_char}->();
873     redo A;
874     }
875     } elsif ($self->{state} == NUMBER_STATE) {
876     ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
877     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
878     $current_token->{value} .= chr $self->{c};
879     # stay in the state
880     $self->{c} = $self->{get_char}->();
881     redo A;
882     } elsif ($self->{c} == 0x002E) { # .
883     $self->{state} = NUMBER_DOT_STATE;
884     $self->{c} = $self->{get_char}->();
885     redo A;
886     } else {
887 wakaba 1.2 $current_token->{number} = $current_token->{value};
888     $current_token->{value} = '';
889 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
890     # reprocess
891 wakaba 1.2 redo A;
892 wakaba 1.1 }
893     } elsif ($self->{state} == NUMBER_DOT_STATE) {
894     ## NOTE: The character immediately following |.| in |num|.
895     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
896 wakaba 1.2 $current_token->{value} .= '.' . chr $self->{c};
897 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
898     $self->{c} = $self->{get_char}->();
899     redo A;
900     } else {
901     unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};
902 wakaba 1.2 $current_token->{number} = $current_token->{value};
903     $current_token->{value} = '';
904 wakaba 1.1 $self->{state} = BEFORE_TOKEN_STATE;
905     # reprocess
906     return $current_token;
907     #redo A;
908     }
909     } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
910     ## NOTE: The character immediately following |.| at the beginning of |num|.
911     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
912 wakaba 1.2 $current_token->{value} .= '.' . chr $self->{c};
913 wakaba 1.1 $self->{state} = NUMBER_DOT_NUMBER_STATE;
914     $self->{c} = $self->{get_char}->();
915     redo A;
916     } else {
917     $self->{state} = BEFORE_TOKEN_STATE;
918     $self->{c} = $self->{get_char}->();
919     return {type => DELIM_TOKEN, value => '.'};
920     #redo A;
921     }
922     } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
923     ## NOTE: |[0-9]| in |num| after |.|.
924     if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
925     $current_token->{value} .= chr $self->{c};
926     # stay in the state
927     $self->{c} = $self->{get_char}->();
928     redo A;
929     } else {
930 wakaba 1.2 $current_token->{number} = $current_token->{value};
931     $current_token->{value} = '';
932 wakaba 1.1 $self->{state} = AFTER_NUMBER_STATE;
933     # reprocess
934 wakaba 1.2 redo A;
935 wakaba 1.1 }
936     } else {
937     die "$0: Unknown state |$self->{state}|";
938     }
939     } # A
940    
941     ## TODO: |URI|, |UNICODE-RANGE| (|COMMENT| is implemented as of revision 1.4)
942    
943     } # get_next_token
944    
945     1;
946 wakaba 1.4 # $Date: 2007/09/08 02:40:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24