/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Sat Sep 8 02:40:47 2007 UTC (17 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.2: +279 -54 lines
++ whatpm/Whatpm/CSS/ChangeLog	8 Sep 2007 02:40:23 -0000
	* Tokenizer.pm: |url()| and |url-prefix()| are implemented.
	A bug in the treatment of |@-| is fixed.

2007-09-08  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::CSS::Tokenizer;
2     use strict;
3    
## Tokenizer states.  |$self->{state}| always holds one of these values;
## |get_next_token| dispatches on it and dies on any other value.  The
## empty prototype lets perl inline each sub as a constant.

sub BEFORE_TOKEN_STATE () { 0 }       # between tokens
sub BEFORE_NMSTART_STATE () { 1 }     # after a leading |-|
sub NAME_STATE () { 2 }               # inside |ident| / |name|
sub ESCAPE_OPEN_STATE () { 3 }        # just after |\|
sub STRING_STATE () { 4 }             # inside a quoted string
sub HASH_OPEN_STATE () { 5 }          # just after |#|
sub NUMBER_STATE () { 6 }             # integer part of |num|
sub NUMBER_FRACTION_STATE () { 7 }    # after a |.| that starts a token
sub AFTER_NUMBER_STATE () { 8 }       # after a complete |num|
sub URI_BEFORE_WSP_STATE () { 9 }     # after |url(| / |url-prefix(|
sub ESCAPE_STATE () { 10 }            # hex digits of |unicode|
sub ESCAPE_BEFORE_LF_STATE () { 11 }  # after the |\r| that ends |unicode|
sub ESCAPE_BEFORE_NL_STATE () { 12 }  # after the sixth hex digit
sub NUMBER_DOT_STATE () { 13 }        # after |.| following digits
sub NUMBER_DOT_NUMBER_STATE () { 14 } # fraction digits of |num|
## NOTE: |get_next_token| has no branch that dispatches on |DELIM_STATE|.
sub DELIM_STATE () { 15 }
sub URI_UNQUOTED_STATE () { 16 }      # inside an unquoted URI
sub URI_AFTER_WSP_STATE () { 17 }     # after the URI, before |)|
sub AFTER_AT_STATE () { 18 }          # just after |@|
sub AFTER_AT_HYPHEN_STATE () { 19 }   # just after |@-|
24 wakaba 1.2
## Token types.  Every token returned by |get_next_token| is a hash
## reference whose |type| is one of these values.  The numbers are also
## the indices of the corresponding names in |@TokenName|.

sub IDENT_TOKEN () { 1 }
sub ATKEYWORD_TOKEN () { 2 }
sub HASH_TOKEN () { 3 }
sub FUNCTION_TOKEN () { 4 }
sub URI_TOKEN () { 5 }
sub URI_INVALID_TOKEN () { 6 }
sub URI_PREFIX_TOKEN () { 7 }
sub URI_PREFIX_INVALID_TOKEN () { 8 }
sub STRING_TOKEN () { 9 }
sub INVALID_TOKEN () { 10 }
sub NUMBER_TOKEN () { 11 }
sub DIMENSION_TOKEN () { 12 }
sub PERCENTAGE_TOKEN () { 13 }
sub UNICODE_RANGE_TOKEN () { 14 }
sub UNICODE_RANGE_INVALID_TOKEN () { 15 }
sub DELIM_TOKEN () { 16 }
sub PLUS_TOKEN () { 17 }            # |+|
sub GREATER_TOKEN () { 18 }         # |>|
sub COMMA_TOKEN () { 19 }           # |,|
sub TILDE_TOKEN () { 20 }           # |~|
sub DASHMATCH_TOKEN () { 21 }       # ||=|
sub PREFIXMATCH_TOKEN () { 22 }     # |^=|
sub SUFFIXMATCH_TOKEN () { 23 }     # |$=|
sub SUBSTRINGMATCH_TOKEN () { 24 }  # |*=|
sub INCLUDES_TOKEN () { 25 }        # |~=|
sub SEMICOLON_TOKEN () { 26 }       # |;|
sub LBRACE_TOKEN () { 27 }          # |{|
sub RBRACE_TOKEN () { 28 }          # |}|
sub LPAREN_TOKEN () { 29 }          # |(|
sub RPAREN_TOKEN () { 30 }          # |)|
sub LBRACKET_TOKEN () { 31 }        # |[|
sub RBRACKET_TOKEN () { 32 }        # |]|
sub S_TOKEN () { 33 }               # run of white space
sub CDO_TOKEN () { 34 }             # |<!--|
sub CDC_TOKEN () { 35 }             # |-->|
sub COMMENT_TOKEN () { 36 }
sub COMMENT_INVALID_TOKEN () { 37 }
sub EOF_TOKEN () { 38 }
63    
## Printable names of the token types: |$TokenName[$token->{type}]| is
## the name of the |*_TOKEN| constant without its |_TOKEN| suffix.  No
## token type has the value 0, so index 0 only holds the placeholder |0|.
our @TokenName = qw(
  0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
  STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
  UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
  PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
  LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
  COMMENT_INVALID EOF
);
72    
## Creates a tokenizer object blessed into the invocant class.
##
## The object starts with:
##   token    - empty queue of tokens waiting to be returned,
##   get_char - callback yielding the next character code; the default
##              always returns |-1|, which the tokenizer treats as the
##              end of the input,
##   onerror  - callback that does nothing by default.
##
## The caller is expected to replace |get_char| and then call |init|
## before asking for tokens.
sub new ($) {
  my $class = shift;
  my $self = bless {
    token => [],
    get_char => sub { -1 },
    onerror => sub { },
  }, $class;
  return $self;
} # new
78    
## Prepares the tokenizer for reading from the start of an input: drops
## any tokens still queued from an earlier run, resets the state machine
## to |BEFORE_TOKEN_STATE| and reads the first character through the
## |get_char| callback into |$self->{c}|.
sub init ($) {
  my $self = shift;
  ## Tokens left over from a previous input must not be replayed.  The
  ## array is emptied in place so that the same array object is kept.
  @{$self->{token}} = ();
  $self->{state} = BEFORE_TOKEN_STATE;
  $self->{c} = $self->{get_char}->();
} # init
84    
## ------ Private helpers of |get_next_token| ------
##
## Each helper takes a character code as produced by the |get_char|
## callback (|-1| stands for the end of the input).

## True if the character is |nmstart|: A..Z, a..z, |_| or non-ASCII.
sub _is_nmstart {
  my $c = shift;
  return ((0x0041 <= $c && $c <= 0x005A) || # A..Z
          (0x0061 <= $c && $c <= 0x007A) || # a..z
          $c == 0x005F ||                   # _
          $c > 0x007F);                     # nonascii
} # _is_nmstart

## True if the character is |nmchar|: |nmstart|, 0..9 or |-|.
sub _is_nmchar {
  my $c = shift;
  return (_is_nmstart ($c) ||
          (0x0030 <= $c && $c <= 0x0039) || # 0..9
          $c == 0x002D);                    # -
} # _is_nmchar

## True if the character is SP, \t, \r, \n or \f.
sub _is_wsp {
  my $c = shift;
  return ($c == 0x0020 || # SP
          $c == 0x0009 || # \t
          $c == 0x000D || # \r
          $c == 0x000A || # \n
          $c == 0x000C);  # \f
} # _is_wsp

## Returns the value (0..15) of a hexadecimal digit, or |undef| if the
## character is not a hexadecimal digit.
sub _hex_digit_value {
  my $c = shift;
  return $c - 0x0030 if 0x0030 <= $c and $c <= 0x0039;       # 0..9
  return $c - 0x0041 + 0xA if 0x0041 <= $c and $c <= 0x0046; # A..F
  return $c - 0x0061 + 0xA if 0x0061 <= $c and $c <= 0x0066; # a..f
  return;
} # _hex_digit_value

## Maps a |URI| or |URI_PREFIX| token type (valid or already invalid) to
## the corresponding |*_INVALID| token type.  Any other type yields
## |undef|.
sub _invalid_uri_type {
  my $type = shift;
  return URI_INVALID_TOKEN
      if $type == URI_TOKEN or $type == URI_INVALID_TOKEN;
  return URI_PREFIX_INVALID_TOKEN
      if $type == URI_PREFIX_TOKEN or $type == URI_PREFIX_INVALID_TOKEN;
  return;
} # _invalid_uri_type

## Returns the state to resume once an escape is finished, depending on
## where the escape occurred (see the note on |$q| in |get_next_token|).
sub _state_after_escape {
  my $q = shift;
  return $q == 0 ? NAME_STATE :
         $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
} # _state_after_escape

## Returns the next token of the input as a hash reference.
##
## Input is read one character at a time through |$self->{get_char}|;
## |$self->{c}| always holds the character that is to be processed next
## (|-1| at the end of the input), and |$self->{state}| holds the current
## |*_STATE| value.  |init| has to be called beforehand.
##
## Keys of the returned token:
##   type       - one of the |*_TOKEN| constants (always present),
##   value      - text of the token, where the token type carries one,
##   number     - the |num| part, for tokens that went through
##                |AFTER_NUMBER_STATE| or |NUMBER_DOT_STATE|,
##   has_escape - true if the token contained a |\|.
##
## Where a single input construct yields several tokens, the extra tokens
## are queued in |$self->{token}| and handed out by the following calls.
##
## Dies if |$self->{state}| is not a known state.
sub get_next_token ($) {
  my $self = shift;

  ## Tokens queued by an earlier call are delivered first.
  if (@{$self->{token}}) {
    return shift @{$self->{token}};
  }

  ## The token under construction.  A few branches have to return a
  ## delimiter while an |IDENT| has already been started; since this
  ## variable does not survive the |return|, such a token is parked in
  ## |$self->{current_token}| and picked up again here.
  my $current_token = delete $self->{current_token};
  my $char; # Code point collected from the hex digits of |unicode|.
  my $i; # |$i + 1|th character in |unicode| in |escape|.
  my $q;
  ## NOTE:
  ##   0: in |ident|.
  ##   1: in |URI| outside of |string|.
  ##   0x0022: in |string1| or |invalid1|.
  ##   0x0027: in |string2| or |invalid2|.

  A: {
    if ($self->{state} == BEFORE_TOKEN_STATE) {
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => '-'};
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token = {type => IDENT_TOKEN, value => ''};
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0040) { # @
        ## NOTE: |@| in |ATKEYWORD|
        $current_token = {type => ATKEYWORD_TOKEN, value => ''};
        $self->{state} = AFTER_AT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $current_token = {type => STRING_TOKEN, value => ''};
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0023) { # #
        ## NOTE: |#| in |HASH|.
        $current_token = {type => HASH_TOKEN, value => ''};
        $self->{state} = HASH_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
        ## NOTE: |num|.
        $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};
        $self->{state} = NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        ## NOTE: |num|.  A leading |0| is supplied for |.5| and the like.
        $current_token = {type => NUMBER_TOKEN, value => '0'};
        $self->{state} = NUMBER_FRACTION_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x003C) { # <
        ## NOTE: |CDO|, i.e. |<!--|.  Anything shorter falls apart into
        ## delimiters.
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x0021) { # !
          $self->{c} = $self->{get_char}->();
          if ($self->{c} == 0x002D) { # -
            $self->{c} = $self->{get_char}->();
            if ($self->{c} == 0x002D) { # -
              $self->{state} = BEFORE_TOKEN_STATE;
              $self->{c} = $self->{get_char}->();
              return {type => CDO_TOKEN};
            } else {
              ## |<!-| followed by something else: |<| and |!| are
              ## delimiters and the |-| may begin an |IDENT|.
              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
              ## NOTE: |-| in |ident| in |IDENT|
              $self->{current_token} = {type => IDENT_TOKEN, value => '-'};
              $self->{state} = BEFORE_NMSTART_STATE;
              # reprocess
              return {type => DELIM_TOKEN, value => '<'};
            }
          } else {
            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
            $self->{state} = BEFORE_TOKEN_STATE;
            # reprocess
            return {type => DELIM_TOKEN, value => '<'};
          }
        } else {
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '<'};
        }
      } elsif (my $punctuation = {
                 0x003B => SEMICOLON_TOKEN, # ;
                 0x007B => LBRACE_TOKEN, # {
                 0x007D => RBRACE_TOKEN, # }
                 0x0028 => LPAREN_TOKEN, # (
                 0x0029 => RPAREN_TOKEN, # )
                 0x005B => LBRACKET_TOKEN, # [
                 0x005D => RBRACKET_TOKEN, # ]
               }->{$self->{c}}) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => $punctuation};
      } elsif (_is_wsp ($self->{c})) {
        ## Skip the whole run of white space.
        do {
          $self->{c} = $self->{get_char}->();
        } while (_is_wsp ($self->{c}));
        ## White space directly followed by |+|, |>|, |,| or |~| is
        ## absorbed into that token; otherwise it is an |S| token.
        if (my $combinator = {
              0x002B => PLUS_TOKEN, # +
              0x003E => GREATER_TOKEN, # >
              0x002C => COMMA_TOKEN, # ,
              0x007E => TILDE_TOKEN, # ~
            }->{$self->{c}}) {
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $combinator};
        } else {
          # stay in the state
          # reprocess
          return {type => S_TOKEN};
        }
      } elsif (my $match_type = {
                 0x007C => DASHMATCH_TOKEN, # |
                 0x005E => PREFIXMATCH_TOKEN, # ^
                 0x0024 => SUFFIXMATCH_TOKEN, # $
                 0x002A => SUBSTRINGMATCH_TOKEN, # *
               }->{$self->{c}}) {
        ## Only a match operator when immediately followed by |=|.
        my $first = $self->{c};
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => $match_type};
        } else {
          # stay in the state
          # reprocess
          return {type => DELIM_TOKEN, value => chr $first};
        }
      } elsif ($self->{c} == 0x002B) { # +
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => PLUS_TOKEN};
      } elsif ($self->{c} == 0x003E) { # >
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => GREATER_TOKEN};
      } elsif ($self->{c} == 0x002C) { # ,
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => COMMA_TOKEN};
      } elsif ($self->{c} == 0x007E) { # ~
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003D) { # =
          # stay in the state
          $self->{c} = $self->{get_char}->();
          return {type => INCLUDES_TOKEN};
        } else {
          # stay in the state
          # reprocess
          return {type => TILDE_TOKEN};
        }
      } elsif ($self->{c} == -1) {
        # stay in the state
        $self->{c} = $self->{get_char}->();
        return {type => EOF_TOKEN};
      } else {
        # stay in the state
        $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};
        $self->{c} = $self->{get_char}->();
        return $current_token;
      }
    } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
      ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
      ## |FUNCTION|)
      if (_is_nmstart ($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN
            if $current_token->{type} == NUMBER_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: 12-\X, 12-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D and # -
               $current_token->{type} == IDENT_TOKEN) {
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
        } else {
          ## NOTE: |-|, |-|, $self->{c}
          ## The first |-| is a delimiter; the second one may still
          ## begin an |IDENT|, so the token is kept for the next call.
          $self->{current_token} = $current_token;
          # stay in the state
          # reconsume
          return {type => DELIM_TOKEN, value => '-'};
        }
      } else {
        if ($current_token->{type} == NUMBER_TOKEN) {
          ## NOTE: |-| after |NUMBER|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{state} = BEFORE_TOKEN_STATE;
          # reconsume
          ## NOTE(review): here the number is returned in |value| and
          ## |number| is removed, whereas |AFTER_NUMBER_STATE| returns
          ## it in |number| with an empty |value|.  Kept as is; confirm
          ## which form callers expect.
          $current_token->{value} = $current_token->{number};
          delete $current_token->{number};
          return $current_token;
        } else {
          ## NOTE: |-| not followed by |nmstart|.
          $self->{state} = BEFORE_TOKEN_STATE;
          # reprocess: the character after |-| starts the next token.
          return {type => DELIM_TOKEN, value => '-'};
        }
      }
    } elsif ($self->{state} == AFTER_AT_STATE) {
      if (_is_nmstart ($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $current_token->{value} .= '-';
        $self->{state} = AFTER_AT_HYPHEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
      if (_is_nmstart ($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002D) { # -
        $self->{c} = $self->{get_char}->();
        if ($self->{c} == 0x003E) { # >
          ## NOTE(review): for |@-->| only |CDC| is returned here; no
          ## token is produced for the |@|.  Kept as is.
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return {type => CDC_TOKEN};
        } else {
          ## |@--| followed by something else: |@| and the first |-| are
          ## delimiters and the second |-| may begin an |IDENT|.
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
          $self->{current_token} = {type => IDENT_TOKEN, value => '-'};
          $self->{state} = BEFORE_NMSTART_STATE;
          # reprocess
          return {type => DELIM_TOKEN, value => '@'};
        }
      } elsif ($self->{c} == 0x005C) { # \
        ## TODO: @-\{nl}
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return {type => DELIM_TOKEN, value => '@'};
      }
    } elsif ($self->{state} == AFTER_NUMBER_STATE) {
      ## At this point |number| holds the |num| and |value| is empty;
      ## |value| collects the unit of a |DIMENSION|, if any.
      if ($self->{c} == 0x002D) { # -
        ## NOTE: |-| in |ident|.
        $current_token->{value} = '-';
        $self->{state} = BEFORE_NMSTART_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif (_is_nmstart ($self->{c})) {
        ## NOTE: |nmstart| in |ident|.
        $current_token->{value} = chr $self->{c};
        $current_token->{type} = DIMENSION_TOKEN;
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        ## NOTE: |nmstart| in |ident| in |IDENT|
        $current_token->{value} = '';
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0025) { # %
        $current_token->{type} = PERCENTAGE_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
      }
    } elsif ($self->{state} == HASH_OPEN_STATE) {
      ## NOTE: The first |nmchar| in |name| in |HASH|.
      if (_is_nmchar ($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = NAME_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the character after |#| starts the next token.
        return {type => DELIM_TOKEN, value => '#'};
      }
    } elsif ($self->{state} == NAME_STATE) {
      ## NOTE: |nmchar| in (|ident| or |name|).
      if (_is_nmchar ($self->{c})) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0028 and # (
               $current_token->{type} == IDENT_TOKEN) {
        my $func_name = $current_token->{value};
        $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
        if ($func_name eq 'url' or $func_name eq 'url-prefix') {
          if ($current_token->{has_escape}) {
            ## TODO: warn
          }
          ## From here on |value| collects the URI itself.
          $current_token->{type}
              = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
          $current_token->{value} = '';
          $self->{state} = URI_BEFORE_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          $current_token->{type} = FUNCTION_TOKEN;
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $current_token;
        }
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
      }
    } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
      ## Skip white space between |(| and the URI.
      while (_is_wsp ($self->{c})) {
        $self->{c} = $self->{get_char}->();
      }
      if ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")"?
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
        $self->{state} = STRING_STATE; $q = $self->{c};
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $self->{c};
        $self->{state} = URI_UNQUOTED_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_UNQUOTED_STATE) {
      if (_is_wsp ($self->{c})) {
        $self->{state} = URI_AFTER_WSP_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} < 0x0020 or
               $self->{c} == 0x0022 or # "
               $self->{c} == 0x0027 or # '
               $self->{c} == 0x0028) { # C0 or (
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $self->{c};
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
      if (_is_wsp ($self->{c})) {
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == -1) {
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} == 0x0029) { # )
        $self->{state} = BEFORE_TOKEN_STATE;
        $self->{c} = $self->{get_char}->();
        return $current_token;
      } elsif ($self->{c} == 0x005C) { # \
        $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## Anything but |)| after the URI makes the token invalid.
        ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
        $current_token->{type} = _invalid_uri_type ($current_token->{type});
        # stay in the state.
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
      $current_token->{has_escape} = 1;
      if (defined (my $digit = _hex_digit_value ($self->{c}))) {
        ## NOTE: second character of |unicode| in |escape|.
        $char = $digit;
        $self->{state} = ESCAPE_STATE; $i = 2;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000C) { # \f
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $current_token;
          # reconsume
        } elsif ($q == 1) {
          ## NOTE: In |escape| in |URI|.
          $current_token->{type} = _invalid_uri_type ($current_token->{type});
          $current_token->{value} .= chr $self->{c};
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= chr $self->{c};
          $self->{state} = STRING_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000D) { # \r
        if ($q == 0) {
          ## NOTE: In |escape| in ... in |ident|.
          $self->{state} = BEFORE_TOKEN_STATE;
          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
          return $current_token;
          # reconsume
        } elsif ($q == 1) {
          $current_token->{type} = _invalid_uri_type ($current_token->{type});
          $current_token->{value} .= "\x0D\x0A";
          $self->{state} = URI_UNQUOTED_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        } else {
          ## NOTE: In |nl| in ... in |string| or |ident|.
          $current_token->{value} .= "\x0D\x0A";
          ## No code point belongs to this escape; make sure that
          ## |ESCAPE_BEFORE_LF_STATE| does not append one.
          undef $char;
          $self->{state} = ESCAPE_BEFORE_LF_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } else {
        ## NOTE: second character of |escape|.
        $current_token->{value} .= chr $self->{c};
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_STATE) {
      ## NOTE: third..seventh character of |unicode| in |escape|.
      if (defined (my $digit = _hex_digit_value ($self->{c}))) {
        $char = $char * 0x10 + $digit;
        ## After the sixth hex digit only the terminator may follow.
        $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x0020 or # SP
               $self->{c} == 0x000A or # \n
               $self->{c} == 0x0009 or # \t
               $self->{c} == 0x000C) { # \f
        ## A single white space character terminates the escape and is
        ## consumed with it.
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_NL_STATE) {
      ## NOTE: eighth character of |unicode| in |escape|.
      if ($self->{c} == 0x0020 or # SP
          $self->{c} == 0x000A or # \n
          $self->{c} == 0x0009 or # \t
          $self->{c} == 0x000C) { # \f
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x000D) { # \r
        ## The |\r| may be the first half of |\r\n|.
        $self->{state} = ESCAPE_BEFORE_LF_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{value} .= chr $char;
        $self->{state} = _state_after_escape ($q);
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
      ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
      ## |$char| is undefined when this state was entered for |\| + |\r|
      ## in a string, where nothing is left to append.
      $current_token->{value} .= chr $char if defined $char;
      $self->{state} = _state_after_escape ($q);
      if ($self->{c} == 0x000A) { # \n
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        # reconsume
        redo A;
      }
    } elsif ($self->{state} == STRING_STATE) {
      ## NOTE: A character in |string$Q| in |string| in |STRING|, or
      ## a character in |invalid$Q| in |invalid| in |INVALID|,
      ## where |$Q = $q == 0x0022 ? 1 : 2|.
      ## Or, in |URI|.
      if ($self->{c} == 0x005C) { # \
        ## |$q| keeps the quotation mark, which tells the escape states
        ## to come back to this state.
        $self->{state} = ESCAPE_OPEN_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == $q) { # " | '
        if ($current_token->{type} == STRING_TOKEN) {
          $self->{state} = BEFORE_TOKEN_STATE;
          $self->{c} = $self->{get_char}->();
          return $current_token;
        } else {
          ## End of the quoted string inside |url(| ... |)|.
          $self->{state} = URI_AFTER_WSP_STATE;
          $self->{c} = $self->{get_char}->();
          redo A;
        }
      } elsif ($self->{c} == 0x000A or # \n
               $self->{c} == 0x000D or # \r
               $self->{c} == 0x000C or # \f
               $self->{c} == -1) {
        ## Unterminated string.
        $current_token->{type} = INVALID_TOKEN;
        $self->{state} = BEFORE_TOKEN_STATE;
        # reconsume
        return $current_token;
      } else {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      }
    } elsif ($self->{state} == NUMBER_STATE) {
      ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } elsif ($self->{c} == 0x002E) { # .
        $self->{state} = NUMBER_DOT_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } elsif ($self->{state} == NUMBER_DOT_STATE) {
      ## NOTE: The character immediately following |.| in |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        ## The |.| does not belong to the number; it follows the
        ## |NUMBER| as a delimiter.
        unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess
        return $current_token;
      }
    } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
      ## NOTE: The character immediately following |.| at the beginning of |num|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= '.' . chr $self->{c};
        $self->{state} = NUMBER_DOT_NUMBER_STATE;
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $self->{state} = BEFORE_TOKEN_STATE;
        # reprocess: the character after |.| starts the next token.
        return {type => DELIM_TOKEN, value => '.'};
      }
    } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
      ## NOTE: |[0-9]| in |num| after |.|.
      if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
        $current_token->{value} .= chr $self->{c};
        # stay in the state
        $self->{c} = $self->{get_char}->();
        redo A;
      } else {
        $current_token->{number} = $current_token->{value};
        $current_token->{value} = '';
        $self->{state} = AFTER_NUMBER_STATE;
        # reprocess
        redo A;
      }
    } else {
      die "$0: Unknown state |$self->{state}|";
    }
  } # A

  ## TODO: |UNICODE-RANGE|, |COMMENT|

} # get_next_token
908    
909     1;
910 wakaba 1.3 # $Date: 2007/09/08 01:31:44 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24