/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Fri Aug 17 11:53:52 2007 UTC | revision 1.4 by wakaba, Sat Sep 8 02:58:24 2007 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    
4    sub BEFORE_TOKEN_STATE () { 0 }
5    sub BEFORE_NMSTART_STATE () { 1 }
6    sub NAME_STATE () { 2 }
7    sub ESCAPE_OPEN_STATE () { 3 }
8    sub STRING_STATE () { 4 }
9    sub HASH_OPEN_STATE () { 5 }
10    sub NUMBER_STATE () { 6 }
11    sub NUMBER_FRACTION_STATE () { 7 }
12    sub AFTER_NUMBER_STATE () { 8 }
13    sub URI_BEFORE_WSP_STATE () { 9 }
14    sub ESCAPE_STATE () { 10 }
15    sub ESCAPE_BEFORE_LF_STATE () { 11 }
16    sub ESCAPE_BEFORE_NL_STATE () { 12 }
17    sub NUMBER_DOT_STATE () { 13 }
18    sub NUMBER_DOT_NUMBER_STATE () { 14 }
19    sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
25    sub IDENT_TOKEN () { 1 }
26    sub ATKEYWORD_TOKEN () { 2 }
27    sub HASH_TOKEN () { 3 }
28    sub FUNCTION_TOKEN () { 4 }
29    sub URI_TOKEN () { 5 }
30    sub URI_INVALID_TOKEN () { 6 }
31    sub URI_PREFIX_TOKEN () { 7 }
32    sub URI_PREFIX_INVALID_TOKEN () { 8 }
33    sub STRING_TOKEN () { 9 }
34    sub INVALID_TOKEN () { 10 }
35    sub NUMBER_TOKEN () { 11 }
36    sub DIMENSION_TOKEN () { 12 }
37    sub PERCENTAGE_TOKEN () { 13 }
38    sub UNICODE_RANGE_TOKEN () { 14 }
39    sub UNICODE_RANGE_INVALID_TOKEN () { 15 }
40    sub DELIM_TOKEN () { 16 }
41    sub PLUS_TOKEN () { 17 }
42    sub GREATER_TOKEN () { 18 }
43    sub COMMA_TOKEN () { 19 }
44    sub TILDE_TOKEN () { 20 }
45    sub DASHMATCH_TOKEN () { 21 }
46    sub PREFIXMATCH_TOKEN () { 22 }
47    sub SUFFIXMATCH_TOKEN () { 23 }
48    sub SUBSTRINGMATCH_TOKEN () { 24 }
49    sub INCLUDES_TOKEN () { 25 }
50    sub SEMICOLON_TOKEN () { 26 }
51    sub LBRACE_TOKEN () { 27 }
52    sub RBRACE_TOKEN () { 28 }
53    sub LPAREN_TOKEN () { 29 }
54    sub RPAREN_TOKEN () { 30 }
55    sub LBRACKET_TOKEN () { 31 }
56    sub RBRACKET_TOKEN () { 32 }
57    sub S_TOKEN () { 33 }
58    sub CDO_TOKEN () { 34 }
59    sub CDC_TOKEN () { 35 }
60    sub COMMENT_TOKEN () { 36 }
61    sub COMMENT_INVALID_TOKEN () { 37 }
62    sub EOF_TOKEN () { 38 }
63    
64    our @TokenName = qw(
65      0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
66      STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
67      UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
68      PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
69      LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
70      COMMENT_INVALID EOF
71    );
72    
73  sub new ($) {  sub new ($) {
74    my $self = bless {token => []}, shift;    my $self = bless {token => [], get_char => sub { -1 },
75                        onerror => sub { }}, shift;
76    return $self;    return $self;
77  } # new  } # new
78    
# Line 22  sub get_next_token ($) { Line 92  sub get_next_token ($) {
92    my $char;    my $char;
93    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
94    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
95    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
96          ## NOTE:
97          ##   0: in |ident|.
98          ##   1: in |URI| outside of |string|.
99          ##   0x0022: in |string1| or |invalid1|.
100          ##   0x0027: in |string2| or |invalid2|.
101    
102    A: {    A: {
103      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
# Line 32  sub get_next_token ($) { Line 107  sub get_next_token ($) {
107          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
108          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
109          redo A;          redo A;
110        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
111                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
112                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
113                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
114          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
115          $current_token = {type => IDENT_TOKEN, value => chr $self->{char}};          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};
116          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
117          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
118          redo A;          redo A;
# Line 50  sub get_next_token ($) { Line 125  sub get_next_token ($) {
125        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
126          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
127          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $current_token = {type => ATKEYWORD_TOKEN, value => ''};
128          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
129          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
130          redo A;          redo A;
131        } elsif ($self->{c} == 0x0022) { # "        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
132          $current_token = {type => STRING_TOKEN, value => ''};          $current_token = {type => STRING_TOKEN, value => ''};
133          $self->{state} = STRING_STATE; $q = 1;          $self->{state} = STRING_STATE; $q = $self->{c};
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0027) { # '  
         ## NOTE: |'| in |string2| in |string| in |STRING|, or  
         ## |'| in |invalid2| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
134          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
135          redo A;          redo A;
136        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
# Line 81  sub get_next_token ($) { Line 147  sub get_next_token ($) {
147          redo A;          redo A;
148        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
149          ## NOTE: |num|.          ## NOTE: |num|.
150          $current_token = {type => NUMBER_TOKEN, value => '.'};          $current_token = {type => NUMBER_TOKEN, value => '0'};
151          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
152          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
153          redo A;          redo A;
154          } elsif ($self->{c} == 0x002F) { # /
155            $self->{c} = $self->{get_char}->();
156            if ($self->{c} == 0x002A) { # *
157              C: {
158                $self->{c} = $self->{get_char}->();
159                if ($self->{c} == 0x002A) { # *
160                  D: {
161                    $self->{c} = $self->{get_char}->();
162                    if ($self->{c} == 0x002F) { # /
163                      #
164                    } elsif ($self->{c} == 0x002A) { # *
165                      redo D;
166                    } else {
167                      redo C;
168                    }
169                  } # D
170                } elsif ($self->{c} == -1) {
171                  # stay in the state
172                  # reprocess
173                  return {type => COMMENT_INVALID_TOKEN};
174                  #redo A;
175                } else {
176                  redo C;
177                }
178              } # C
179    
180              # stay in the state.
181              $self->{c} = $self->{get_char}->();
182              redo A;
183            } else {
184              # stay in the state.
185              # reprocess
186              return {type => DELIM_STATE, value => '/'};
187              #redo A;
188            }        
189        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
190          ## NOTE: |CDO|          ## NOTE: |CDO|
191          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 119  sub get_next_token ($) { Line 220  sub get_next_token ($) {
220            return {type => DELIM_TOKEN, value => '<'};            return {type => DELIM_TOKEN, value => '<'};
221            #redo A;            #redo A;
222          }          }
223        } elsif ({        } elsif (my $t = {
224                  0x003B => 1, # ;                  0x003B => SEMICOLON_TOKEN, # ;
225                  0x007B => 1, # {                  0x007B => LBRACE_TOKEN, # {
226                  0x007D => 1, # }                  0x007D => RBRACE_TOKEN, # }
227                  0x0028 => 1, # (                  0x0028 => LPAREN_TOKEN, # (
228                  0x0029 => 1, # )                  0x0029 => RPAREN_TOKEN, # )
229                  0x005B => 1, # [                  0x005B => LBRACKET_TOKEN, # [
230                  0x005D => 1, # ]                  0x005D => RBRACKET_TOKEN, # ]
231                 }->{$self->{c}}) {                 }->{$self->{c}}) {
232          # stay in the state          # stay in the state
233          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
234          return {type => chr $self->{c}};          return {type => $t};
235          # redo A;          # redo A;
236        } elsif ({        } elsif ({
237                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 172  sub get_next_token ($) { Line 273  sub get_next_token ($) {
273                          0x0024 => SUFFIXMATCH_TOKEN, # $                          0x0024 => SUFFIXMATCH_TOKEN, # $
274                          0x002A => SUBSTRINGMATCH_TOKEN, # *                          0x002A => SUBSTRINGMATCH_TOKEN, # *
275                         }->{$self->{c}}) {                         }->{$self->{c}}) {
276            my $c = $self->{c};
277          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
278          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
279            # stay in the state            # stay in the state
# Line 181  sub get_next_token ($) { Line 283  sub get_next_token ($) {
283          } else {          } else {
284            # stay in the state            # stay in the state
285            # reprocess            # reprocess
286            return {type => DELIM_TOKEN, value => chr $self->{c}};            return {type => DELIM_TOKEN, value => chr $c};
287            #redo A;            #redo A;
288          }          }
289        } elsif ($self->{c} == 0x002B) { # +        } elsif ($self->{c} == 0x002B) { # +
# Line 225  sub get_next_token ($) { Line 327  sub get_next_token ($) {
327          #redo A;          #redo A;
328        }        }
329      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
330        ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
331        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        ## |FUNCTION|)
332            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
333              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
334            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
335            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
336          $current_token->{value} .= chr $self->{char};          $current_token->{value} .= chr $self->{c};
337            $current_token->{type} = DIMENSION_TOKEN
338                if $current_token->{type} == NUMBER_TOKEN;
339          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
340          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
341          redo A;          redo A;
342        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
343    ## TODO: 12-\X, 12-\{nl}
344          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
345          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
346          redo A;          redo A;
# Line 256  sub get_next_token ($) { Line 362  sub get_next_token ($) {
362          }          }
363        } else {        } else {
364          if ($current_token->{type} == NUMBER_TOKEN) {          if ($current_token->{type} == NUMBER_TOKEN) {
365            ## NOTE: |-| after |num|.            ## NOTE: |-| after |NUMBER|.
366            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
367            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
368            $self->{c} = $self->{get_char}->();            # reconsume
369            return $current_token;            $current_token->{value} = $current_token->{number};
370          } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {            delete $current_token->{number};
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
371            return $current_token;            return $current_token;
372          } else {          } else {
373            ## NOTE: |-| not followed by |nmstart|.            ## NOTE: |-| not followed by |nmstart|.
# Line 274  sub get_next_token ($) { Line 376  sub get_next_token ($) {
376            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
377          }          }
378        }        }
379        } elsif ($self->{state} == AFTER_AT_STATE) {
380          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
381              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
382              $self->{c} == 0x005F or # _
383              $self->{c} > 0x007F) { # nonascii
384            $current_token->{value} .= chr $self->{c};
385            $self->{state} = NAME_STATE;
386            $self->{c} = $self->{get_char}->();
387            redo A;
388          } elsif ($self->{c} == 0x002D) { # -
389            $current_token->{value} .= '-';
390            $self->{state} = AFTER_AT_HYPHEN_STATE;
391            $self->{c} = $self->{get_char}->();
392            redo A;
393          } elsif ($self->{c} == 0x005C) { # \
394            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
395            $self->{c} = $self->{get_char}->();
396            redo A;
397          } else {
398            $self->{state} = BEFORE_TOKEN_STATE;
399            # reprocess
400            return {type => DELIM_TOKEN, value => '@'};
401          }
402        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
403          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
404              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
405              $self->{c} == 0x005F or # _
406              $self->{c} > 0x007F) { # nonascii
407            $current_token->{value} .= chr $self->{c};
408            $self->{state} = NAME_STATE;
409            $self->{c} = $self->{get_char}->();
410            redo A;
411          } elsif ($self->{c} == 0x002D) { # -
412            $self->{c} = $self->{get_char}->();
413            if ($self->{c} == 0x003E) { # >
414              unshift @{$self->{token}}, {type => CDC_TOKEN};
415              $self->{state} = BEFORE_TOKEN_STATE;
416              $self->{c} = $self->{get_char}->();
417              return {type => DELIM_TOKEN, value => '@'};
418              #redo A;
419            } else {
420              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
421              $current_token = {type => IDENT_TOKEN, value => '-'};
422              $self->{state} = BEFORE_NMSTART_STATE;
423              # reprocess
424              return {type => DELIM_TOKEN, value => '@'};
425              #redo A;
426            }
427          } elsif ($self->{c} == 0x005C) { # \
428            ## TODO: @-\{nl}
429            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
430            $self->{c} = $self->{get_char}->();
431            redo A;
432          } else {
433            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
434            $self->{state} = BEFORE_TOKEN_STATE;
435            # reprocess
436            return {type => DELIM_TOKEN, value => '@'};
437          }
438      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
439        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
440          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
# Line 281  sub get_next_token ($) { Line 442  sub get_next_token ($) {
442          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
443          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
444          redo A;          redo A;
445        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
446                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
447                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
448                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
449          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
450          $current_token->{value} = chr $self->{char};          $current_token->{value} = chr $self->{c};
451            $current_token->{type} = DIMENSION_TOKEN;
452          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
453          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
454          redo A;          redo A;
# Line 310  sub get_next_token ($) { Line 472  sub get_next_token ($) {
472        }        }
473      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
474        ## NOTE: The first |nmchar| in |name| in |HASH|.        ## NOTE: The first |nmchar| in |name| in |HASH|.
475        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
476            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
477            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
478            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
479            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
480            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
481          $current_token->{value} .= chr $self->{char};          $current_token->{value} .= chr $self->{c};
482          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
483          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
484          redo A;          redo A;
# Line 332  sub get_next_token ($) { Line 494  sub get_next_token ($) {
494        }        }
495      } elsif ($self->{state} == NAME_STATE) {      } elsif ($self->{state} == NAME_STATE) {
496        ## NOTE: |nmchar| in (|ident| or |name|).        ## NOTE: |nmchar| in (|ident| or |name|).
497        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
498            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
499            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
500            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
501            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
502            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
503          $current_token->{value} .= chr $self->{char};          $current_token->{value} .= chr $self->{c};
504          # stay in the state          # stay in the state
505          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
506          redo A;          redo A;
507        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
508          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
509          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
510          redo A;          redo A;
511        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
512                 $current_token->{type} == IDENT_TOKEN) { # (                 $current_token->{type} == IDENT_TOKEN) { # (
513          if (not $current_token->{has_escape} and          my $func_name = $current_token->{value};
514              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
515               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
516              ->{$current_token->{value}}) {            if ($current_token->{has_escape}) {
517            $current_token->{type} = URI_TOKEN;              ## TODO: warn
518              }
519              $current_token->{type}
520                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
521              $current_token->{value} = '';
522            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
523            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
524            redo A;            redo A;
525          } else {          } else {
526            $current_token->{type} = FUNCTION_TOKEN;            $current_token->{type} = FUNCTION_TOKEN;
# Line 378  sub get_next_token ($) { Line 535  sub get_next_token ($) {
535          return $current_token;          return $current_token;
536          #redo A;          #redo A;
537        }        }
538        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
539          while ({
540                    0x0020 => 1, # SP
541                    0x0009 => 1, # \t
542                    0x000D => 1, # \r
543                    0x000A => 1, # \n
544                    0x000C => 1, # \f
545                 }->{$self->{c}}) {
546            $self->{c} = $self->{get_char}->();
547          }
548          if ($self->{c} == -1) {
549            $current_token->{type} = {
550                URI_TOKEN, URI_INVALID_TOKEN,
551                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
552                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
553                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
554            }->{$current_token->{type}};        
555            $self->{state} = BEFORE_TOKEN_STATE;
556            $self->{c} = $self->{get_char}->();
557            return $current_token;
558            #redo A;
559          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
560            ## TODO: Should we consider matches of "(" and ")"?
561            $current_token->{type} = {
562                URI_TOKEN, URI_INVALID_TOKEN,
563                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
564                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
565                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
566            }->{$current_token->{type}};
567            $self->{state} = URI_UNQUOTED_STATE;
568            $self->{c} = $self->{get_char}->();
569            redo A;
570          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
571            $self->{state} = STRING_STATE; $q = $self->{c};
572            $self->{c} = $self->{get_char}->();
573            redo A;
574          } elsif ($self->{c} == 0x0029) { # )
575            $self->{state} = BEFORE_TOKEN_STATE;
576            $self->{c} = $self->{get_char}->();
577            return $current_token;
578            #redo A;
579          } elsif ($self->{c} == 0x005C) { # \
580            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
581            $self->{c} = $self->{get_char}->();
582            redo A;
583          } else {
584            $current_token->{value} .= chr $self->{c};
585            $self->{state} = URI_UNQUOTED_STATE;
586            $self->{c} = $self->{get_char}->();
587            redo A;
588          }
589        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
590          if ({
591               0x0020 => 1, # SP
592               0x0009 => 1, # \t
593               0x000D => 1, # \r
594               0x000A => 1, # \n
595               0x000C => 1, # \f
596              }->{$self->{c}}) {
597            $self->{state} = URI_AFTER_WSP_STATE;
598            $self->{c} = $self->{get_char}->();
599            redo A;
600          } elsif ($self->{c} == -1) {
601            $current_token->{type} = {
602                URI_TOKEN, URI_INVALID_TOKEN,
603                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
604                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
605                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
606            }->{$current_token->{type}};        
607            $self->{state} = BEFORE_TOKEN_STATE;
608            $self->{c} = $self->{get_char}->();
609            return $current_token;
610            #redo A;
611          } elsif ($self->{c} < 0x0020 or {
612              0x0022 => 1, # "
613              0x0027 => 1, # '
614              0x0028 => 1, # (
615          }->{$self->{c}}) { # C0 or (
616            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
617            $current_token->{type} = {
618                URI_TOKEN, URI_INVALID_TOKEN,
619                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
620                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
621                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
622            }->{$current_token->{type}};
623            # stay in the state.
624            $self->{c} = $self->{get_char}->();
625            redo A;
626          } elsif ($self->{c} == 0x0029) { # )
627            $self->{state} = BEFORE_TOKEN_STATE;
628            $self->{c} = $self->{get_char}->();
629            return $current_token;
630            #redo A;
631          } elsif ($self->{c} == 0x005C) { # \
632            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
633            $self->{c} = $self->{get_char}->();
634            redo A;
635          } else {
636            $current_token->{value} .= chr $self->{c};
637            # stay in the state.
638            $self->{c} = $self->{get_char}->();
639            redo A;
640          }
641        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
642          if ({
643               0x0020 => 1, # SP
644               0x0009 => 1, # \t
645               0x000D => 1, # \r
646               0x000A => 1, # \n
647               0x000C => 1, # \f
648              }->{$self->{c}}) {
649            # stay in the state.
650            $self->{c} = $self->{get_char}->();
651            redo A;
652          } elsif ($self->{c} == -1) {
653            $current_token->{type} = {
654                URI_TOKEN, URI_INVALID_TOKEN,
655                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
656                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
657                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
658            }->{$current_token->{type}};        
659            $self->{state} = BEFORE_TOKEN_STATE;
660            $self->{c} = $self->{get_char}->();
661            return $current_token;
662            #redo A;
663          } elsif ($self->{c} == 0x0029) { # )
664            $self->{state} = BEFORE_TOKEN_STATE;
665            $self->{c} = $self->{get_char}->();
666            return $current_token;
667            #redo A;
668          } elsif ($self->{c} == 0x005C) { # \
669            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
670            $self->{c} = $self->{get_char}->();
671            redo A;
672          } else {
673            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
674            $current_token->{type} = {
675                URI_TOKEN, URI_INVALID_TOKEN,
676                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
677                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
678                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
679            }->{$current_token->{type}};
680            # stay in the state.
681            $self->{c} = $self->{get_char}->();
682            redo A;
683          }
684      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
685        $current_token->{has_escape} = 1;        $current_token->{has_escape} = 1;
686        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
# Line 392  sub get_next_token ($) { Line 695  sub get_next_token ($) {
695          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
696          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
697          redo A;          redo A;
698        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
699          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
700          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 - 0xA;
701          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
# Line 407  sub get_next_token ($) { Line 710  sub get_next_token ($) {
710            return $current_token;            return $current_token;
711            # reconsume            # reconsume
712            #redo A;            #redo A;
713            } elsif ($q == 1) {
714              ## NOTE: In |escape| in |URI|.
715              $current_token->{type} = {
716                  URI_TOKEN, URI_INVALID_TOKEN,
717                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
718                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
719                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
720              }->{$current_token->{type}};
721              $current_token->{value} .= chr $self->{c};
722              $self->{state} = URI_UNQUOTED_STATE;
723              $self->{c} = $self->{get_char}->();
724              redo A;
725          } else {          } else {
726            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
727            $current_token->{value} .= chr $self->{c};            $current_token->{value} .= chr $self->{c};
# Line 422  sub get_next_token ($) { Line 737  sub get_next_token ($) {
737            return $current_token;            return $current_token;
738            # reconsume            # reconsume
739            #redo A;            #redo A;
740            } elsif ($q == 1) {
741              $current_token->{type} = {
742                  URI_TOKEN, URI_INVALID_TOKEN,
743                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
744                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
745                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
746              }->{$current_token->{type}};
747              $current_token->{value} .= "\x0D\x0A";
748              $self->{state} = URI_UNQUOTED_STATE;
749              $self->{c} = $self->{get_char}->();
750              redo A;
751          } else {          } else {
752            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
753            $current_token->{value} .= "\x0D\x0A";            $current_token->{value} .= "\x0D\x0A";
# Line 432  sub get_next_token ($) { Line 758  sub get_next_token ($) {
758        } else {        } else {
759          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
760          $current_token->{value} .= chr $self->{c};          $current_token->{value} .= chr $self->{c};
761          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
762                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
763          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
764          redo A;          redo A;
765        }        }
# Line 448  sub get_next_token ($) { Line 775  sub get_next_token ($) {
775          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
776          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
777          redo A;          redo A;
778        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
779          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;
780          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
781          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 458  sub get_next_token ($) { Line 785  sub get_next_token ($) {
785                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
786                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
787          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
788          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
789                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
790          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
791          redo A;          redo A;
792        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 467  sub get_next_token ($) { Line 795  sub get_next_token ($) {
795          redo A;          redo A;
796        } else {        } else {
797          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
798          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
799                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
800          # reconsume          # reconsume
801          redo A;          redo A;
802        }        }
# Line 478  sub get_next_token ($) { Line 807  sub get_next_token ($) {
807            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
808            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
809          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
810          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
811                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
812          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
813          redo A;          redo A;
814        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 487  sub get_next_token ($) { Line 817  sub get_next_token ($) {
817          redo A;          redo A;
818        } else {        } else {
819          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
820          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
821                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
822          # reconsume          # reconsume
823          redo A;          redo A;
824        }        }
# Line 495  sub get_next_token ($) { Line 826  sub get_next_token ($) {
826        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
827        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
828          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
829          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
830                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
831          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
832          redo A;          redo A;
833        } else {        } else {
834          $current_token->{value} .= chr $char;          $current_token->{value} .= chr $char;
835          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
836                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
837          # reconsume          # reconsume
838          redo A;          redo A;
839        }        }
# Line 508  sub get_next_token ($) { Line 841  sub get_next_token ($) {
841        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
842        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
843        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
844          ## Or, in |URI|.
845        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
846          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
847          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
848          redo A;          redo A;
849        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
850          $self->{state} = BEFORE_TOKEN_STATE;          if ($current_token->{type} == STRING_TOKEN) {
851          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
852          return $current_token;            $self->{c} = $self->{get_char}->();
853          #redo A;            return $current_token;
854              #redo A;
855            } else {
856              $self->{state} = URI_AFTER_WSP_STATE;
857              $self->{c} = $self->{get_char}->();
858              redo A;
859            }
860        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
861                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
862                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
# Line 544  sub get_next_token ($) { Line 884  sub get_next_token ($) {
884          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
885          redo A;          redo A;
886        } else {        } else {
887          $self->{number} = $self->{value};          $current_token->{number} = $current_token->{value};
888          $self->{value} = '';          $current_token->{value} = '';
889          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
890          # reprocess          # reprocess
891          return $current_token;          redo A;
         #redo A;  
892        }        }
893      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
894        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
895        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
896          $current_token->{value} .= chr $self->{c};          $current_token->{value} .= '.' . chr $self->{c};
897          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
898          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
899          redo A;          redo A;
900        } else {        } else {
901          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};
902          $self->{number} = $self->{value};          $current_token->{number} = $current_token->{value};
903          $self->{value} = '';          $current_token->{value} = '';
904          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
905          # reprocess          # reprocess
906          return $current_token;          return $current_token;
# Line 570  sub get_next_token ($) { Line 909  sub get_next_token ($) {
909      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
910        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
911        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
912          $current_token->{value} .= chr $self->{c};          $current_token->{value} .= '.' . chr $self->{c};
913          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
914          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
915          redo A;          redo A;
# Line 588  sub get_next_token ($) { Line 927  sub get_next_token ($) {
927          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
928          redo A;          redo A;
929        } else {        } else {
930          $self->{number} = $self->{value};          $current_token->{number} = $current_token->{value};
931          $self->{value} = '';          $current_token->{value} = '';
932          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
933          # reprocess          # reprocess
934          return $current_token;          redo A;
         #redo A;  
935        }        }
936      } else {      } else {
937        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.4

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24