/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Sat Sep 8 01:31:44 2007 UTC | revision 1.17 by wakaba, Sun Jan 20 04:02:25 2008 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    require Exporter;
6    push our @ISA, 'Exporter';
7    
8  sub BEFORE_TOKEN_STATE () { 0 }  sub BEFORE_TOKEN_STATE () { 0 }
9  sub BEFORE_NMSTART_STATE () { 1 }  sub BEFORE_NMSTART_STATE () { 1 }
# Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 } Line 21  sub ESCAPE_BEFORE_NL_STATE () { 12 }
21  sub NUMBER_DOT_STATE () { 13 }  sub NUMBER_DOT_STATE () { 13 }
22  sub NUMBER_DOT_NUMBER_STATE () { 14 }  sub NUMBER_DOT_NUMBER_STATE () { 14 }
23  sub DELIM_STATE () { 15 }  sub DELIM_STATE () { 15 }
24    sub URI_UNQUOTED_STATE () { 16 }
25    sub URI_AFTER_WSP_STATE () { 17 }
26    sub AFTER_AT_STATE () { 18 }
27    sub AFTER_AT_HYPHEN_STATE () { 19 }
28    
29  sub IDENT_TOKEN () { 1 }  sub IDENT_TOKEN () { 1 }
30  sub ATKEYWORD_TOKEN () { 2 }  sub ATKEYWORD_TOKEN () { 2 }
# Line 32  sub NUMBER_TOKEN () { 11 } Line 40  sub NUMBER_TOKEN () { 11 }
40  sub DIMENSION_TOKEN () { 12 }  sub DIMENSION_TOKEN () { 12 }
41  sub PERCENTAGE_TOKEN () { 13 }  sub PERCENTAGE_TOKEN () { 13 }
42  sub UNICODE_RANGE_TOKEN () { 14 }  sub UNICODE_RANGE_TOKEN () { 14 }
 sub UNICODE_RANGE_INVALID_TOKEN () { 15 }  
43  sub DELIM_TOKEN () { 16 }  sub DELIM_TOKEN () { 16 }
44  sub PLUS_TOKEN () { 17 }  sub PLUS_TOKEN () { 17 }
45  sub GREATER_TOKEN () { 18 }  sub GREATER_TOKEN () { 18 }
# Line 56  sub CDC_TOKEN () { 35 } Line 63  sub CDC_TOKEN () { 35 }
63  sub COMMENT_TOKEN () { 36 }  sub COMMENT_TOKEN () { 36 }
64  sub COMMENT_INVALID_TOKEN () { 37 }  sub COMMENT_INVALID_TOKEN () { 37 }
65  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
66    sub MINUS_TOKEN () { 39 }
67    sub STAR_TOKEN () { 40 }
68    sub VBAR_TOKEN () { 41 }
69    sub DOT_TOKEN () { 42 }
70    sub COLON_TOKEN () { 43 }
71    sub MATCH_TOKEN () { 44 }
72    sub EXCLAMATION_TOKEN () { 45 }
73    
74  our @TokenName = qw(  our @TokenName = qw(
75    0 IDENT ATKWTWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80    COMMENT_INVALID EOF    COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81    );
82    
83    our @EXPORT_OK = qw(
84      IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85      URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86      STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87      UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88      TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89      SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90      RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91      S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92      MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93      EXCLAMATION_TOKEN
94  );  );
95    
96    our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
98  sub new ($) {  sub new ($) {
99    my $self = bless {token => [], get_char => sub { -1 },    my $self = bless {token => [], get_char => sub { -1 }}, shift;
                     onerror => sub { }}, shift;  
100    return $self;    return $self;
101  } # new  } # new
102    
# Line 76  sub init ($) { Line 104  sub init ($) {
104    my $self = shift;    my $self = shift;
105    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
106    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
107      #$self->{t} = {type => token-type, value => value, number => number};
108  } # init  } # init
109    
110  sub get_next_token ($) {  sub get_next_token ($) {
# Line 84  sub get_next_token ($) { Line 113  sub get_next_token ($) {
113      return shift @{$self->{token}};      return shift @{$self->{token}};
114    }    }
115    
   my $current_token;  
116    my $char;    my $char;
117    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
118    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
119    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
120          ## NOTE:
121          ##   0: in |ident|.
122          ##   1: in |URI| outside of |string|.
123          ##   0x0022: in |string1| or |invalid1|.
124          ##   0x0027: in |string2| or |invalid2|.
125    
126    A: {    A: {
127      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
128        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
129          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
130          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
131          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
132          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
133          redo A;          redo A;
134          } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
135            $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
136            $self->{c} = $self->{get_char}->();
137            if ($self->{c} == 0x002B) { # +
138              $self->{c} = $self->{get_char}->();
139              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
140                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142                  $self->{c} == 0x003F) { # ?
143                $self->{t}->{value} = chr $self->{c};
144                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145                $self->{c} = $self->{get_char}->();
146                C: for (2..6) {
147                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
148                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
149                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
150                      $self->{c} == 0x003F) { # ?
151                    $self->{t}->{value} .= chr $self->{c};
152                    $self->{c} = $self->{get_char}->();
153                  } else {
154                    last C;
155                  }
156                } # C
157    
158                if ($self->{c} == 0x002D) { # -
159                  $self->{c} = $self->{get_char}->();
160                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
161                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
162                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
163                    $self->{t}->{value} .= '-' . chr $self->{c};
164                    $self->{c} = $self->{get_char}->();
165                    C: for (2..6) {
166                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
167                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
168                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
169                        $self->{t}->{value} .= chr $self->{c};
170                        $self->{c} = $self->{get_char}->();
171                      } else {
172                        last C;
173                      }
174                    } # C
175                    
176                    #
177                  } else {
178                    my $token = $self->{t};
179                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
180                    $self->{state} = BEFORE_NMSTART_STATE;
181                    # reprocess
182                    return $token;
183                    #redo A;
184                  }
185                }
186    
187                $self->{state} = BEFORE_TOKEN_STATE;
188                # reprocess
189                return $self->{t};
190                #redo A;
191              } else {
192                unshift @{$self->{token}}, {type => PLUS_TOKEN};
193                $self->{state} = BEFORE_TOKEN_STATE;
194                # reprocess
195                return $self->{t};
196                #redo A;
197              }
198            } else {
199              $self->{state} = NAME_STATE;
200              # reprocess
201              redo A;
202            }
203        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
204                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
205                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
206                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
207          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
208          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
209          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
210          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
211          redo A;          redo A;
212        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
213          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
214          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
215          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
216          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
217          redo A;          redo A;
218        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
219          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
220          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
221          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0022) { # "  
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 1;  
222          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
223          redo A;          redo A;
224        } elsif ($self->{c} == 0x0027) { # '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
225          ## NOTE: |'| in |string2| in |string| in |STRING|, or          $self->{t} = {type => STRING_TOKEN, value => ''};
226          ## |'| in |invalid2| in |invalid| in |INVALID|.          $self->{state} = STRING_STATE; $q = $self->{c};
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
227          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
228          redo A;          redo A;
229        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
230          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
231          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
232          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
233          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
234          redo A;          redo A;
235        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
236          ## NOTE: |num|.          ## NOTE: |num|.
237          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
238          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
239          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
240          redo A;          redo A;
241        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
242          ## NOTE: |num|.          ## NOTE: |num|.
243          $current_token = {type => NUMBER_TOKEN, value => '0'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
244          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
245          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
246          redo A;          redo A;
247          } elsif ($self->{c} == 0x002F) { # /
248            $self->{c} = $self->{get_char}->();
249            if ($self->{c} == 0x002A) { # *
250              C: {
251                $self->{c} = $self->{get_char}->();
252                if ($self->{c} == 0x002A) { # *
253                  D: {
254                    $self->{c} = $self->{get_char}->();
255                    if ($self->{c} == 0x002F) { # /
256                      #
257                    } elsif ($self->{c} == 0x002A) { # *
258                      redo D;
259                    } else {
260                      redo C;
261                    }
262                  } # D
263                } elsif ($self->{c} == -1) {
264                  # stay in the state
265                  # reprocess
266                  return {type => COMMENT_INVALID_TOKEN};
267                  #redo A;
268                } else {
269                  redo C;
270                }
271              } # C
272    
273              # stay in the state.
274              $self->{c} = $self->{get_char}->();
275              redo A;
276            } else {
277              # stay in the state.
278              # reprocess
279              return {type => DELIM_TOKEN, value => '/'};
280              #redo A;
281            }        
282        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
283          ## NOTE: |CDO|          ## NOTE: |CDO|
284          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
285          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
286            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
287            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
288              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->();
289              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
290                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
291                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->();
292                return {type => CDO_TOKEN};                return {type => CDO_TOKEN};
293                #redo A;                #redo A;
294              } else {              } else {
295                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
297                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
298                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
299                #reprocess                #reprocess
300                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
301                #redo A;                #redo A;
302              }              }
303            } else {            } else {
304              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
306              #reprocess              #reprocess
307              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<'};
# Line 186  sub get_next_token ($) { Line 314  sub get_next_token ($) {
314            #redo A;            #redo A;
315          }          }
316        } elsif (my $t = {        } elsif (my $t = {
317                  0x003B => SEMICOLON_TOKEN, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
318                  0x007B => LBRACE_TOKEN, # {                          0x002D => MINUS_TOKEN, # -
319                  0x007D => RBRACE_TOKEN, # }                          0x002E => DOT_TOKEN, # .
320                  0x0028 => LPAREN_TOKEN, # (                          0x003A => COLON_TOKEN, # :
321                  0x0029 => RPAREN_TOKEN, # )                          0x003B => SEMICOLON_TOKEN, # ;
322                  0x005B => LBRACKET_TOKEN, # [                          0x003D => MATCH_TOKEN, # =
323                  0x005D => RBRACKET_TOKEN, # ]                          0x007B => LBRACE_TOKEN, # {
324                            0x007D => RBRACE_TOKEN, # }
325                            0x0028 => LPAREN_TOKEN, # (
326                            0x0029 => RPAREN_TOKEN, # )
327                            0x005B => LBRACKET_TOKEN, # [
328                            0x005D => RBRACKET_TOKEN, # ]
329                 }->{$self->{c}}) {                 }->{$self->{c}}) {
330            my ($l, $c) = ($self->{line}, $self->{column});
331          # stay in the state          # stay in the state
332          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
333          return {type => $t};          return {type => $t, line => $l, column => $c};
334          # redo A;          # redo A;
335        } elsif ({        } elsif ({
336                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 245  sub get_next_token ($) { Line 379  sub get_next_token ($) {
379            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
380            return {type => $v};            return {type => $v};
381            #redo A;            #redo A;
382            } elsif ($v = {
383                           0x002A => STAR_TOKEN, # *
384                           0x007C => VBAR_TOKEN, # |
385                          }->{$c}) {
386              # stay in the state.
387              # reprocess
388              return {type => $v};
389              #redo A;
390          } else {          } else {
391            # stay in the state            # stay in the state
392            # reprocess            # reprocess
# Line 286  sub get_next_token ($) { Line 428  sub get_next_token ($) {
428          #redo A;          #redo A;
429        } else {        } else {
430          # stay in the state          # stay in the state
431          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
433          return $current_token;          return $self->{t};
434          #redo A;          #redo A;
435        }        }
436      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
437        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
438          ## |FUNCTION|)
439        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
440            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
442            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
443          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
444          $current_token->{type} = DIMENSION_TOKEN          $self->{t}->{type} = DIMENSION_TOKEN
445              if $current_token->{type} == NUMBER_TOKEN;              if $self->{t}->{type} == NUMBER_TOKEN;
446          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
447          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
448          redo A;          redo A;
449        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
 ## TODO: 12-\X, 12-\{nl}  
450          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
452          redo A;          redo A;
453        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D) { # -
454                 $current_token->{type} == IDENT_TOKEN) {          if ($self->{t}->{type} == IDENT_TOKEN) {
455              $self->{c} = $self->{get_char}->();
456              if ($self->{c} == 0x003E) { # >
457                $self->{state} = BEFORE_TOKEN_STATE;
458                $self->{c} = $self->{get_char}->();
459                return {type => CDC_TOKEN};
460                #redo A;
461              } else {
462                ## NOTE: |-|, |-|, $self->{c}
463                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464                # stay in the state
465                # reconsume
466                return {type => MINUS_TOKEN};
467                #redo A;
468              }
469            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470              $self->{c} = $self->{get_char}->();
471              if ($self->{c} == 0x003E) { # >
472                unshift @{$self->{token}}, {type => CDC_TOKEN};
473                $self->{t}->{type} = NUMBER_TOKEN;
474                $self->{t}->{value} = '';
475                $self->{state} = BEFORE_TOKEN_STATE;
476                $self->{c} = $self->{get_char}->();
477                return $self->{t};
478                #redo A;
479              } else {
480                ## NOTE: |-|, |-|, $self->{c}
481                my $t = $self->{t};
482                $t->{type} = NUMBER_TOKEN;
483                $t->{value} = '';
484                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485                unshift @{$self->{token}}, {type => MINUS_TOKEN};
486                # stay in the state
487                # reconsume
488                return $t;
489                #redo A;
490              }
491            } else {
492              #
493            }
494          } else {
495            #
496          }
497          
498          if ($self->{t}->{type} == DIMENSION_TOKEN) {
499            ## NOTE: |-| after |NUMBER|.
500            unshift @{$self->{token}}, {type => MINUS_TOKEN};
501            $self->{state} = BEFORE_TOKEN_STATE;
502            # reprocess
503            $self->{t}->{type} = NUMBER_TOKEN;
504            $self->{t}->{value} = '';
505            return $self->{t};
506          } else {
507            ## NOTE: |-| not followed by |nmstart|.
508            $self->{state} = BEFORE_TOKEN_STATE;
509            # reprocess
510            return {type => MINUS_TOKEN};
511          }
512        } elsif ($self->{state} == AFTER_AT_STATE) {
513          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515              $self->{c} == 0x005F or # _
516              $self->{c} > 0x007F) { # nonascii
517            $self->{t}->{value} .= chr $self->{c};
518            $self->{state} = NAME_STATE;
519            $self->{c} = $self->{get_char}->();
520            redo A;
521          } elsif ($self->{c} == 0x002D) { # -
522            $self->{t}->{value} .= '-';
523            $self->{state} = AFTER_AT_HYPHEN_STATE;
524            $self->{c} = $self->{get_char}->();
525            redo A;
526          } elsif ($self->{c} == 0x005C) { # \
527            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528            $self->{c} = $self->{get_char}->();
529            redo A;
530          } else {
531            $self->{state} = BEFORE_TOKEN_STATE;
532            # reprocess
533            return {type => DELIM_TOKEN, value => '@'};
534          }
535        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
536          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
537              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538              $self->{c} == 0x005F or # _
539              $self->{c} > 0x007F) { # nonascii
540            $self->{t}->{value} .= chr $self->{c};
541            $self->{state} = NAME_STATE;
542            $self->{c} = $self->{get_char}->();
543            redo A;
544          } elsif ($self->{c} == 0x002D) { # -
545          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
546          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
547              unshift @{$self->{token}}, {type => CDC_TOKEN};
548            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
549            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
550            return {type => CDC_TOKEN};            return {type => DELIM_TOKEN, value => '@'};
551            #redo A;            #redo A;
552          } else {          } else {
553            ## NOTE: |-|, |-|, $self->{c}            unshift @{$self->{token}}, {type => MINUS_TOKEN};
554            #$current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
555            # stay in the state            $self->{state} = BEFORE_NMSTART_STATE;
556            # reconsume            # reprocess
557            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '@'};
558            #redo A;            #redo A;
559          }          }
560          } elsif ($self->{c} == 0x005C) { # \
561            ## TODO: @-\{nl}
562            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
563            $self->{c} = $self->{get_char}->();
564            redo A;
565        } else {        } else {
566          if ($current_token->{type} == NUMBER_TOKEN) {          unshift @{$self->{token}}, {type => MINUS_TOKEN};
567            ## NOTE: |-| after |num|.          $self->{state} = BEFORE_TOKEN_STATE;
568            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          # reprocess
569            $self->{state} = BEFORE_TOKEN_STATE;          return {type => DELIM_TOKEN, value => '@'};
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == NUMBER_TOKEN) {  
           ## NOTE: |-| after |NUMBER|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           # reconsume  
           $current_token->{value} = $current_token->{number};  
           delete $current_token->{number};  
           return $current_token;  
         } else {  
           ## NOTE: |-| not followed by |nmstart|.  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return {type => DELIM_TOKEN, value => '-'};  
         }  
570        }        }
571      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
573          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
574          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
575            $self->{t}->{value} = '-';
576            $self->{t}->{type} = DIMENSION_TOKEN;
577          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
578          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
579          redo A;          redo A;
# Line 364  sub get_next_token ($) { Line 582  sub get_next_token ($) {
582                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
583                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
584          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
585          $current_token->{value} = chr $self->{c};          $self->{t}->{value} = chr $self->{c};
586          $current_token->{type} = DIMENSION_TOKEN;          $self->{t}->{type} = DIMENSION_TOKEN;
587          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
588          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
589          redo A;          redo A;
590        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
591          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
592          $current_token->{value} = '';          $self->{t}->{value} = '';
593            $self->{t}->{type} = DIMENSION_TOKEN;
594          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
596          redo A;          redo A;
597        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
598          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
599          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
600          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
601          return $current_token;          return $self->{t};
602          #redo A;          #redo A;
603        } else {        } else {
604          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
605          # reprocess          # reprocess
606          return $current_token;          return $self->{t};
607          #redo A;          #redo A;
608        }        }
609      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
# Line 395  sub get_next_token ($) { Line 614  sub get_next_token ($) {
614            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
615            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
616            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
617          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
618          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
619          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
620          redo A;          redo A;
# Line 405  sub get_next_token ($) { Line 624  sub get_next_token ($) {
624          redo A;          redo A;
625        } else {        } else {
626          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
627          $self->{c} = $self->{get_char}->();          # reprocess
628          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#'};
629          #redo A;          #redo A;
630        }        }
# Line 417  sub get_next_token ($) { Line 636  sub get_next_token ($) {
636            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
637            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
638            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
639          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
640          # stay in the state          # stay in the state
641          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
642          redo A;          redo A;
643        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
644          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
645          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
646          redo A;          redo A;
647        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
648                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
649          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
650              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
653            $current_token->{type} = URI_TOKEN;              ## TODO: warn
654              }
655              $self->{t}->{type}
656                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657              $self->{t}->{value} = '';
658            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
659            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
660            redo A;            redo A;
661          } else {          } else {
662            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
663            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
664            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
665            return $current_token;            return $self->{t};
666            #redo A;            #redo A;
667          }          }
668        } else {        } else {
669          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
670          # reconsume          # reconsume
671          return $current_token;          return $self->{t};
672            #redo A;
673          }
674        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
675          while ({
676                    0x0020 => 1, # SP
677                    0x0009 => 1, # \t
678                    0x000D => 1, # \r
679                    0x000A => 1, # \n
680                    0x000C => 1, # \f
681                 }->{$self->{c}}) {
682            $self->{c} = $self->{get_char}->();
683          }
684          if ($self->{c} == -1) {
685            $self->{t}->{type} = {
686                URI_TOKEN, URI_INVALID_TOKEN,
687                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690            }->{$self->{t}->{type}};        
691            $self->{state} = BEFORE_TOKEN_STATE;
692            $self->{c} = $self->{get_char}->();
693            return $self->{t};
694            #redo A;
695          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696            ## TODO: Should we consider matches of "(" and ")"?
697            $self->{t}->{type} = {
698                URI_TOKEN, URI_INVALID_TOKEN,
699                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702            }->{$self->{t}->{type}};
703            $self->{state} = URI_UNQUOTED_STATE;
704            $self->{c} = $self->{get_char}->();
705            redo A;
706          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
707            $self->{state} = STRING_STATE; $q = $self->{c};
708            $self->{c} = $self->{get_char}->();
709            redo A;
710          } elsif ($self->{c} == 0x0029) { # )
711            $self->{state} = BEFORE_TOKEN_STATE;
712            $self->{c} = $self->{get_char}->();
713            return $self->{t};
714            #redo A;
715          } elsif ($self->{c} == 0x005C) { # \
716            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717            $self->{c} = $self->{get_char}->();
718            redo A;
719          } else {
720            $self->{t}->{value} .= chr $self->{c};
721            $self->{state} = URI_UNQUOTED_STATE;
722            $self->{c} = $self->{get_char}->();
723            redo A;
724          }
725        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
726          if ({
727               0x0020 => 1, # SP
728               0x0009 => 1, # \t
729               0x000D => 1, # \r
730               0x000A => 1, # \n
731               0x000C => 1, # \f
732              }->{$self->{c}}) {
733            $self->{state} = URI_AFTER_WSP_STATE;
734            $self->{c} = $self->{get_char}->();
735            redo A;
736          } elsif ($self->{c} == -1) {
737            $self->{t}->{type} = {
738                URI_TOKEN, URI_INVALID_TOKEN,
739                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742            }->{$self->{t}->{type}};        
743            $self->{state} = BEFORE_TOKEN_STATE;
744            $self->{c} = $self->{get_char}->();
745            return $self->{t};
746            #redo A;
747          } elsif ($self->{c} < 0x0020 or {
748              0x0022 => 1, # "
749              0x0027 => 1, # '
750              0x0028 => 1, # (
751          }->{$self->{c}}) { # C0, ", ', or (
752            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753            $self->{t}->{type} = {
754                URI_TOKEN, URI_INVALID_TOKEN,
755                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758            }->{$self->{t}->{type}};
759            # stay in the state.
760            $self->{c} = $self->{get_char}->();
761            redo A;
762          } elsif ($self->{c} == 0x0029) { # )
763            $self->{state} = BEFORE_TOKEN_STATE;
764            $self->{c} = $self->{get_char}->();
765            return $self->{t};
766            #redo A;
767          } elsif ($self->{c} == 0x005C) { # \
768            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769            $self->{c} = $self->{get_char}->();
770            redo A;
771          } else {
772            $self->{t}->{value} .= chr $self->{c};
773            # stay in the state.
774            $self->{c} = $self->{get_char}->();
775            redo A;
776          }
777        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
778          if ({
779               0x0020 => 1, # SP
780               0x0009 => 1, # \t
781               0x000D => 1, # \r
782               0x000A => 1, # \n
783               0x000C => 1, # \f
784              }->{$self->{c}}) {
785            # stay in the state.
786            $self->{c} = $self->{get_char}->();
787            redo A;
788          } elsif ($self->{c} == -1) {
789            $self->{t}->{type} = {
790                URI_TOKEN, URI_INVALID_TOKEN,
791                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794            }->{$self->{t}->{type}};        
795            $self->{state} = BEFORE_TOKEN_STATE;
796            $self->{c} = $self->{get_char}->();
797            return $self->{t};
798            #redo A;
799          } elsif ($self->{c} == 0x0029) { # )
800            $self->{state} = BEFORE_TOKEN_STATE;
801            $self->{c} = $self->{get_char}->();
802            return $self->{t};
803          #redo A;          #redo A;
804          } elsif ($self->{c} == 0x005C) { # \
805            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
806            $self->{c} = $self->{get_char}->();
807            redo A;
808          } else {
809            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810            $self->{t}->{type} = {
811                URI_TOKEN, URI_INVALID_TOKEN,
812                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815            }->{$self->{t}->{type}};
816            # stay in the state.
817            $self->{c} = $self->{get_char}->();
818            redo A;
819        }        }
820      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
822        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
824          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 473  sub get_next_token ($) { Line 833  sub get_next_token ($) {
833          redo A;          redo A;
834        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
836          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
837          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
838          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
839          redo A;          redo A;
840        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
841                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
842          if ($q == 0) {          if ($q == 0) {
843            ## NOTE: In |escape| in ... in |ident|.            #
844            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
845            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
846            return $current_token;            $self->{t}->{type} = {
847            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
848            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851              }->{$self->{t}->{type}};
852              $self->{t}->{value} .= chr $self->{c};
853              $self->{state} = URI_UNQUOTED_STATE;
854              $self->{c} = $self->{get_char}->();
855              redo A;
856          } else {          } else {
857            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= chr $self->{c};  
858            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
859            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
860            redo A;            redo A;
861          }          }
862        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
863          if ($q == 0) {          if ($q == 0) {
864            ## NOTE: In |escape| in ... in |ident|.            #
865            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
866            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
867            return $current_token;            $self->{t}->{type} = {
868            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
869            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872              }->{$self->{t}->{type}};
873              $self->{state} = ESCAPE_BEFORE_LF_STATE;
874              $self->{c} = $self->{get_char}->();
875              redo A;
876          } else {          } else {
877            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= "\x0D\x0A";  
878            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
879            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
880            redo A;            redo A;
881          }          }
882          } elsif ($self->{c} == -1) {
883            #
884        } else {        } else {
885          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
886          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
887          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
888                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
890          redo A;          redo A;
891        }        }
892    
893          if ($q == 0) {
894            if ($self->{t}->{type} == DIMENSION_TOKEN) {
895              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896                $self->{state} = BEFORE_TOKEN_STATE;
897                # reprocess
898                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899                unshift @{$self->{token}}, {type => MINUS_TOKEN};
900                $self->{t}->{type} = NUMBER_TOKEN;
901                $self->{t}->{value} = '';
902                return $self->{t};
903                #redo A;
904              } elsif (length $self->{t}->{value}) {
905                $self->{state} = BEFORE_TOKEN_STATE;
906                # reprocess
907                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908                return $self->{t};
909                #redo A;
910              } else {
911                $self->{state} = BEFORE_TOKEN_STATE;
912                # reprocess
913                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914                $self->{t}->{type} = NUMBER_TOKEN;
915                $self->{t}->{value} = '';
916                return $self->{t};
917                #redo A;
918              }
919            } else {
920              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921                $self->{state} = BEFORE_TOKEN_STATE;
922                # reprocess
923                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924                return {type => MINUS_TOKEN};
925                #redo A;
926              } elsif (length $self->{t}->{value}) {
927                $self->{state} = BEFORE_TOKEN_STATE;
928                # reprocess
929                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930                return $self->{t};
931                #redo A;
932              } else {
933                $self->{state} = BEFORE_TOKEN_STATE;
934                # reprocess
935                return {type => DELIM_TOKEN, value => '\\'};
936                #redo A;
937              }
938            }
939          } elsif ($q == 1) {
940            $self->{state} = URI_UNQUOTED_STATE;
941            $self->{c} = $self->{get_char}->();
942            redo A;
943          } else {
944            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945            $self->{t}->{type} = {
946              STRING_TOKEN, INVALID_TOKEN,
947              URI_TOKEN, URI_INVALID_TOKEN,
948              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949            }->{$self->{t}->{type}} || $self->{t}->{type};
950            $self->{state} = BEFORE_TOKEN_STATE;
951            # reprocess
952            return $self->{t};
953            #redo A;
954          }
955      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
956        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
957        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
# Line 528  sub get_next_token ($) { Line 965  sub get_next_token ($) {
965          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
966          redo A;          redo A;
967        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
971          redo A;          redo A;
# Line 536  sub get_next_token ($) { Line 973  sub get_next_token ($) {
973                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
974                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
975                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
976          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
977          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
978                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
980          redo A;          redo A;
981        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 545  sub get_next_token ($) { Line 983  sub get_next_token ($) {
983          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
984          redo A;          redo A;
985        } else {        } else {
986          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
987          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
988                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989          # reconsume          # reconsume
990          redo A;          redo A;
991        }        }
# Line 556  sub get_next_token ($) { Line 995  sub get_next_token ($) {
995            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
996            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
997            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
998          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
999          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1000                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1002          redo A;          redo A;
1003        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 565  sub get_next_token ($) { Line 1005  sub get_next_token ($) {
1005          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1006          redo A;          redo A;
1007        } else {        } else {
1008          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1009          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1010                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011          # reconsume          # reconsume
1012          redo A;          redo A;
1013        }        }
1014      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
1017          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
1018          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1020          redo A;          redo A;
1021        } else {        } else {
1022          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
1023          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024          # reconsume          # reprocess
1025          redo A;          redo A;
1026        }        }
1027      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
1028        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1029        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
1030        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
1031          ## Or, in |URI|.
1032        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
1033          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
1034          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1035          redo A;          redo A;
1036        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1037          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
1038          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
1039          return $current_token;            $self->{c} = $self->{get_char}->();
1040          #redo A;            return $self->{t};
1041              #redo A;
1042            } else {
1043              $self->{state} = URI_AFTER_WSP_STATE;
1044              $self->{c} = $self->{get_char}->();
1045              redo A;
1046            }
1047        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
1048                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1049                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1050                 $self->{c} == -1) {                 $self->{c} == -1) {
1051          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1052              STRING_TOKEN, INVALID_TOKEN,
1053              INVALID_TOKEN, INVALID_TOKEN,
1054              URI_TOKEN, URI_INVALID_TOKEN,
1055              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058            }->{$self->{t}->{type}};
1059          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1060          # reconsume          # reconsume
1061          return $current_token;          return $self->{t};
1062          #redo A;          #redo A;
1063        } else {        } else {
1064          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1065          # stay in the state          # stay in the state
1066          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1067          redo A;          redo A;
# Line 614  sub get_next_token ($) { Line 1069  sub get_next_token ($) {
1069      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1070        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1073          # stay in the state          # stay in the state
1074          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1075          redo A;          redo A;
# Line 623  sub get_next_token ($) { Line 1078  sub get_next_token ($) {
1078          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1079          redo A;          redo A;
1080        } else {        } else {
1081          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1082          $current_token->{value} = '';          $self->{t}->{value} = '';
1083          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1084          # reprocess          # reprocess
1085          redo A;          redo A;
# Line 632  sub get_next_token ($) { Line 1087  sub get_next_token ($) {
1087      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1089        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1091          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1093          redo A;          redo A;
1094        } else {        } else {
1095          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1096          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1097          $current_token->{value} = '';          $self->{t}->{value} = '';
1098          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1099          # reprocess          # reprocess
1100          return $current_token;          return $self->{t};
1101          #redo A;          #redo A;
1102        }        }
1103      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1105        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1107          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1109          redo A;          redo A;
1110        } else {        } else {
1111          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1112          $self->{c} = $self->{get_char}->();          # reprocess
1113          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN};
1114          #redo A;          #redo A;
1115        }        }
1116      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1118        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1120          # stay in the state          # stay in the state
1121          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1122          redo A;          redo A;
1123        } else {        } else {
1124          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1125          $current_token->{value} = '';          $self->{t}->{value} = '';
1126          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1127          # reprocess          # reprocess
1128          redo A;          redo A;
# Line 676  sub get_next_token ($) { Line 1131  sub get_next_token ($) {
1131        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1132      }      }
1133    } # A    } # A
1134    } # get_next_token
1135    
## serialize_token ($invocant, $token)
##
## Returns a short textual form of a token hash reference, selected by
## |$token->{type}|.  The first argument (the invocant when called as
## a method) is ignored.
##
## Depending on the token type, |$token->{value}| and/or
## |$token->{number}| are embedded in the result verbatim, without
## any escaping.  A token whose type is not listed below is rendered
## as its type value enclosed in braces.
##
## NOTE: This function is not intended for roundtrip-able serialization.
sub serialize_token ($$) {
  my (undef, $t) = @_;

  if ($t->{type} == IDENT_TOKEN) {
    return $t->{value};
  } elsif ($t->{type} == ATKEYWORD_TOKEN) {
    return '@' . $t->{value};
  } elsif ($t->{type} == HASH_TOKEN) {
    return '#' . $t->{value};
  } elsif ($t->{type} == FUNCTION_TOKEN) {
    return $t->{value} . '(';
  } elsif ($t->{type} == URI_TOKEN) {
    return 'url(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_INVALID_TOKEN) {
    ## Invalid variants are written without their closing delimiter.
    return 'url(' . $t->{value};
  } elsif ($t->{type} == URI_PREFIX_TOKEN) {
    return 'url-prefix(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_PREFIX_INVALID_TOKEN) {
    return 'url-prefix(' . $t->{value};
  } elsif ($t->{type} == STRING_TOKEN) {
    return '"' . $t->{value} . '"';
  } elsif ($t->{type} == INVALID_TOKEN) {
    return '"' . $t->{value};
  } elsif ($t->{type} == NUMBER_TOKEN) {
    return $t->{number};
  } elsif ($t->{type} == DIMENSION_TOKEN) {
    ## |number| holds the numeric part and |value| the unit.
    return $t->{number} . $t->{value};
  } elsif ($t->{type} == PERCENTAGE_TOKEN) {
    return $t->{number} . '%';
  } elsif ($t->{type} == UNICODE_RANGE_TOKEN) {
    return 'U+' . $t->{value};
  } elsif ($t->{type} == DELIM_TOKEN) {
    return $t->{value};
  } elsif ($t->{type} == PLUS_TOKEN) {
    return '+';
  } elsif ($t->{type} == GREATER_TOKEN) {
    return '>';
  } elsif ($t->{type} == COMMA_TOKEN) {
    return ',';
  } elsif ($t->{type} == TILDE_TOKEN) {
    return '~';
  } elsif ($t->{type} == DASHMATCH_TOKEN) {
    return '|=';
  } elsif ($t->{type} == PREFIXMATCH_TOKEN) {
    return '^=';
  } elsif ($t->{type} == SUFFIXMATCH_TOKEN) {
    return '$=';
  } elsif ($t->{type} == SUBSTRINGMATCH_TOKEN) {
    return '*=';
  } elsif ($t->{type} == INCLUDES_TOKEN) {
    return '~=';
  } elsif ($t->{type} == SEMICOLON_TOKEN) {
    return ';';
  } elsif ($t->{type} == LBRACE_TOKEN) {
    return '{';
  } elsif ($t->{type} == RBRACE_TOKEN) {
    return '}';
  } elsif ($t->{type} == LPAREN_TOKEN) {
    return '(';
  } elsif ($t->{type} == RPAREN_TOKEN) {
    return ')';
  } elsif ($t->{type} == LBRACKET_TOKEN) {
    return '[';
  } elsif ($t->{type} == RBRACKET_TOKEN) {
    return ']';
  } elsif ($t->{type} == S_TOKEN) {
    ## Any white space token is written as a single space.
    return ' ';
  } elsif ($t->{type} == CDO_TOKEN) {
    return '<!--';
  } elsif ($t->{type} == CDC_TOKEN) {
    return '-->';
  } elsif ($t->{type} == COMMENT_TOKEN) {
    ## The comment body is not written out.
    return '/**/';
  } elsif ($t->{type} == COMMENT_INVALID_TOKEN) {
    return '/*';
  } elsif ($t->{type} == EOF_TOKEN) {
    return '{EOF}';
  } elsif ($t->{type} == MINUS_TOKEN) {
    return '-';
  } elsif ($t->{type} == STAR_TOKEN) {
    return '*';
  } elsif ($t->{type} == VBAR_TOKEN) {
    return '|';
  } elsif ($t->{type} == DOT_TOKEN) {
    ## |get_next_token| returns tokens of this type; without this
    ## branch they would be rendered by the fallback below.
    return '.';
  } elsif ($t->{type} == COLON_TOKEN) {
    return ':';
  } elsif ($t->{type} == MATCH_TOKEN) {
    return '=';
  } elsif ($t->{type} == EXCLAMATION_TOKEN) {
    return '!';
  } else {
    ## Fallback for token types with no dedicated textual form.
    return '{'.$t->{type}.'}';
  }
} # serialize_token
1232    
1233    =head1 LICENSE
1234    
1235    Copyright 2007 Wakaba <w@suika.fam.cx>
1236    
1237    This library is free software; you can redistribute it
1238    and/or modify it under the same terms as Perl itself.
1239    
1240    =cut
1241    
1242  1;  1;
1243  # $Date$  # $Date$

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.17

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24