/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Fri Aug 17 11:53:52 2007 UTC | revision 1.20 by wakaba, Sat Jan 26 14:48:09 2008 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    require Exporter;
6    push our @ISA, 'Exporter';
7    
8    sub BEFORE_TOKEN_STATE () { 0 }
9    sub BEFORE_NMSTART_STATE () { 1 }
10    sub NAME_STATE () { 2 }
11    sub ESCAPE_OPEN_STATE () { 3 }
12    sub STRING_STATE () { 4 }
13    sub HASH_OPEN_STATE () { 5 }
14    sub NUMBER_STATE () { 6 }
15    sub NUMBER_FRACTION_STATE () { 7 }
16    sub AFTER_NUMBER_STATE () { 8 }
17    sub URI_BEFORE_WSP_STATE () { 9 }
18    sub ESCAPE_STATE () { 10 }
19    sub ESCAPE_BEFORE_LF_STATE () { 11 }
20    sub ESCAPE_BEFORE_NL_STATE () { 12 }
21    sub NUMBER_DOT_STATE () { 13 }
22    sub NUMBER_DOT_NUMBER_STATE () { 14 }
23    sub DELIM_STATE () { 15 }
24    sub URI_UNQUOTED_STATE () { 16 }
25    sub URI_AFTER_WSP_STATE () { 17 }
26    sub AFTER_AT_STATE () { 18 }
27    sub AFTER_AT_HYPHEN_STATE () { 19 }
28    
29    sub IDENT_TOKEN () { 1 }
30    sub ATKEYWORD_TOKEN () { 2 }
31    sub HASH_TOKEN () { 3 }
32    sub FUNCTION_TOKEN () { 4 }
33    sub URI_TOKEN () { 5 }
34    sub URI_INVALID_TOKEN () { 6 }
35    sub URI_PREFIX_TOKEN () { 7 }
36    sub URI_PREFIX_INVALID_TOKEN () { 8 }
37    sub STRING_TOKEN () { 9 }
38    sub INVALID_TOKEN () { 10 }
39    sub NUMBER_TOKEN () { 11 }
40    sub DIMENSION_TOKEN () { 12 }
41    sub PERCENTAGE_TOKEN () { 13 }
42    sub UNICODE_RANGE_TOKEN () { 14 }
43    sub DELIM_TOKEN () { 16 }
44    sub PLUS_TOKEN () { 17 }
45    sub GREATER_TOKEN () { 18 }
46    sub COMMA_TOKEN () { 19 }
47    sub TILDE_TOKEN () { 20 }
48    sub DASHMATCH_TOKEN () { 21 }
49    sub PREFIXMATCH_TOKEN () { 22 }
50    sub SUFFIXMATCH_TOKEN () { 23 }
51    sub SUBSTRINGMATCH_TOKEN () { 24 }
52    sub INCLUDES_TOKEN () { 25 }
53    sub SEMICOLON_TOKEN () { 26 }
54    sub LBRACE_TOKEN () { 27 }
55    sub RBRACE_TOKEN () { 28 }
56    sub LPAREN_TOKEN () { 29 }
57    sub RPAREN_TOKEN () { 30 }
58    sub LBRACKET_TOKEN () { 31 }
59    sub RBRACKET_TOKEN () { 32 }
60    sub S_TOKEN () { 33 }
61    sub CDO_TOKEN () { 34 }
62    sub CDC_TOKEN () { 35 }
63    sub COMMENT_TOKEN () { 36 }
64    sub COMMENT_INVALID_TOKEN () { 37 }
65    sub EOF_TOKEN () { 38 }
66    sub MINUS_TOKEN () { 39 }
67    sub STAR_TOKEN () { 40 }
68    sub VBAR_TOKEN () { 41 }
69    sub DOT_TOKEN () { 42 }
70    sub COLON_TOKEN () { 43 }
71    sub MATCH_TOKEN () { 44 }
72    sub EXCLAMATION_TOKEN () { 45 }
73    
74    our @TokenName = qw(
75      0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76      STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77      0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78      PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79      LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80      COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81    );
82    
83    our @EXPORT_OK = qw(
84      IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85      URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86      STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87      UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88      TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89      SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90      RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91      S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92      MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93      EXCLAMATION_TOKEN
94    );
95    
96    our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
98  sub new ($) {  sub new ($) {
99    my $self = bless {token => []}, shift;    my $self = bless {token => [], get_char => sub { -1 }}, shift;
100    return $self;    return $self;
101  } # new  } # new
102    
103  sub init ($) {  sub init ($) {
104    my $self = shift;    my $self = shift;
105    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
106    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->($self);
107      #$self->{t} = {type => token-type, value => value, number => number};
108  } # init  } # init
109    
110  sub get_next_token ($) {  sub get_next_token ($) {
# Line 18  sub get_next_token ($) { Line 113  sub get_next_token ($) {
113      return shift @{$self->{token}};      return shift @{$self->{token}};
114    }    }
115    
   my $current_token;  
116    my $char;    my $char;
117    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
118    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
119    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
120          ## NOTE:
121          ##   0: in |ident|.
122          ##   1: in |URI| outside of |string|.
123          ##   0x0022: in |string1| or |invalid1|.
124          ##   0x0027: in |string2| or |invalid2|.
125    
126    A: {    A: {
127      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
128        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
129          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
130          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
131                          line => $self->{line}, column => $self->{column}};
132          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
133          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
134          redo A;          redo A;
135        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
137                          line => $self->{line}, column => $self->{column}};
138            $self->{c} = $self->{get_char}->($self);
139            if ($self->{c} == 0x002B) { # +
140              my ($l, $c) = ($self->{line}, $self->{column});
141              $self->{c} = $self->{get_char}->($self);
142              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
145                  $self->{c} == 0x003F) { # ?
146                $self->{t}->{value} = chr $self->{c};
147                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
148                $self->{c} = $self->{get_char}->($self);
149                C: for (2..6) {
150                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
151                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
152                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
153                      $self->{c} == 0x003F) { # ?
154                    $self->{t}->{value} .= chr $self->{c};
155                    $self->{c} = $self->{get_char}->($self);
156                  } else {
157                    last C;
158                  }
159                } # C
160    
161                if ($self->{c} == 0x002D) { # -
162                  $self->{c} = $self->{get_char}->($self);
163                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
164                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
165                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
166                    $self->{t}->{value} .= '-' . chr $self->{c};
167                    $self->{c} = $self->{get_char}->($self);
168                    C: for (2..6) {
169                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
170                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
171                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
172                        $self->{t}->{value} .= chr $self->{c};
173                        $self->{c} = $self->{get_char}->($self);
174                      } else {
175                        last C;
176                      }
177                    } # C
178                    
179                    #
180                  } else {
181                    my $token = $self->{t};
182                    $self->{t} = {type => IDENT_TOKEN, value => '-',
183                                  line => $self->{line},
184                                  column => $self->{column}};
185                    $self->{state} = BEFORE_NMSTART_STATE;
186                    # reprocess
187                    return $token;
188                    #redo A;
189                  }
190                }
191    
192                $self->{state} = BEFORE_TOKEN_STATE;
193                # reprocess
194                return $self->{t};
195                #redo A;
196              } else {
197                unshift @{$self->{token}},
198                    {type => PLUS_TOKEN, line => $l, column => $c};
199                $self->{state} = BEFORE_TOKEN_STATE;
200                # reprocess
201                return $self->{t};
202                #redo A;
203              }
204            } else {
205              $self->{state} = NAME_STATE;
206              # reprocess
207              redo A;
208            }
209          } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
210                   (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
211                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
212                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
213          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
214          $current_token = {type => IDENT_TOKEN, value => chr $self->{char}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
215                          line => $self->{line}, column => $self->{column}};
216          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
217          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
218          redo A;          redo A;
219        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
220          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
221          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => '',
222                          line => $self->{line}, column => $self->{column}};
223          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
224          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
225          redo A;          redo A;
226        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
227          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
228          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => '',
229          $self->{state} = BEFORE_NMSTART_STATE;                        line => $self->{line}, column => $self->{column}};
230          $self->{c} = $self->{get_char}->();          $self->{state} = AFTER_AT_STATE;
231          redo A;          $self->{c} = $self->{get_char}->($self);
232        } elsif ($self->{c} == 0x0022) { # "          redo A;
233          ## NOTE: |"| in |string1| in |string| in |STRING|, or        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
234          ## |"| in |invalid1| in |invalid| in |INVALID|.          $self->{t} = {type => STRING_TOKEN, value => '',
235          $current_token = {type => STRING_TOKEN, value => ''};                        line => $self->{line}, column => $self->{column}};
236          $self->{state} = STRING_STATE; $q = 1;          $self->{state} = STRING_STATE; $q = $self->{c};
237          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
         redo A;  
       } elsif ($self->{c} == 0x0027) { # '  
         ## NOTE: |'| in |string2| in |string| in |STRING|, or  
         ## |'| in |invalid2| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
         $self->{c} = $self->{get_char}->();  
238          redo A;          redo A;
239        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
240          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
241          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => '',
242                          line => $self->{line}, column => $self->{column}};
243          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
244          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
245          redo A;          redo A;
246        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
247          ## NOTE: |num|.          ## NOTE: |num|.
248          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c},
249                          line => $self->{line}, column => $self->{column}};
250            ## NOTE: 'value' is renamed as 'number' later.
251          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
252          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
253          redo A;          redo A;
254        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
255          ## NOTE: |num|.          ## NOTE: |num|.
256          $current_token = {type => NUMBER_TOKEN, value => '.'};          $self->{t} = {type => NUMBER_TOKEN, value => '0',
257                          line => $self->{line}, column => $self->{column}};
258            ## NOTE: 'value' is renamed as 'number' later.
259          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
260          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
261          redo A;          redo A;
262          } elsif ($self->{c} == 0x002F) { # /
263            my ($l, $c) = ($self->{line}, $self->{column});
264            $self->{c} = $self->{get_char}->($self);
265            if ($self->{c} == 0x002A) { # *
266              C: {
267                $self->{c} = $self->{get_char}->($self);
268                if ($self->{c} == 0x002A) { # *
269                  D: {
270                    $self->{c} = $self->{get_char}->($self);
271                    if ($self->{c} == 0x002F) { # /
272                      #
273                    } elsif ($self->{c} == 0x002A) { # *
274                      redo D;
275                    } else {
276                      redo C;
277                    }
278                  } # D
279                } elsif ($self->{c} == -1) {
280                  # stay in the state
281                  # reprocess
282                  return {type => COMMENT_INVALID_TOKEN};
283                  #redo A;
284                } else {
285                  redo C;
286                }
287              } # C
288    
289              # stay in the state.
290              $self->{c} = $self->{get_char}->($self);
291              redo A;
292            } else {
293              # stay in the state.
294              # reprocess
295              return {type => DELIM_TOKEN, value => '/', line => $l, column => $c};
296              #redo A;
297            }        
298        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
299            my ($l, $c) = ($self->{line}, $self->{column});
300          ## NOTE: |CDO|          ## NOTE: |CDO|
301          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
302          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
303            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
304            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
305              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
306              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
307                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
308                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->($self);
309                return {type => CDO_TOKEN};                return {type => CDO_TOKEN, line => $l, column => $c};
310                #redo A;                #redo A;
311              } else {              } else {
312                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}},
313                      {type => EXCLAMATION_TOKEN, line => $l, column => $c + 1};
314                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
315                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-',
316                                line => $l, column => $c + 2};
317                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
318                #reprocess                #reprocess
319                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<',
320                          line => $l, column => $c};
321                #redo A;                #redo A;
322              }              }
323            } else {            } else {
324              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN,
325                                            line => $l, column => $c + 1};
326              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
327              #reprocess              #reprocess
328              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<',
329                        line => $l, column => $c};
330              #redo A;              #redo A;
331            }            }
332          } else {          } else {
333            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
334            #reprocess            #reprocess
335            return {type => DELIM_TOKEN, value => '<'};            return {type => DELIM_TOKEN, value => '<',
336                      line => $l, column => $c};
337            #redo A;            #redo A;
338          }          }
339        } elsif ({        } elsif (my $t = {
340                  0x003B => 1, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
341                  0x007B => 1, # {                          0x002D => MINUS_TOKEN, # -
342                  0x007D => 1, # }                          0x002E => DOT_TOKEN, # .
343                  0x0028 => 1, # (                          0x003A => COLON_TOKEN, # :
344                  0x0029 => 1, # )                          0x003B => SEMICOLON_TOKEN, # ;
345                  0x005B => 1, # [                          0x003D => MATCH_TOKEN, # =
346                  0x005D => 1, # ]                          0x007B => LBRACE_TOKEN, # {
347                            0x007D => RBRACE_TOKEN, # }
348                            0x0028 => LPAREN_TOKEN, # (
349                            0x0029 => RPAREN_TOKEN, # )
350                            0x005B => LBRACKET_TOKEN, # [
351                            0x005D => RBRACKET_TOKEN, # ]
352                 }->{$self->{c}}) {                 }->{$self->{c}}) {
353            my ($l, $c) = ($self->{line}, $self->{column});
354          # stay in the state          # stay in the state
355          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
356          return {type => chr $self->{c}};          return {type => $t, line => $l, column => $c};
357          # redo A;          # redo A;
358        } elsif ({        } elsif ({
359                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 139  sub get_next_token ($) { Line 362  sub get_next_token ($) {
362                  0x000A => 1, # \n                  0x000A => 1, # \n
363                  0x000C => 1, # \f                  0x000C => 1, # \f
364                 }->{$self->{c}}) {                 }->{$self->{c}}) {
365            my ($l, $c) = ($self->{line}, $self->{column});
366          W: {          W: {
367            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
368            if ({            if ({
369                  0x0020 => 1, # SP                  0x0020 => 1, # SP
370                  0x0009 => 1, # \t                  0x0009 => 1, # \t
# Line 155  sub get_next_token ($) { Line 379  sub get_next_token ($) {
379                              0x002C => COMMA_TOKEN, # ,                              0x002C => COMMA_TOKEN, # ,
380                              0x007E => TILDE_TOKEN, # ~                              0x007E => TILDE_TOKEN, # ~
381                             }->{$self->{c}}) {                             }->{$self->{c}}) {
382                my ($l, $c) = ($self->{line}, $self->{column});
383              # stay in the state              # stay in the state
384              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
385              return {type => $v};              return {type => $v, line => $l, column => $c};
386              #redo A;              #redo A;
387            } else {            } else {
388              # stay in the state              # stay in the state
389              # reprocess              # reprocess
390              return {type => S_TOKEN};              return {type => S_TOKEN, line => $l, column => $c};
391              #redo A;              #redo A;
392            }            }
393          } # W          } # W
# Line 172  sub get_next_token ($) { Line 397  sub get_next_token ($) {
397                          0x0024 => SUFFIXMATCH_TOKEN, # $                          0x0024 => SUFFIXMATCH_TOKEN, # $
398                          0x002A => SUBSTRINGMATCH_TOKEN, # *                          0x002A => SUBSTRINGMATCH_TOKEN, # *
399                         }->{$self->{c}}) {                         }->{$self->{c}}) {
400          $self->{c} = $self->{get_char}->();          my ($line, $column) = ($self->{line}, $self->{column});
401            my $c = $self->{c};
402            $self->{c} = $self->{get_char}->($self);
403          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
404            # stay in the state            # stay in the state
405            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
406            return {type => $v};            return {type => $v, line => $line, column => $column};
407              #redo A;
408            } elsif ($v = {
409                           0x002A => STAR_TOKEN, # *
410                           0x007C => VBAR_TOKEN, # |
411                          }->{$c}) {
412              # stay in the state.
413              # reprocess
414              return {type => $v, line => $line, column => $column};
415            #redo A;            #redo A;
416          } else {          } else {
417            # stay in the state            # stay in the state
418            # reprocess            # reprocess
419            return {type => DELIM_TOKEN, value => chr $self->{c}};            return {type => DELIM_TOKEN, value => chr $c,
420                      line => $line, column => $column};
421            #redo A;            #redo A;
422          }          }
423        } elsif ($self->{c} == 0x002B) { # +        } elsif ($self->{c} == 0x002B) { # +
424            my ($l, $c) = ($self->{line}, $self->{column});
425          # stay in the state          # stay in the state
426          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
427          return {type => PLUS_TOKEN};          return {type => PLUS_TOKEN, line => $l, column => $c};
428          #redo A;          #redo A;
429        } elsif ($self->{c} == 0x003E) { # >        } elsif ($self->{c} == 0x003E) { # >
430            my ($l, $c) = ($self->{line}, $self->{column});
431          # stay in the state          # stay in the state
432          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
433          return {type => GREATER_TOKEN};          return {type => GREATER_TOKEN, line => $l, column => $c};
434          #redo A;          #redo A;
435        } elsif ($self->{c} == 0x002C) { # ,        } elsif ($self->{c} == 0x002C) { # ,
436            my ($l, $c) = ($self->{line}, $self->{column});
437          # stay in the state          # stay in the state
438          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
439          return {type => COMMA_TOKEN};          return {type => COMMA_TOKEN, line => $l, column => $c};
440          #redo A;          #redo A;
441        } elsif ($self->{c} == 0x007E) { # ~        } elsif ($self->{c} == 0x007E) { # ~
442          $self->{c} = $self->{get_char}->();          my ($l, $c) = ($self->{line}, $self->{column});
443            $self->{c} = $self->{get_char}->($self);
444          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
445            # stay in the state            # stay in the state
446            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
447            return {type => INCLUDES_TOKEN};            return {type => INCLUDES_TOKEN, line => $l, column => $c};
448            #redo A;            #redo A;
449          } else {          } else {
450            # stay in the state            # stay in the state
451            # reprocess            # reprocess
452            return {type => TILDE_TOKEN};            return {type => TILDE_TOKEN, line => $l, column => $c};
453            #redo A;            #redo A;
454          }          }
455        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
456          # stay in the state          # stay in the state
457          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
458          return {type => EOF_TOKEN};          return {type => EOF_TOKEN,
459                    line => $self->{line}, column => $self->{column}};
460          #redo A;          #redo A;
461        } else {        } else {
462          # stay in the state          # stay in the state
463          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c},
464          $self->{c} = $self->{get_char}->();                        line => $self->{line}, column => $self->{column}};
465          return $current_token;          $self->{c} = $self->{get_char}->($self);
466            return $self->{t};
467          #redo A;          #redo A;
468        }        }
469      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
470        ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
471        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        ## |FUNCTION|)
472            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
473              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
474            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
475            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
476          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
477            $self->{t}->{type} = DIMENSION_TOKEN
478                if $self->{t}->{type} == NUMBER_TOKEN;
479          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
480          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
481          redo A;          redo A;
482        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
483          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
484          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
485          redo A;          redo A;
486        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D) { # -
487                 $current_token->{type} == IDENT_TOKEN) {          if ($self->{t}->{type} == IDENT_TOKEN) {
488          $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
489              if ($self->{c} == 0x003E) { # >
490                $self->{state} = BEFORE_TOKEN_STATE;
491                $self->{c} = $self->{get_char}->($self);
492                return {type => CDC_TOKEN,
493                        line => $self->{t}->{line},
494                        column => $self->{t}->{column}};
495                #redo A;
496              } else {
497                ## NOTE: |-|, |-|, $self->{c}
498                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
499                $self->{t}->{column}++;
500                # stay in the state
501                # reconsume
502                return {type => MINUS_TOKEN,
503                        line => $self->{t}->{line},
504                        column => $self->{t}->{column} - 1};
505                #redo A;
506              }
507            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
508              my ($l, $c) = ($self->{line}, $self->{column}); # second '-'
509              $self->{c} = $self->{get_char}->($self);
510              if ($self->{c} == 0x003E) { # >
511                unshift @{$self->{token}}, {type => CDC_TOKEN};
512                $self->{t}->{type} = NUMBER_TOKEN;
513                $self->{t}->{value} = '';
514                $self->{state} = BEFORE_TOKEN_STATE;
515                $self->{c} = $self->{get_char}->($self);
516                return $self->{t};
517                #redo A;
518              } else {
519                ## NOTE: NUMBER, |-|, |-|, $self->{c}
520                my $t = $self->{t};
521                $t->{type} = NUMBER_TOKEN;
522                $t->{value} = '';
523                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
524                              line => $l, column => $c};
525                unshift @{$self->{token}}, {type => MINUS_TOKEN,
526                                            line => $l, column => $c - 1};
527                # stay in the state
528                # reconsume
529                return $t;
530                #redo A;
531              }
532            } else {
533              #
534            }
535          } else {
536            #
537          }
538          
539          if ($self->{t}->{type} == DIMENSION_TOKEN) {
540            ## NOTE: |-| after |NUMBER|.
541            unshift @{$self->{token}}, {type => MINUS_TOKEN,
542                                        line => $self->{line},
543                                        column => $self->{column} - 1};
544            ## BUG: column might be wrong if on the line boundary.
545            $self->{state} = BEFORE_TOKEN_STATE;
546            # reprocess
547            $self->{t}->{type} = NUMBER_TOKEN;
548            $self->{t}->{value} = '';
549            return $self->{t};
550          } else {
551            ## NOTE: |-| not followed by |nmstart|.
552            $self->{state} = BEFORE_TOKEN_STATE;
553            # reprocess
554            return {type => MINUS_TOKEN,
555                    line => $self->{line}, column => $self->{column} - 1};
556            ## BUG: column might be wrong if on the line boundary.
557          }
558        } elsif ($self->{state} == AFTER_AT_STATE) {
559          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
560              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
561              $self->{c} == 0x005F or # _
562              $self->{c} > 0x007F) { # nonascii
563            $self->{t}->{value} .= chr $self->{c};
564            $self->{state} = NAME_STATE;
565            $self->{c} = $self->{get_char}->($self);
566            redo A;
567          } elsif ($self->{c} == 0x002D) { # -
568            $self->{t}->{value} .= '-';
569            $self->{state} = AFTER_AT_HYPHEN_STATE;
570            $self->{c} = $self->{get_char}->($self);
571            redo A;
572          } elsif ($self->{c} == 0x005C) { # \
573            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
574            $self->{c} = $self->{get_char}->($self);
575            redo A;
576          } else {
577            $self->{state} = BEFORE_TOKEN_STATE;
578            # reprocess
579            return {type => DELIM_TOKEN, value => '@',
580                    line => $self->{t}->{line},
581                    column => $self->{t}->{column}};
582          }
583        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
584          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
585              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
586              $self->{c} == 0x005F or # _
587              $self->{c} > 0x007F) { # nonascii
588            $self->{t}->{value} .= chr $self->{c};
589            $self->{state} = NAME_STATE;
590            $self->{c} = $self->{get_char}->($self);
591            redo A;
592          } elsif ($self->{c} == 0x002D) { # -
593            $self->{c} = $self->{get_char}->($self);
594          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
595              unshift @{$self->{token}}, {type => CDC_TOKEN};
596            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
597            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
598            return {type => CDC_TOKEN};            return {type => DELIM_TOKEN, value => '@'};
599            #redo A;            #redo A;
600          } else {          } else {
601            ## NOTE: |-|, |-|, $self->{c}            unshift @{$self->{token}}, {type => MINUS_TOKEN};
602            #$current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
603            # stay in the state            $self->{state} = BEFORE_NMSTART_STATE;
604            # reconsume            # reprocess
605            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '@'};
606            #redo A;            #redo A;
607          }          }
608          } elsif ($self->{c} == 0x005C) { # \
609            ## TODO: @-\{nl}
610            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
611            $self->{c} = $self->{get_char}->($self);
612            redo A;
613        } else {        } else {
614          if ($current_token->{type} == NUMBER_TOKEN) {          unshift @{$self->{token}}, {type => MINUS_TOKEN};
615            ## NOTE: |-| after |num|.          $self->{state} = BEFORE_TOKEN_STATE;
616            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          # reprocess
617            $self->{state} = BEFORE_TOKEN_STATE;          return {type => DELIM_TOKEN, value => '@'};
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } else {  
           ## NOTE: |-| not followed by |nmstart|.  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return {type => DELIM_TOKEN, value => '-'};  
         }  
618        }        }
619      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
620        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
621          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
622          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
623            $self->{t}->{value} = '-';
624            $self->{t}->{type} = DIMENSION_TOKEN;
625          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
626          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
627          redo A;          redo A;
628        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
629                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
630                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
631                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
632          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
633          $current_token->{value} = chr $self->{char};          $self->{t}->{value} = chr $self->{c};
634            $self->{t}->{type} = DIMENSION_TOKEN;
635          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
636          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
637          redo A;          redo A;
638        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
639          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
640          $current_token->{value} = '';          $self->{t}->{value} = '';
641            $self->{t}->{type} = DIMENSION_TOKEN;
642          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
643          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
644          redo A;          redo A;
645        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
646          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
647          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
648          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
649          return $current_token;          return $self->{t};
650          #redo A;          #redo A;
651        } else {        } else {
652          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
653          # reprocess          # reprocess
654          return $current_token;          return $self->{t};
655          #redo A;          #redo A;
656        }        }
657      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
658        ## NOTE: The first |nmchar| in |name| in |HASH|.        ## NOTE: The first |nmchar| in |name| in |HASH|.
659        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
660            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
661            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
662            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
663            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
664            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
665          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
666          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
667          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
668          redo A;          redo A;
669        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
670          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
671          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
672          redo A;          redo A;
673        } else {        } else {
674          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
675          $self->{c} = $self->{get_char}->();          # reprocess
676          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#',
677                    line => $self->{t}->{line},
678                    column => $self->{t}->{column}};
679          #redo A;          #redo A;
680        }        }
681      } elsif ($self->{state} == NAME_STATE) {      } elsif ($self->{state} == NAME_STATE) {
682        ## NOTE: |nmchar| in (|ident| or |name|).        ## NOTE: |nmchar| in (|ident| or |name|).
683        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
684            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
685            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
686            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
687            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
688            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
689          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
690          # stay in the state          # stay in the state
691          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
692          redo A;          redo A;
693        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
694          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
695          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
696          redo A;          redo A;
697        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
698                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
699          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
700              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
701               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
702              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
703            $current_token->{type} = URI_TOKEN;              ## TODO: warn
704              }
705              $self->{t}->{type}
706                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
707              $self->{t}->{value} = '';
708            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
709            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
710            redo A;            redo A;
711          } else {          } else {
712            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
713            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
714            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
715            return $current_token;            return $self->{t};
716            #redo A;            #redo A;
717          }          }
718        } else {        } else {
719          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
720          # reconsume          # reconsume
721          return $current_token;          return $self->{t};
722            #redo A;
723          }
724        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
725          while ({
726                    0x0020 => 1, # SP
727                    0x0009 => 1, # \t
728                    0x000D => 1, # \r
729                    0x000A => 1, # \n
730                    0x000C => 1, # \f
731                 }->{$self->{c}}) {
732            $self->{c} = $self->{get_char}->($self);
733          }
734          if ($self->{c} == -1) {
735            $self->{t}->{type} = {
736                URI_TOKEN, URI_INVALID_TOKEN,
737                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
738                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
739                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
740            }->{$self->{t}->{type}};        
741            $self->{state} = BEFORE_TOKEN_STATE;
742            $self->{c} = $self->{get_char}->($self);
743            return $self->{t};
744            #redo A;
745          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
746            ## TODO: Should we consider matches of "(" and ")"?
747            $self->{t}->{type} = {
748                URI_TOKEN, URI_INVALID_TOKEN,
749                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
750                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
751                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
752            }->{$self->{t}->{type}};
753            $self->{state} = URI_UNQUOTED_STATE;
754            $self->{c} = $self->{get_char}->($self);
755            redo A;
756          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
757            $self->{state} = STRING_STATE; $q = $self->{c};
758            $self->{c} = $self->{get_char}->($self);
759            redo A;
760          } elsif ($self->{c} == 0x0029) { # )
761            $self->{state} = BEFORE_TOKEN_STATE;
762            $self->{c} = $self->{get_char}->($self);
763            return $self->{t};
764            #redo A;
765          } elsif ($self->{c} == 0x005C) { # \
766            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
767            $self->{c} = $self->{get_char}->($self);
768            redo A;
769          } else {
770            $self->{t}->{value} .= chr $self->{c};
771            $self->{state} = URI_UNQUOTED_STATE;
772            $self->{c} = $self->{get_char}->($self);
773            redo A;
774          }
775        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
776          if ({
777               0x0020 => 1, # SP
778               0x0009 => 1, # \t
779               0x000D => 1, # \r
780               0x000A => 1, # \n
781               0x000C => 1, # \f
782              }->{$self->{c}}) {
783            $self->{state} = URI_AFTER_WSP_STATE;
784            $self->{c} = $self->{get_char}->($self);
785            redo A;
786          } elsif ($self->{c} == -1) {
787            $self->{t}->{type} = {
788                URI_TOKEN, URI_INVALID_TOKEN,
789                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
790                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
791                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
792            }->{$self->{t}->{type}};        
793            $self->{state} = BEFORE_TOKEN_STATE;
794            $self->{c} = $self->{get_char}->($self);
795            return $self->{t};
796            #redo A;
797          } elsif ($self->{c} < 0x0020 or {
798              0x0022 => 1, # "
799              0x0027 => 1, # '
800              0x0028 => 1, # (
801          }->{$self->{c}}) { # C0 or (
802            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
803            $self->{t}->{type} = {
804                URI_TOKEN, URI_INVALID_TOKEN,
805                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
806                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
807                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
808            }->{$self->{t}->{type}};
809            # stay in the state.
810            $self->{c} = $self->{get_char}->($self);
811            redo A;
812          } elsif ($self->{c} == 0x0029) { # )
813            $self->{state} = BEFORE_TOKEN_STATE;
814            $self->{c} = $self->{get_char}->($self);
815            return $self->{t};
816            #redo A;
817          } elsif ($self->{c} == 0x005C) { # \
818            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
819            $self->{c} = $self->{get_char}->($self);
820            redo A;
821          } else {
822            $self->{t}->{value} .= chr $self->{c};
823            # stay in the state.
824            $self->{c} = $self->{get_char}->($self);
825            redo A;
826          }
827        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
828          if ({
829               0x0020 => 1, # SP
830               0x0009 => 1, # \t
831               0x000D => 1, # \r
832               0x000A => 1, # \n
833               0x000C => 1, # \f
834              }->{$self->{c}}) {
835            # stay in the state.
836            $self->{c} = $self->{get_char}->($self);
837            redo A;
838          } elsif ($self->{c} == -1) {
839            $self->{t}->{type} = {
840                URI_TOKEN, URI_INVALID_TOKEN,
841                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
842                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
843                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
844            }->{$self->{t}->{type}};        
845            $self->{state} = BEFORE_TOKEN_STATE;
846            $self->{c} = $self->{get_char}->($self);
847            return $self->{t};
848            #redo A;
849          } elsif ($self->{c} == 0x0029) { # )
850            $self->{state} = BEFORE_TOKEN_STATE;
851            $self->{c} = $self->{get_char}->($self);
852            return $self->{t};
853          #redo A;          #redo A;
854          } elsif ($self->{c} == 0x005C) { # \
855            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
856            $self->{c} = $self->{get_char}->($self);
857            redo A;
858          } else {
859            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
860            $self->{t}->{type} = {
861                URI_TOKEN, URI_INVALID_TOKEN,
862                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
863                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
864                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
865            }->{$self->{t}->{type}};
866            # stay in the state.
867            $self->{c} = $self->{get_char}->($self);
868            redo A;
869        }        }
870      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
871        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
872        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
873          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
874          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
875          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
876          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
877          redo A;          redo A;
878        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
879          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
880          $char = $self->{c} - 0x0041 + 0xA;          $char = $self->{c} - 0x0041 + 0xA;
881          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
882          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
883          redo A;          redo A;
884        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
885          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
886          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
887          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
888          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
889          redo A;          redo A;
890        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
891                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
892          if ($q == 0) {          if ($q == 0) {
893            ## NOTE: In |escape| in ... in |ident|.            #
894            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
895            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
896            return $current_token;            $self->{t}->{type} = {
897            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
898            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
899                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
900                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
901              }->{$self->{t}->{type}};
902              $self->{t}->{value} .= chr $self->{c};
903              $self->{state} = URI_UNQUOTED_STATE;
904              $self->{c} = $self->{get_char}->($self);
905              redo A;
906          } else {          } else {
907            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= chr $self->{c};  
908            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
909            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
910            redo A;            redo A;
911          }          }
912        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
913          if ($q == 0) {          if ($q == 0) {
914            ## NOTE: In |escape| in ... in |ident|.            #
915            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
916            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
917            return $current_token;            $self->{t}->{type} = {
918            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
919            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
920                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
921                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
922              }->{$self->{t}->{type}};
923              $self->{state} = ESCAPE_BEFORE_LF_STATE;
924              $self->{c} = $self->{get_char}->($self);
925              redo A;
926          } else {          } else {
927            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= "\x0D\x0A";  
928            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
929            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
930            redo A;            redo A;
931          }          }
932          } elsif ($self->{c} == -1) {
933            #
934        } else {        } else {
935          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
936          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
937          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
938          $self->{c} = $self->{get_char}->();              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
939            $self->{c} = $self->{get_char}->($self);
940          redo A;          redo A;
941        }        }
942    
943          if ($q == 0) {
944            if ($self->{t}->{type} == DIMENSION_TOKEN) {
945              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
946                $self->{state} = BEFORE_TOKEN_STATE;
947                # reprocess
948                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
949                                            line => $self->{line},
950                                            column => $self->{column} - 2};
951                unshift @{$self->{token}}, {type => MINUS_TOKEN,
952                                            line => $self->{line},
953                                            column => $self->{column} - 1};
954                ## BUG: line and column might be wrong if they are on the
955                ## line boundary.
956                $self->{t}->{type} = NUMBER_TOKEN;
957                $self->{t}->{value} = '';
958                return $self->{t};
959                #redo A;
960              } elsif (length $self->{t}->{value}) {
961                $self->{state} = BEFORE_TOKEN_STATE;
962                # reprocess
963                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
964                                            line => $self->{line},
965                                            column => $self->{column} - 1};
966                ## BUG: line and column might be wrong if they are on the
967                ## line boundary.
968                return $self->{t};
969                #redo A;
970              } else {
971                $self->{state} = BEFORE_TOKEN_STATE;
972                # reprocess
973                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
974                                            line => $self->{line},
975                                            column => $self->{column} - 1};
976                ## BUG: line and column might be wrong if they are on the
977                ## line boundary.
978                $self->{t}->{type} = NUMBER_TOKEN;
979                $self->{t}->{value} = '';
980                return $self->{t};
981                #redo A;
982              }
983            } else {
984              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
985                $self->{state} = BEFORE_TOKEN_STATE;
986                # reprocess
987                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
988                                            line => $self->{line},
989                                            column => $self->{column} - 2};
990                return {type => MINUS_TOKEN,
991                        line => $self->{line},
992                        column => $self->{column} - 1};
993                ## BUG: line and column might be wrong if they are on the
994                ## line boundary.
995                #redo A;
996              } elsif (length $self->{t}->{value}) {
997                $self->{state} = BEFORE_TOKEN_STATE;
998                # reprocess
999                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1000                                            line => $self->{line},
1001                                            column => $self->{column} - 1};
1002                ## BUG: line and column might be wrong if they are on the
1003                ## line boundary.
1004                return $self->{t};
1005                #redo A;
1006              } else {
1007                $self->{state} = BEFORE_TOKEN_STATE;
1008                # reprocess
1009                return {type => DELIM_TOKEN, value => '\\',
1010                        line => $self->{line},
1011                        column => $self->{column} - 1};
1012                ## BUG: line and column might be wrong if they are on the
1013                ## line boundary.
1014                #redo A;
1015              }
1016            }
1017          } elsif ($q == 1) {
1018            $self->{state} = URI_UNQUOTED_STATE;
1019            $self->{c} = $self->{get_char}->($self);
1020            redo A;
1021          } else {
1022            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1023                                        line => $self->{line},
1024                                        column => $self->{column} - 1};
1025            ## BUG: line and column might be wrong if they are on the
1026            ## line boundary.
1027            $self->{t}->{type} = {
1028              STRING_TOKEN, INVALID_TOKEN,
1029              URI_TOKEN, URI_INVALID_TOKEN,
1030              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1031            }->{$self->{t}->{type}} || $self->{t}->{type};
1032            $self->{state} = BEFORE_TOKEN_STATE;
1033            # reprocess
1034            return $self->{t};
1035            #redo A;
1036          }
1037      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
1038        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
1039        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
1040          $char = $char * 0x10 + $self->{c} - 0x0030;          $char = $char * 0x10 + $self->{c} - 0x0030;
1041          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1042          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1043          redo A;          redo A;
1044        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
1045          $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;          $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
1046          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1047          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1048          redo A;          redo A;
1049        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
1050          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
1051          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1052          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1053          redo A;          redo A;
1054        } elsif ($self->{c} == 0x0020 or # SP        } elsif ($self->{c} == 0x0020 or # SP
1055                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
1056                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
1057                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
1058          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1059          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1060          $self->{c} = $self->{get_char}->();              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1061            $self->{c} = $self->{get_char}->($self);
1062          redo A;          redo A;
1063        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
1064          $self->{state} = ESCAPE_BEFORE_LF_STATE;          $self->{state} = ESCAPE_BEFORE_LF_STATE;
1065          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1066          redo A;          redo A;
1067        } else {        } else {
1068          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1069          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1070                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1071          # reconsume          # reconsume
1072          redo A;          redo A;
1073        }        }
# Line 477  sub get_next_token ($) { Line 1077  sub get_next_token ($) {
1077            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
1078            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
1079            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
1080          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1081          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1082          $self->{c} = $self->{get_char}->();              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1083            $self->{c} = $self->{get_char}->($self);
1084          redo A;          redo A;
1085        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
1086          $self->{state} = ESCAPE_BEFORE_NL_STATE;          $self->{state} = ESCAPE_BEFORE_NL_STATE;
1087          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1088          redo A;          redo A;
1089        } else {        } else {
1090          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1091          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1092                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1093          # reconsume          # reconsume
1094          redo A;          redo A;
1095        }        }
1096      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1097        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1098        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
1099          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
1100          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1101          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1102          redo A;          redo A;
1103        } else {        } else {
1104          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
1105          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1106          # reconsume          # reprocess
1107          redo A;          redo A;
1108        }        }
1109      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
1110        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1111        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
1112        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
1113          ## Or, in |URI|.
1114        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
1115          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
1116          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1117          redo A;          redo A;
1118        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1119          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
1120          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
1121          return $current_token;            $self->{c} = $self->{get_char}->($self);
1122          #redo A;            return $self->{t};
1123              #redo A;
1124            } else {
1125              $self->{state} = URI_AFTER_WSP_STATE;
1126              $self->{c} = $self->{get_char}->($self);
1127              redo A;
1128            }
1129        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
1130                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1131                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1132                 $self->{c} == -1) {                 $self->{c} == -1) {
1133          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1134              STRING_TOKEN, INVALID_TOKEN,
1135              INVALID_TOKEN, INVALID_TOKEN,
1136              URI_TOKEN, URI_INVALID_TOKEN,
1137              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1138              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1139              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1140            }->{$self->{t}->{type}};
1141          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1142          # reconsume          # reconsume
1143          return $current_token;          return $self->{t};
1144          #redo A;          #redo A;
1145        } else {        } else {
1146          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1147          # stay in the state          # stay in the state
1148          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1149          redo A;          redo A;
1150        }        }
1151      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1152        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1153        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1154          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1155          # stay in the state          # stay in the state
1156          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1157          redo A;          redo A;
1158        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
1159          $self->{state} = NUMBER_DOT_STATE;          $self->{state} = NUMBER_DOT_STATE;
1160          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1161          redo A;          redo A;
1162        } else {        } else {
1163          $self->{number} = $self->{value};          $self->{t}->{number} = 0+$self->{t}->{value};
1164          $self->{value} = '';          $self->{t}->{value} = '';
1165          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1166          # reprocess          # reprocess
1167          return $current_token;          redo A;
         #redo A;  
1168        }        }
1169      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1170        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1171        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1172          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1173          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1174          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1175          redo A;          redo A;
1176        } else {        } else {
1177          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1178          $self->{number} = $self->{value};          $self->{t}->{number} = 0+$self->{t}->{value};
1179          $self->{value} = '';          $self->{t}->{value} = '';
1180          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1181          # reprocess          # reprocess
1182          return $current_token;          return $self->{t};
1183          #redo A;          #redo A;
1184        }        }
1185      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1186        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1187        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1188          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1189          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1190          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1191          redo A;          redo A;
1192        } else {        } else {
1193          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1194          $self->{c} = $self->{get_char}->();          # reprocess
1195          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN,
1196                    line => $self->{line}, column => $self->{column} - 1};
1197            ## BUG: line and column might be wrong if they are on the
1198            ## line boundary.
1199          #redo A;          #redo A;
1200        }        }
1201      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1202        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1203        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1204          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1205          # stay in the state          # stay in the state
1206          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1207          redo A;          redo A;
1208        } else {        } else {
1209          $self->{number} = $self->{value};          $self->{t}->{number} = 0+$self->{t}->{value};
1210          $self->{value} = '';          $self->{t}->{value} = '';
1211          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1212          # reprocess          # reprocess
1213          return $current_token;          redo A;
         #redo A;  
1214        }        }
1215      } else {      } else {
1216        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1217      }      }
1218    } # A    } # A
1219    } # get_next_token
1220    
1221    ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  sub serialize_token ($$) {
1222      shift;
1223      my $t = shift;
1224    
1225  } # get_next_token    ## NOTE: This function is not intended for roundtrip-able serialization.
1226    
1227      if ($t->{type} == IDENT_TOKEN) {
1228        return $t->{value};
1229      } elsif ($t->{type} == ATKEYWORD_TOKEN) {
1230        return '@' . $t->{value};
1231      } elsif ($t->{type} == HASH_TOKEN) {
1232        return '#' . $t->{value};
1233      } elsif ($t->{type} == FUNCTION_TOKEN) {
1234        return $t->{value} . '(';
1235      } elsif ($t->{type} == URI_TOKEN) {
1236        return 'url(' . $t->{value} . ')';
1237      } elsif ($t->{type} == URI_INVALID_TOKEN) {
1238        return 'url(' . $t->{value};
1239      } elsif ($t->{type} == URI_PREFIX_TOKEN) {
1240        return 'url-prefix(' . $t->{value} . ')';
1241      } elsif ($t->{type} == URI_PREFIX_INVALID_TOKEN) {
1242        return 'url-prefix(' . $t->{value};
1243      } elsif ($t->{type} == STRING_TOKEN) {
1244        return '"' . $t->{value} . '"';
1245      } elsif ($t->{type} == INVALID_TOKEN) {
1246        return '"' . $t->{value};
1247      } elsif ($t->{type} == NUMBER_TOKEN) {
1248        return $t->{number};
1249      } elsif ($t->{type} == DIMENSION_TOKEN) {
1250        return $t->{number} . $t->{value};
1251      } elsif ($t->{type} == PERCENTAGE_TOKEN) {
1252        return $t->{number} . '%';
1253      } elsif ($t->{type} == UNICODE_RANGE_TOKEN) {
1254        return 'U+' . $t->{value};
1255      } elsif ($t->{type} == DELIM_TOKEN) {
1256        return $t->{value};
1257      } elsif ($t->{type} == PLUS_TOKEN) {
1258        return '+';
1259      } elsif ($t->{type} == GREATER_TOKEN) {
1260        return '>';
1261      } elsif ($t->{type} == COMMA_TOKEN) {
1262        return ',';
1263      } elsif ($t->{type} == TILDE_TOKEN) {
1264        return '~';
1265      } elsif ($t->{type} == DASHMATCH_TOKEN) {
1266        return '|=';
1267      } elsif ($t->{type} == PREFIXMATCH_TOKEN) {
1268        return '^=';
1269      } elsif ($t->{type} == SUFFIXMATCH_TOKEN) {
1270        return '$=';
1271      } elsif ($t->{type} == SUBSTRINGMATCH_TOKEN) {
1272        return '*=';
1273      } elsif ($t->{type} == INCLUDES_TOKEN) {
1274        return '~=';
1275      } elsif ($t->{type} == SEMICOLON_TOKEN) {
1276        return ';';
1277      } elsif ($t->{type} == LBRACE_TOKEN) {
1278        return '{';
1279      } elsif ($t->{type} == RBRACE_TOKEN) {
1280        return '}';
1281      } elsif ($t->{type} == LPAREN_TOKEN) {
1282        return '(';
1283      } elsif ($t->{type} == RPAREN_TOKEN) {
1284        return ')';
1285      } elsif ($t->{type} == LBRACKET_TOKEN) {
1286        return '[';
1287      } elsif ($t->{type} == RBRACKET_TOKEN) {
1288        return ']';
1289      } elsif ($t->{type} == S_TOKEN) {
1290        return ' ';
1291      } elsif ($t->{type} == CDO_TOKEN) {
1292        return '<!--';
1293      } elsif ($t->{type} == CDC_TOKEN) {
1294        return '-->';
1295      } elsif ($t->{type} == COMMENT_TOKEN) {
1296        return '/**/';
1297      } elsif ($t->{type} == COMMENT_INVALID_TOKEN) {
1298        return '/*';
1299      } elsif ($t->{type} == EOF_TOKEN) {
1300        return '{EOF}';
1301      } elsif ($t->{type} == MINUS_TOKEN) {
1302        return '-';
1303      } elsif ($t->{type} == STAR_TOKEN) {
1304        return '*';
1305      } elsif ($t->{type} == VBAR_TOKEN) {
1306        return '|';
1307      } elsif ($t->{type} == COLON_TOKEN) {
1308        return ':';
1309      } elsif ($t->{type} == MATCH_TOKEN) {
1310        return '=';
1311      } elsif ($t->{type} == EXCLAMATION_TOKEN) {
1312        return '!';
1313      } else {
1314        return '{'.$t->{type}.'}';
1315      }
1316    } # serialize_token
1317    
1318    =head1 LICENSE
1319    
1320    Copyright 2007 Wakaba <w@suika.fam.cx>
1321    
1322    This library is free software; you can redistribute it
1323    and/or modify it under the same terms as Perl itself.
1324    
1325    =cut
1326    
1327  1;  1;
1328  # $Date$  # $Date$

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.20

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24