/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.4 by wakaba, Sat Sep 8 02:58:24 2007 UTC | revision 1.19 by wakaba, Sat Jan 26 09:30:47 2008 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    require Exporter;
6    push our @ISA, 'Exporter';
7    
8  sub BEFORE_TOKEN_STATE () { 0 }  sub BEFORE_TOKEN_STATE () { 0 }
9  sub BEFORE_NMSTART_STATE () { 1 }  sub BEFORE_NMSTART_STATE () { 1 }
# Line 36  sub NUMBER_TOKEN () { 11 } Line 40  sub NUMBER_TOKEN () { 11 }
40  sub DIMENSION_TOKEN () { 12 }  sub DIMENSION_TOKEN () { 12 }
41  sub PERCENTAGE_TOKEN () { 13 }  sub PERCENTAGE_TOKEN () { 13 }
42  sub UNICODE_RANGE_TOKEN () { 14 }  sub UNICODE_RANGE_TOKEN () { 14 }
 sub UNICODE_RANGE_INVALID_TOKEN () { 15 }  
43  sub DELIM_TOKEN () { 16 }  sub DELIM_TOKEN () { 16 }
44  sub PLUS_TOKEN () { 17 }  sub PLUS_TOKEN () { 17 }
45  sub GREATER_TOKEN () { 18 }  sub GREATER_TOKEN () { 18 }
# Line 60  sub CDC_TOKEN () { 35 } Line 63  sub CDC_TOKEN () { 35 }
63  sub COMMENT_TOKEN () { 36 }  sub COMMENT_TOKEN () { 36 }
64  sub COMMENT_INVALID_TOKEN () { 37 }  sub COMMENT_INVALID_TOKEN () { 37 }
65  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
66    sub MINUS_TOKEN () { 39 }
67    sub STAR_TOKEN () { 40 }
68    sub VBAR_TOKEN () { 41 }
69    sub DOT_TOKEN () { 42 }
70    sub COLON_TOKEN () { 43 }
71    sub MATCH_TOKEN () { 44 }
72    sub EXCLAMATION_TOKEN () { 45 }
73    
74  our @TokenName = qw(  our @TokenName = qw(
75    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80    COMMENT_INVALID EOF    COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81    );
82    
83    our @EXPORT_OK = qw(
84      IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85      URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86      STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87      UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88      TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89      SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90      RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91      S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92      MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93      EXCLAMATION_TOKEN
94  );  );
95    
96    our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
98  sub new ($) {  sub new ($) {
99    my $self = bless {token => [], get_char => sub { -1 },    my $self = bless {token => [], get_char => sub { -1 }}, shift;
                     onerror => sub { }}, shift;  
100    return $self;    return $self;
101  } # new  } # new
102    
103  sub init ($) {  sub init ($) {
104    my $self = shift;    my $self = shift;
105    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
106    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->($self);
107      #$self->{t} = {type => token-type, value => value, number => number};
108  } # init  } # init
109    
110  sub get_next_token ($) {  sub get_next_token ($) {
# Line 88  sub get_next_token ($) { Line 113  sub get_next_token ($) {
113      return shift @{$self->{token}};      return shift @{$self->{token}};
114    }    }
115    
   my $current_token;  
116    my $char;    my $char;
117    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
118    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
# Line 103  sub get_next_token ($) { Line 127  sub get_next_token ($) {
127      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
128        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
129          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
130          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
131                          line => $self->{line}, column => $self->{column}};
132          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
133          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
134          redo A;          redo A;
135          } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136            $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
137                          line => $self->{line}, column => $self->{column}};
138            $self->{c} = $self->{get_char}->($self);
139            if ($self->{c} == 0x002B) { # +
140              my ($l, $c) = ($self->{line}, $self->{column});
141              $self->{c} = $self->{get_char}->($self);
142              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
145                  $self->{c} == 0x003F) { # ?
146                $self->{t}->{value} = chr $self->{c};
147                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
148                $self->{c} = $self->{get_char}->($self);
149                C: for (2..6) {
150                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
151                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
152                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
153                      $self->{c} == 0x003F) { # ?
154                    $self->{t}->{value} .= chr $self->{c};
155                    $self->{c} = $self->{get_char}->($self);
156                  } else {
157                    last C;
158                  }
159                } # C
160    
161                if ($self->{c} == 0x002D) { # -
162                  $self->{c} = $self->{get_char}->($self);
163                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
164                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
165                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
166                    $self->{t}->{value} .= '-' . chr $self->{c};
167                    $self->{c} = $self->{get_char}->($self);
168                    C: for (2..6) {
169                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
170                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
171                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
172                        $self->{t}->{value} .= chr $self->{c};
173                        $self->{c} = $self->{get_char}->($self);
174                      } else {
175                        last C;
176                      }
177                    } # C
178                    
179                    #
180                  } else {
181                    my $token = $self->{t};
182                    $self->{t} = {type => IDENT_TOKEN, value => '-',
183                                  line => $self->{line},
184                                  column => $self->{column}};
185                    $self->{state} = BEFORE_NMSTART_STATE;
186                    # reprocess
187                    return $token;
188                    #redo A;
189                  }
190                }
191    
192                $self->{state} = BEFORE_TOKEN_STATE;
193                # reprocess
194                return $self->{t};
195                #redo A;
196              } else {
197                unshift @{$self->{token}},
198                    {type => PLUS_TOKEN, line => $l, column => $c};
199                $self->{state} = BEFORE_TOKEN_STATE;
200                # reprocess
201                return $self->{t};
202                #redo A;
203              }
204            } else {
205              $self->{state} = NAME_STATE;
206              # reprocess
207              redo A;
208            }
209        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
210                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
211                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
212                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
213          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
214          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c},
215                          line => $self->{line}, column => $self->{column}};
216          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
217          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
218          redo A;          redo A;
219        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
220          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
221          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => '',
222                          line => $self->{line}, column => $self->{column}};
223          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
224          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
225          redo A;          redo A;
226        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
227          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
228          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => '',
229                          line => $self->{line}, column => $self->{column}};
230          $self->{state} = AFTER_AT_STATE;          $self->{state} = AFTER_AT_STATE;
231          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
232          redo A;          redo A;
233        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
234          $current_token = {type => STRING_TOKEN, value => ''};          $self->{t} = {type => STRING_TOKEN, value => '',
235                          line => $self->{line}, column => $self->{column}};
236          $self->{state} = STRING_STATE; $q = $self->{c};          $self->{state} = STRING_STATE; $q = $self->{c};
237          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
238          redo A;          redo A;
239        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
240          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
241          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => '',
242                          line => $self->{line}, column => $self->{column}};
243          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
244          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
245          redo A;          redo A;
246        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
247          ## NOTE: |num|.          ## NOTE: |num|.
248          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c},
249                          line => $self->{line}, column => $self->{column}};
250          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
251          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
252          redo A;          redo A;
253        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
254          ## NOTE: |num|.          ## NOTE: |num|.
255          $current_token = {type => NUMBER_TOKEN, value => '0'};          $self->{t} = {type => NUMBER_TOKEN, value => '0',
256                          line => $self->{line}, column => $self->{column}};
257          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
258          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
259          redo A;          redo A;
260        } elsif ($self->{c} == 0x002F) { # /        } elsif ($self->{c} == 0x002F) { # /
261          $self->{c} = $self->{get_char}->();          my ($l, $c) = ($self->{line}, $self->{column});
262            $self->{c} = $self->{get_char}->($self);
263          if ($self->{c} == 0x002A) { # *          if ($self->{c} == 0x002A) { # *
264            C: {            C: {
265              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
266              if ($self->{c} == 0x002A) { # *              if ($self->{c} == 0x002A) { # *
267                D: {                D: {
268                  $self->{c} = $self->{get_char}->();                  $self->{c} = $self->{get_char}->($self);
269                  if ($self->{c} == 0x002F) { # /                  if ($self->{c} == 0x002F) { # /
270                    #                    #
271                  } elsif ($self->{c} == 0x002A) { # *                  } elsif ($self->{c} == 0x002A) { # *
# Line 178  sub get_next_token ($) { Line 285  sub get_next_token ($) {
285            } # C            } # C
286    
287            # stay in the state.            # stay in the state.
288            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
289            redo A;            redo A;
290          } else {          } else {
291            # stay in the state.            # stay in the state.
292            # reprocess            # reprocess
293            return {type => DELIM_STATE, value => '/'};            return {type => DELIM_TOKEN, value => '/', line => $l, column => $c};
294            #redo A;            #redo A;
295          }                  }        
296        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
297            my ($l, $c) = ($self->{line}, $self->{column});
298          ## NOTE: |CDO|          ## NOTE: |CDO|
299          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
300          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
301            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
302            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
303              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
304              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
305                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
306                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->($self);
307                return {type => CDO_TOKEN};                return {type => CDO_TOKEN, line => $l, column => $c};
308                #redo A;                #redo A;
309              } else {              } else {
310                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}},
311                      {type => EXCLAMATION_TOKEN, line => $l, column => $c + 1};
312                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
313                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-',
314                                line => $l, column => $c + 2};
315                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
316                #reprocess                #reprocess
317                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<',
318                          line => $l, column => $c};
319                #redo A;                #redo A;
320              }              }
321            } else {            } else {
322              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN,
323                                            line => $l, column => $c + 1};
324              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
325              #reprocess              #reprocess
326              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<',
327                        line => $l, column => $c};
328              #redo A;              #redo A;
329            }            }
330          } else {          } else {
331            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
332            #reprocess            #reprocess
333            return {type => DELIM_TOKEN, value => '<'};            return {type => DELIM_TOKEN, value => '<',
334                      line => $l, column => $c};
335            #redo A;            #redo A;
336          }          }
337        } elsif (my $t = {        } elsif (my $t = {
338                  0x003B => SEMICOLON_TOKEN, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
339                  0x007B => LBRACE_TOKEN, # {                          0x002D => MINUS_TOKEN, # -
340                  0x007D => RBRACE_TOKEN, # }                          0x002E => DOT_TOKEN, # .
341                  0x0028 => LPAREN_TOKEN, # (                          0x003A => COLON_TOKEN, # :
342                  0x0029 => RPAREN_TOKEN, # )                          0x003B => SEMICOLON_TOKEN, # ;
343                  0x005B => LBRACKET_TOKEN, # [                          0x003D => MATCH_TOKEN, # =
344                  0x005D => RBRACKET_TOKEN, # ]                          0x007B => LBRACE_TOKEN, # {
345                            0x007D => RBRACE_TOKEN, # }
346                            0x0028 => LPAREN_TOKEN, # (
347                            0x0029 => RPAREN_TOKEN, # )
348                            0x005B => LBRACKET_TOKEN, # [
349                            0x005D => RBRACKET_TOKEN, # ]
350                 }->{$self->{c}}) {                 }->{$self->{c}}) {
351            my ($l, $c) = ($self->{line}, $self->{column});
352          # stay in the state          # stay in the state
353          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
354          return {type => $t};          return {type => $t, line => $l, column => $c};
355          # redo A;          # redo A;
356        } elsif ({        } elsif ({
357                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 240  sub get_next_token ($) { Line 360  sub get_next_token ($) {
360                  0x000A => 1, # \n                  0x000A => 1, # \n
361                  0x000C => 1, # \f                  0x000C => 1, # \f
362                 }->{$self->{c}}) {                 }->{$self->{c}}) {
363            my ($l, $c) = ($self->{line}, $self->{column});
364          W: {          W: {
365            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
366            if ({            if ({
367                  0x0020 => 1, # SP                  0x0020 => 1, # SP
368                  0x0009 => 1, # \t                  0x0009 => 1, # \t
# Line 256  sub get_next_token ($) { Line 377  sub get_next_token ($) {
377                              0x002C => COMMA_TOKEN, # ,                              0x002C => COMMA_TOKEN, # ,
378                              0x007E => TILDE_TOKEN, # ~                              0x007E => TILDE_TOKEN, # ~
379                             }->{$self->{c}}) {                             }->{$self->{c}}) {
380                my ($l, $c) = ($self->{line}, $self->{column});
381              # stay in the state              # stay in the state
382              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
383              return {type => $v};              return {type => $v, line => $l, column => $c};
384              #redo A;              #redo A;
385            } else {            } else {
386              # stay in the state              # stay in the state
387              # reprocess              # reprocess
388              return {type => S_TOKEN};              return {type => S_TOKEN, line => $l, column => $c};
389              #redo A;              #redo A;
390            }            }
391          } # W          } # W
# Line 273  sub get_next_token ($) { Line 395  sub get_next_token ($) {
395                          0x0024 => SUFFIXMATCH_TOKEN, # $                          0x0024 => SUFFIXMATCH_TOKEN, # $
396                          0x002A => SUBSTRINGMATCH_TOKEN, # *                          0x002A => SUBSTRINGMATCH_TOKEN, # *
397                         }->{$self->{c}}) {                         }->{$self->{c}}) {
398            my ($line, $column) = ($self->{line}, $self->{column});
399          my $c = $self->{c};          my $c = $self->{c};
400          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
401          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
402            # stay in the state            # stay in the state
403            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
404            return {type => $v};            return {type => $v, line => $line, column => $column};
405              #redo A;
406            } elsif ($v = {
407                           0x002A => STAR_TOKEN, # *
408                           0x007C => VBAR_TOKEN, # |
409                          }->{$c}) {
410              # stay in the state.
411              # reprocess
412              return {type => $v, line => $line, column => $column};
413            #redo A;            #redo A;
414          } else {          } else {
415            # stay in the state            # stay in the state
416            # reprocess            # reprocess
417            return {type => DELIM_TOKEN, value => chr $c};            return {type => DELIM_TOKEN, value => chr $c,
418                      line => $line, column => $column};
419            #redo A;            #redo A;
420          }          }
421        } elsif ($self->{c} == 0x002B) { # +        } elsif ($self->{c} == 0x002B) { # +
422            my ($l, $c) = ($self->{line}, $self->{column});
423          # stay in the state          # stay in the state
424          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
425          return {type => PLUS_TOKEN};          return {type => PLUS_TOKEN, line => $l, column => $c};
426          #redo A;          #redo A;
427        } elsif ($self->{c} == 0x003E) { # >        } elsif ($self->{c} == 0x003E) { # >
428            my ($l, $c) = ($self->{line}, $self->{column});
429          # stay in the state          # stay in the state
430          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
431          return {type => GREATER_TOKEN};          return {type => GREATER_TOKEN, line => $l, column => $c};
432          #redo A;          #redo A;
433        } elsif ($self->{c} == 0x002C) { # ,        } elsif ($self->{c} == 0x002C) { # ,
434            my ($l, $c) = ($self->{line}, $self->{column});
435          # stay in the state          # stay in the state
436          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
437          return {type => COMMA_TOKEN};          return {type => COMMA_TOKEN, line => $l, column => $c};
438          #redo A;          #redo A;
439        } elsif ($self->{c} == 0x007E) { # ~        } elsif ($self->{c} == 0x007E) { # ~
440          $self->{c} = $self->{get_char}->();          my ($l, $c) = ($self->{line}, $self->{column});
441            $self->{c} = $self->{get_char}->($self);
442          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
443            # stay in the state            # stay in the state
444            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
445            return {type => INCLUDES_TOKEN};            return {type => INCLUDES_TOKEN, line => $l, column => $c};
446            #redo A;            #redo A;
447          } else {          } else {
448            # stay in the state            # stay in the state
449            # reprocess            # reprocess
450            return {type => TILDE_TOKEN};            return {type => TILDE_TOKEN, line => $l, column => $c};
451            #redo A;            #redo A;
452          }          }
453        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
454          # stay in the state          # stay in the state
455          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
456          return {type => EOF_TOKEN};          return {type => EOF_TOKEN,
457                    line => $self->{line}, column => $self->{column}};
458          #redo A;          #redo A;
459        } else {        } else {
460          # stay in the state          # stay in the state
461          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c},
462          $self->{c} = $self->{get_char}->();                        line => $self->{line}, column => $self->{column}};
463          return $current_token;          $self->{c} = $self->{get_char}->($self);
464            return $self->{t};
465          #redo A;          #redo A;
466        }        }
467      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
# Line 333  sub get_next_token ($) { Line 471  sub get_next_token ($) {
471            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
472            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
473            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
474          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
475          $current_token->{type} = DIMENSION_TOKEN          $self->{t}->{type} = DIMENSION_TOKEN
476              if $current_token->{type} == NUMBER_TOKEN;              if $self->{t}->{type} == NUMBER_TOKEN;
477          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
478          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
479          redo A;          redo A;
480        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
 ## TODO: 12-\X, 12-\{nl}  
481          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
482          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
483          redo A;          redo A;
484        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D) { # -
485                 $current_token->{type} == IDENT_TOKEN) {          if ($self->{t}->{type} == IDENT_TOKEN) {
486          $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
487          if ($self->{c} == 0x003E) { # >            if ($self->{c} == 0x003E) { # >
488            $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
489            $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->($self);
490            return {type => CDC_TOKEN};              return {type => CDC_TOKEN,
491            #redo A;                      line => $self->{t}->{line},
492                        column => $self->{t}->{column}};
493                #redo A;
494              } else {
495                ## NOTE: |-|, |-|, $self->{c}
496                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
497                $self->{t}->{column}++;
498                # stay in the state
499                # reconsume
500                return {type => MINUS_TOKEN,
501                        line => $self->{t}->{line},
502                        column => $self->{t}->{column} - 1};
503                #redo A;
504              }
505            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
506              my ($l, $c) = ($self->{line}, $self->{column}); # second '-'
507              $self->{c} = $self->{get_char}->($self);
508              if ($self->{c} == 0x003E) { # >
509                unshift @{$self->{token}}, {type => CDC_TOKEN};
510                $self->{t}->{type} = NUMBER_TOKEN;
511                $self->{t}->{value} = '';
512                $self->{state} = BEFORE_TOKEN_STATE;
513                $self->{c} = $self->{get_char}->($self);
514                return $self->{t};
515                #redo A;
516              } else {
517                ## NOTE: NUMBER, |-|, |-|, $self->{c}
518                my $t = $self->{t};
519                $t->{type} = NUMBER_TOKEN;
520                $t->{value} = '';
521                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1,
522                              line => $l, column => $c};
523                unshift @{$self->{token}}, {type => MINUS_TOKEN,
524                                            line => $l, column => $c - 1};
525                # stay in the state
526                # reconsume
527                return $t;
528                #redo A;
529              }
530          } else {          } else {
531            ## NOTE: |-|, |-|, $self->{c}            #
           #$current_token = {type => IDENT_TOKEN, value => '-'};  
           # stay in the state  
           # reconsume  
           return {type => DELIM_TOKEN, value => '-'};  
           #redo A;  
532          }          }
533        } else {        } else {
534          if ($current_token->{type} == NUMBER_TOKEN) {          #
535            ## NOTE: |-| after |NUMBER|.        }
536            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};        
537            $self->{state} = BEFORE_TOKEN_STATE;        if ($self->{t}->{type} == DIMENSION_TOKEN) {
538            # reconsume          ## NOTE: |-| after |NUMBER|.
539            $current_token->{value} = $current_token->{number};          unshift @{$self->{token}}, {type => MINUS_TOKEN,
540            delete $current_token->{number};                                      line => $self->{line},
541            return $current_token;                                      column => $self->{column} - 1};
542          } else {          ## BUG: column might be wrong if on the line boundary.
543            ## NOTE: |-| not followed by |nmstart|.          $self->{state} = BEFORE_TOKEN_STATE;
544            $self->{state} = BEFORE_TOKEN_STATE;          # reprocess
545            $self->{c} = $self->{get_char}->();          $self->{t}->{type} = NUMBER_TOKEN;
546            return {type => DELIM_TOKEN, value => '-'};          $self->{t}->{value} = '';
547          }          return $self->{t};
548          } else {
549            ## NOTE: |-| not followed by |nmstart|.
550            $self->{state} = BEFORE_TOKEN_STATE;
551            # reprocess
552            return {type => MINUS_TOKEN,
553                    line => $self->{line}, column => $self->{column} - 1};
554            ## BUG: column might be wrong if on the line boundary.
555        }        }
556      } elsif ($self->{state} == AFTER_AT_STATE) {      } elsif ($self->{state} == AFTER_AT_STATE) {
557        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
558            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
559            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
560            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
561          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
562          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
563          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
564          redo A;          redo A;
565        } elsif ($self->{c} == 0x002D) { # -        } elsif ($self->{c} == 0x002D) { # -
566          $current_token->{value} .= '-';          $self->{t}->{value} .= '-';
567          $self->{state} = AFTER_AT_HYPHEN_STATE;          $self->{state} = AFTER_AT_HYPHEN_STATE;
568          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
569          redo A;          redo A;
570        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
571          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
572          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
573          redo A;          redo A;
574        } else {        } else {
575          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
576          # reprocess          # reprocess
577          return {type => DELIM_TOKEN, value => '@'};          return {type => DELIM_TOKEN, value => '@',
578                    line => $self->{t}->{line},
579                    column => $self->{t}->{column}};
580        }        }
581      } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {      } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
582        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
583            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
584            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
585            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
586          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
587          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
588          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
589          redo A;          redo A;
590        } elsif ($self->{c} == 0x002D) { # -        } elsif ($self->{c} == 0x002D) { # -
591          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
592          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
593            unshift @{$self->{token}}, {type => CDC_TOKEN};            unshift @{$self->{token}}, {type => CDC_TOKEN};
594            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
595            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
596            return {type => DELIM_TOKEN, value => '@'};            return {type => DELIM_TOKEN, value => '@'};
597            #redo A;            #redo A;
598          } else {          } else {
599            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => MINUS_TOKEN};
600            $current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
601            $self->{state} = BEFORE_NMSTART_STATE;            $self->{state} = BEFORE_NMSTART_STATE;
602            # reprocess            # reprocess
603            return {type => DELIM_TOKEN, value => '@'};            return {type => DELIM_TOKEN, value => '@'};
# Line 427  sub get_next_token ($) { Line 606  sub get_next_token ($) {
606        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
607          ## TODO: @-\{nl}          ## TODO: @-\{nl}
608          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
609          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
610          redo A;          redo A;
611        } else {        } else {
612          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          unshift @{$self->{token}}, {type => MINUS_TOKEN};
613          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
614          # reprocess          # reprocess
615          return {type => DELIM_TOKEN, value => '@'};          return {type => DELIM_TOKEN, value => '@'};
# Line 438  sub get_next_token ($) { Line 617  sub get_next_token ($) {
617      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
618        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
619          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
620          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
621            $self->{t}->{value} = '-';
622            $self->{t}->{type} = DIMENSION_TOKEN;
623          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
624          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
625          redo A;          redo A;
626        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
627                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
628                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
629                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
630          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
631          $current_token->{value} = chr $self->{c};          $self->{t}->{value} = chr $self->{c};
632          $current_token->{type} = DIMENSION_TOKEN;          $self->{t}->{type} = DIMENSION_TOKEN;
633          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
634          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
635          redo A;          redo A;
636        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
637          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
638          $current_token->{value} = '';          $self->{t}->{value} = '';
639            $self->{t}->{type} = DIMENSION_TOKEN;
640          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
641          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
642          redo A;          redo A;
643        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
644          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
645          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
646          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
647          return $current_token;          return $self->{t};
648          #redo A;          #redo A;
649        } else {        } else {
650          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
651          # reprocess          # reprocess
652          return $current_token;          return $self->{t};
653          #redo A;          #redo A;
654        }        }
655      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
# Line 478  sub get_next_token ($) { Line 660  sub get_next_token ($) {
660            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
661            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
662            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
663          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
664          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
665          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
666          redo A;          redo A;
667        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
668          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
669          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
670          redo A;          redo A;
671        } else {        } else {
672          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
673          $self->{c} = $self->{get_char}->();          # reprocess
674          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#',
675                    line => $self->{t}->{line},
676                    column => $self->{t}->{column}};
677          #redo A;          #redo A;
678        }        }
679      } elsif ($self->{state} == NAME_STATE) {      } elsif ($self->{state} == NAME_STATE) {
# Line 500  sub get_next_token ($) { Line 684  sub get_next_token ($) {
684            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
685            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
686            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
687          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
688          # stay in the state          # stay in the state
689          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
690          redo A;          redo A;
691        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
692          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
693          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
694          redo A;          redo A;
695        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
696                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
697          my $func_name = $current_token->{value};          my $func_name = $self->{t}->{value};
698          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
699          if ($func_name eq 'url' or $func_name eq 'url-prefix') {          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
700            if ($current_token->{has_escape}) {            if ($self->{t}->{has_escape}) {
701              ## TODO: warn              ## TODO: warn
702            }            }
703            $current_token->{type}            $self->{t}->{type}
704                = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;                = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
705            $current_token->{value} = '';            $self->{t}->{value} = '';
706            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
707            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
708            redo A;            redo A;
709          } else {          } else {
710            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
711            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
712            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
713            return $current_token;            return $self->{t};
714            #redo A;            #redo A;
715          }          }
716        } else {        } else {
717          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
718          # reconsume          # reconsume
719          return $current_token;          return $self->{t};
720          #redo A;          #redo A;
721        }        }
722      } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {      } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
# Line 543  sub get_next_token ($) { Line 727  sub get_next_token ($) {
727                  0x000A => 1, # \n                  0x000A => 1, # \n
728                  0x000C => 1, # \f                  0x000C => 1, # \f
729               }->{$self->{c}}) {               }->{$self->{c}}) {
730          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
731        }        }
732        if ($self->{c} == -1) {        if ($self->{c} == -1) {
733          $current_token->{type} = {          $self->{t}->{type} = {
734              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
735              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
736              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
737              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
738          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
739          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
740          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
741          return $current_token;          return $self->{t};
742          #redo A;          #redo A;
743        } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (        } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
744          ## TODO: Should we consider matches of "(" and ")"?          ## TODO: Should we consider matches of "(" and ")"?
745          $current_token->{type} = {          $self->{t}->{type} = {
746              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
747              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
748              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
749              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
750          }->{$current_token->{type}};          }->{$self->{t}->{type}};
751          $self->{state} = URI_UNQUOTED_STATE;          $self->{state} = URI_UNQUOTED_STATE;
752          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
753          redo A;          redo A;
754        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
755          $self->{state} = STRING_STATE; $q = $self->{c};          $self->{state} = STRING_STATE; $q = $self->{c};
756          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
757          redo A;          redo A;
758        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
759          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
760          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
761          return $current_token;          return $self->{t};
762          #redo A;          #redo A;
763        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
764          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
765          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
766          redo A;          redo A;
767        } else {        } else {
768          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
769          $self->{state} = URI_UNQUOTED_STATE;          $self->{state} = URI_UNQUOTED_STATE;
770          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
771          redo A;          redo A;
772        }        }
773      } elsif ($self->{state} == URI_UNQUOTED_STATE) {      } elsif ($self->{state} == URI_UNQUOTED_STATE) {
# Line 595  sub get_next_token ($) { Line 779  sub get_next_token ($) {
779             0x000C => 1, # \f             0x000C => 1, # \f
780            }->{$self->{c}}) {            }->{$self->{c}}) {
781          $self->{state} = URI_AFTER_WSP_STATE;          $self->{state} = URI_AFTER_WSP_STATE;
782          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
783          redo A;          redo A;
784        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
785          $current_token->{type} = {          $self->{t}->{type} = {
786              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
787              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
788              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
789              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
790          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
791          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
792          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
793          return $current_token;          return $self->{t};
794          #redo A;          #redo A;
795        } elsif ($self->{c} < 0x0020 or {        } elsif ($self->{c} < 0x0020 or {
796            0x0022 => 1, # "            0x0022 => 1, # "
# Line 614  sub get_next_token ($) { Line 798  sub get_next_token ($) {
798            0x0028 => 1, # (            0x0028 => 1, # (
799        }->{$self->{c}}) { # C0 or (        }->{$self->{c}}) { # C0 or (
800          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
801          $current_token->{type} = {          $self->{t}->{type} = {
802              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
803              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
804              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
805              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
806          }->{$current_token->{type}};          }->{$self->{t}->{type}};
807          # stay in the state.          # stay in the state.
808          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
809          redo A;          redo A;
810        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
811          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
812          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
813          return $current_token;          return $self->{t};
814          #redo A;          #redo A;
815        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
816          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
817          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
818          redo A;          redo A;
819        } else {        } else {
820          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
821          # stay in the state.          # stay in the state.
822          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
823          redo A;          redo A;
824        }        }
825      } elsif ($self->{state} == URI_AFTER_WSP_STATE) {      } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
# Line 647  sub get_next_token ($) { Line 831  sub get_next_token ($) {
831             0x000C => 1, # \f             0x000C => 1, # \f
832            }->{$self->{c}}) {            }->{$self->{c}}) {
833          # stay in the state.          # stay in the state.
834          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
835          redo A;          redo A;
836        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
837          $current_token->{type} = {          $self->{t}->{type} = {
838              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
839              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
840              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
841              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
842          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
843          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
844          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
845          return $current_token;          return $self->{t};
846          #redo A;          #redo A;
847        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
848          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
849          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
850          return $current_token;          return $self->{t};
851          #redo A;          #redo A;
852        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
853          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
854          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
855          redo A;          redo A;
856        } else {        } else {
857          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
858          $current_token->{type} = {          $self->{t}->{type} = {
859              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
860              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
861              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
862              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
863          }->{$current_token->{type}};          }->{$self->{t}->{type}};
864          # stay in the state.          # stay in the state.
865          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
866          redo A;          redo A;
867        }        }
868      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
869        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
870        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
871          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
872          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
873          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
874          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
875          redo A;          redo A;
876        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
877          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
878          $char = $self->{c} - 0x0041 + 0xA;          $char = $self->{c} - 0x0041 + 0xA;
879          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
880          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
881          redo A;          redo A;
882        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
883          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
884          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
885          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
886          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
887          redo A;          redo A;
888        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
889                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
890          if ($q == 0) {          if ($q == 0) {
891            ## NOTE: In |escape| in ... in |ident|.            #
           $self->{state} = BEFORE_TOKEN_STATE;  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};  
           return $current_token;  
           # reconsume  
           #redo A;  
892          } elsif ($q == 1) {          } elsif ($q == 1) {
893            ## NOTE: In |escape| in |URI|.            ## NOTE: In |escape| in |URI|.
894            $current_token->{type} = {            $self->{t}->{type} = {
895                URI_TOKEN, URI_INVALID_TOKEN,                URI_TOKEN, URI_INVALID_TOKEN,
896                URI_INVALID_TOKEN, URI_INVALID_TOKEN,                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
897                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
898                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
899            }->{$current_token->{type}};            }->{$self->{t}->{type}};
900            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
901            $self->{state} = URI_UNQUOTED_STATE;            $self->{state} = URI_UNQUOTED_STATE;
902            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
903            redo A;            redo A;
904          } else {          } else {
905            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= chr $self->{c};  
906            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
907            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
908            redo A;            redo A;
909          }          }
910        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
911          if ($q == 0) {          if ($q == 0) {
912            ## NOTE: In |escape| in ... in |ident|.            #
           $self->{state} = BEFORE_TOKEN_STATE;  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};  
           return $current_token;  
           # reconsume  
           #redo A;  
913          } elsif ($q == 1) {          } elsif ($q == 1) {
914            $current_token->{type} = {            ## NOTE: In |escape| in |URI|.
915              $self->{t}->{type} = {
916                URI_TOKEN, URI_INVALID_TOKEN,                URI_TOKEN, URI_INVALID_TOKEN,
917                URI_INVALID_TOKEN, URI_INVALID_TOKEN,                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
918                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
919                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
920            }->{$current_token->{type}};            }->{$self->{t}->{type}};
921            $current_token->{value} .= "\x0D\x0A";            $self->{state} = ESCAPE_BEFORE_LF_STATE;
922            $self->{state} = URI_UNQUOTED_STATE;            $self->{c} = $self->{get_char}->($self);
           $self->{c} = $self->{get_char}->();  
923            redo A;            redo A;
924          } else {          } else {
925            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= "\x0D\x0A";  
926            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
927            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
928            redo A;            redo A;
929          }          }
930          } elsif ($self->{c} == -1) {
931            #
932        } else {        } else {
933          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
934          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
935          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
936              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
937          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
938          redo A;          redo A;
939        }        }
940    
941          if ($q == 0) {
942            if ($self->{t}->{type} == DIMENSION_TOKEN) {
943              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
944                $self->{state} = BEFORE_TOKEN_STATE;
945                # reprocess
946                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
947                                            line => $self->{line},
948                                            column => $self->{column} - 2};
949                unshift @{$self->{token}}, {type => MINUS_TOKEN,
950                                            line => $self->{line},
951                                            column => $self->{column} - 1};
952                ## BUG: line and column might be wrong if they are on the
953                ## line boundary.
954                $self->{t}->{type} = NUMBER_TOKEN;
955                $self->{t}->{value} = '';
956                return $self->{t};
957                #redo A;
958              } elsif (length $self->{t}->{value}) {
959                $self->{state} = BEFORE_TOKEN_STATE;
960                # reprocess
961                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
962                                            line => $self->{line},
963                                            column => $self->{column} - 1};
964                ## BUG: line and column might be wrong if they are on the
965                ## line boundary.
966                return $self->{t};
967                #redo A;
968              } else {
969                $self->{state} = BEFORE_TOKEN_STATE;
970                # reprocess
971                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
972                                            line => $self->{line},
973                                            column => $self->{column} - 1};
974                ## BUG: line and column might be wrong if they are on the
975                ## line boundary.
976                $self->{t}->{type} = NUMBER_TOKEN;
977                $self->{t}->{value} = '';
978                return $self->{t};
979                #redo A;
980              }
981            } else {
982              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
983                $self->{state} = BEFORE_TOKEN_STATE;
984                # reprocess
985                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
986                                            line => $self->{line},
987                                            column => $self->{column} - 2};
988                return {type => MINUS_TOKEN,
989                        line => $self->{line},
990                        column => $self->{column} - 1};
991                ## BUG: line and column might be wrong if they are on the
992                ## line boundary.
993                #redo A;
994              } elsif (length $self->{t}->{value}) {
995                $self->{state} = BEFORE_TOKEN_STATE;
996                # reprocess
997                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
998                                            line => $self->{line},
999                                            column => $self->{column} - 1};
1000                ## BUG: line and column might be wrong if they are on the
1001                ## line boundary.
1002                return $self->{t};
1003                #redo A;
1004              } else {
1005                $self->{state} = BEFORE_TOKEN_STATE;
1006                # reprocess
1007                return {type => DELIM_TOKEN, value => '\\',
1008                        line => $self->{line},
1009                        column => $self->{column} - 1};
1010                ## BUG: line and column might be wrong if they are on the
1011                ## line boundary.
1012                #redo A;
1013              }
1014            }
1015          } elsif ($q == 1) {
1016            $self->{state} = URI_UNQUOTED_STATE;
1017            $self->{c} = $self->{get_char}->($self);
1018            redo A;
1019          } else {
1020            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\',
1021                                        line => $self->{line},
1022                                        column => $self->{column} - 1};
1023            ## BUG: line and column might be wrong if they are on the
1024            ## line boundary.
1025            $self->{t}->{type} = {
1026              STRING_TOKEN, INVALID_TOKEN,
1027              URI_TOKEN, URI_INVALID_TOKEN,
1028              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1029            }->{$self->{t}->{type}} || $self->{t}->{type};
1030            $self->{state} = BEFORE_TOKEN_STATE;
1031            # reprocess
1032            return $self->{t};
1033            #redo A;
1034          }
1035      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
1036        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
1037        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
1038          $char = $char * 0x10 + $self->{c} - 0x0030;          $char = $char * 0x10 + $self->{c} - 0x0030;
1039          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1040          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1041          redo A;          redo A;
1042        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F        } elsif (0x0041 <= $self->{c} and $self->{c} <= 0x0046) { # A..F
1043          $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;          $char = $char * 0x10 + $self->{c} - 0x0041 + 0xA;
1044          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1045          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1046          redo A;          redo A;
1047        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
1048          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
1049          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
1050          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1051          redo A;          redo A;
1052        } elsif ($self->{c} == 0x0020 or # SP        } elsif ($self->{c} == 0x0020 or # SP
1053                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
1054                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
1055                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
1056          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1057          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1058              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1059          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1060          redo A;          redo A;
1061        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
1062          $self->{state} = ESCAPE_BEFORE_LF_STATE;          $self->{state} = ESCAPE_BEFORE_LF_STATE;
1063          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1064          redo A;          redo A;
1065        } else {        } else {
1066          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1067          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1068              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1069          # reconsume          # reconsume
# Line 806  sub get_next_token ($) { Line 1075  sub get_next_token ($) {
1075            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
1076            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
1077            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
1078          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1079          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1080              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1081          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1082          redo A;          redo A;
1083        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
1084          $self->{state} = ESCAPE_BEFORE_NL_STATE;          $self->{state} = ESCAPE_BEFORE_NL_STATE;
1085          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1086          redo A;          redo A;
1087        } else {        } else {
1088          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1089          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1090              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1091          # reconsume          # reconsume
1092          redo A;          redo A;
1093        }        }
1094      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1095        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1096        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
         $current_token->{value} .= chr $char;  
1097          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1098              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1099          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1100          redo A;          redo A;
1101        } else {        } else {
         $current_token->{value} .= chr $char;  
1102          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1103              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1104          # reconsume          # reprocess
1105          redo A;          redo A;
1106        }        }
1107      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
# Line 844  sub get_next_token ($) { Line 1111  sub get_next_token ($) {
1111        ## Or, in |URI|.        ## Or, in |URI|.
1112        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
1113          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
1114          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1115          redo A;          redo A;
1116        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1117          if ($current_token->{type} == STRING_TOKEN) {          if ($self->{t}->{type} == STRING_TOKEN) {
1118            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
1119            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
1120            return $current_token;            return $self->{t};
1121            #redo A;            #redo A;
1122          } else {          } else {
1123            $self->{state} = URI_AFTER_WSP_STATE;            $self->{state} = URI_AFTER_WSP_STATE;
1124            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->($self);
1125            redo A;            redo A;
1126          }          }
1127        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
1128                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1129                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1130                 $self->{c} == -1) {                 $self->{c} == -1) {
1131          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1132              STRING_TOKEN, INVALID_TOKEN,
1133              INVALID_TOKEN, INVALID_TOKEN,
1134              URI_TOKEN, URI_INVALID_TOKEN,
1135              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1136              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1137              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1138            }->{$self->{t}->{type}};
1139          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1140          # reconsume          # reconsume
1141          return $current_token;          return $self->{t};
1142          #redo A;          #redo A;
1143        } else {        } else {
1144          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1145          # stay in the state          # stay in the state
1146          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1147          redo A;          redo A;
1148        }        }
1149      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1150        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1151        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1152          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1153          # stay in the state          # stay in the state
1154          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1155          redo A;          redo A;
1156        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
1157          $self->{state} = NUMBER_DOT_STATE;          $self->{state} = NUMBER_DOT_STATE;
1158          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1159          redo A;          redo A;
1160        } else {        } else {
1161          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1162          $current_token->{value} = '';          $self->{t}->{value} = '';
1163          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1164          # reprocess          # reprocess
1165          redo A;          redo A;
# Line 893  sub get_next_token ($) { Line 1167  sub get_next_token ($) {
1167      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1168        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1169        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1170          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1171          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1172          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1173          redo A;          redo A;
1174        } else {        } else {
1175          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1176          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1177          $current_token->{value} = '';          $self->{t}->{value} = '';
1178          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1179          # reprocess          # reprocess
1180          return $current_token;          return $self->{t};
1181          #redo A;          #redo A;
1182        }        }
1183      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1184        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1185        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1186          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1187          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1188          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1189          redo A;          redo A;
1190        } else {        } else {
1191          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1192          $self->{c} = $self->{get_char}->();          # reprocess
1193          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN,
1194                    line => $self->{line}, column => $self->{column} - 1};
1195            ## BUG: line and column might be wrong if they are on the
1196            ## line boundary.
1197          #redo A;          #redo A;
1198        }        }
1199      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1200        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1201        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1202          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1203          # stay in the state          # stay in the state
1204          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
1205          redo A;          redo A;
1206        } else {        } else {
1207          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1208          $current_token->{value} = '';          $self->{t}->{value} = '';
1209          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1210          # reprocess          # reprocess
1211          redo A;          redo A;
# Line 937  sub get_next_token ($) { Line 1214  sub get_next_token ($) {
1214        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1215      }      }
1216    } # A    } # A
1217    } # get_next_token
1218    
## serialize_token ($self, $t)
##
## Returns a short textual representation of the token hash reference
## |$t|, selected by |$t->{type}|.  The invocant is ignored, so the
## method can be called on the class or on a tokenizer object.
##
## - Tokens that carry text (|IDENT|, |ATKEYWORD|, |HASH|, |FUNCTION|,
##   |URI|, |STRING|, |DIMENSION|, |UNICODE-RANGE|, |DELIM|, ...) are
##   rendered from |$t->{value}| and/or |$t->{number}|.
## - Punctuation tokens are rendered as their fixed spelling.
## - A token whose type is not listed here is rendered as |{type}|,
##   where |type| is the numeric token type.
##
## NOTE: This function is not intended for roundtrip-able serialization:
## values are not re-escaped, strings are always written with |"|, and
## comments and white space are normalized.
sub serialize_token ($$) {
  ## The first argument (class name or object) is not used.
  my (undef, $t) = @_;

  if ($t->{type} == IDENT_TOKEN) {
    return $t->{value};
  } elsif ($t->{type} == ATKEYWORD_TOKEN) {
    return '@' . $t->{value};
  } elsif ($t->{type} == HASH_TOKEN) {
    return '#' . $t->{value};
  } elsif ($t->{type} == FUNCTION_TOKEN) {
    return $t->{value} . '(';
  } elsif ($t->{type} == URI_TOKEN) {
    return 'url(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_INVALID_TOKEN) {
    ## Invalid variants are left unterminated on purpose.
    return 'url(' . $t->{value};
  } elsif ($t->{type} == URI_PREFIX_TOKEN) {
    return 'url-prefix(' . $t->{value} . ')';
  } elsif ($t->{type} == URI_PREFIX_INVALID_TOKEN) {
    return 'url-prefix(' . $t->{value};
  } elsif ($t->{type} == STRING_TOKEN) {
    return '"' . $t->{value} . '"';
  } elsif ($t->{type} == INVALID_TOKEN) {
    return '"' . $t->{value};
  } elsif ($t->{type} == NUMBER_TOKEN) {
    return $t->{number};
  } elsif ($t->{type} == DIMENSION_TOKEN) {
    ## |number| holds the numeric part, |value| the unit.
    return $t->{number} . $t->{value};
  } elsif ($t->{type} == PERCENTAGE_TOKEN) {
    return $t->{number} . '%';
  } elsif ($t->{type} == UNICODE_RANGE_TOKEN) {
    return 'U+' . $t->{value};
  } elsif ($t->{type} == DELIM_TOKEN) {
    return $t->{value};
  } elsif ($t->{type} == PLUS_TOKEN) {
    return '+';
  } elsif ($t->{type} == GREATER_TOKEN) {
    return '>';
  } elsif ($t->{type} == COMMA_TOKEN) {
    return ',';
  } elsif ($t->{type} == TILDE_TOKEN) {
    return '~';
  } elsif ($t->{type} == DASHMATCH_TOKEN) {
    return '|=';
  } elsif ($t->{type} == PREFIXMATCH_TOKEN) {
    return '^=';
  } elsif ($t->{type} == SUFFIXMATCH_TOKEN) {
    return '$=';
  } elsif ($t->{type} == SUBSTRINGMATCH_TOKEN) {
    return '*=';
  } elsif ($t->{type} == INCLUDES_TOKEN) {
    return '~=';
  } elsif ($t->{type} == SEMICOLON_TOKEN) {
    return ';';
  } elsif ($t->{type} == LBRACE_TOKEN) {
    return '{';
  } elsif ($t->{type} == RBRACE_TOKEN) {
    return '}';
  } elsif ($t->{type} == LPAREN_TOKEN) {
    return '(';
  } elsif ($t->{type} == RPAREN_TOKEN) {
    return ')';
  } elsif ($t->{type} == LBRACKET_TOKEN) {
    return '[';
  } elsif ($t->{type} == RBRACKET_TOKEN) {
    return ']';
  } elsif ($t->{type} == S_TOKEN) {
    ## Any run of white space is collapsed to a single space.
    return ' ';
  } elsif ($t->{type} == CDO_TOKEN) {
    return '<!--';
  } elsif ($t->{type} == CDC_TOKEN) {
    return '-->';
  } elsif ($t->{type} == COMMENT_TOKEN) {
    ## The comment text is not reproduced.
    return '/**/';
  } elsif ($t->{type} == COMMENT_INVALID_TOKEN) {
    return '/*';
  } elsif ($t->{type} == EOF_TOKEN) {
    return '{EOF}';
  } elsif ($t->{type} == MINUS_TOKEN) {
    return '-';
  } elsif ($t->{type} == STAR_TOKEN) {
    return '*';
  } elsif ($t->{type} == VBAR_TOKEN) {
    return '|';
  } elsif ($t->{type} == DOT_TOKEN) {
    ## |DOT_TOKEN| is emitted by the tokenizer for a |.| that is not
    ## part of a number; without this branch it would be serialized
    ## as |{42}| by the fallback below.
    return '.';
  } elsif ($t->{type} == COLON_TOKEN) {
    return ':';
  } elsif ($t->{type} == MATCH_TOKEN) {
    return '=';
  } elsif ($t->{type} == EXCLAMATION_TOKEN) {
    return '!';
  } else {
    ## Unknown token type: expose the numeric type for debugging.
    return '{'.$t->{type}.'}';
  }
} # serialize_token
1315    
1316    =head1 LICENSE
1317    
1318    Copyright 2007 Wakaba <w@suika.fam.cx>
1319    
1320    This library is free software; you can redistribute it
1321    and/or modify it under the same terms as Perl itself.
1322    
1323    =cut
1324    
1325  1;  1;
1326  # $Date$  # $Date$

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.19

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24