/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.4 by wakaba, Sat Sep 8 02:58:24 2007 UTC | revision 1.16 by wakaba, Wed Oct 17 10:46:26 2007 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    require Exporter;
6    push our @ISA, 'Exporter';
7    
8  sub BEFORE_TOKEN_STATE () { 0 }  sub BEFORE_TOKEN_STATE () { 0 }
9  sub BEFORE_NMSTART_STATE () { 1 }  sub BEFORE_NMSTART_STATE () { 1 }
# Line 36  sub NUMBER_TOKEN () { 11 } Line 40  sub NUMBER_TOKEN () { 11 }
40  sub DIMENSION_TOKEN () { 12 }  sub DIMENSION_TOKEN () { 12 }
41  sub PERCENTAGE_TOKEN () { 13 }  sub PERCENTAGE_TOKEN () { 13 }
42  sub UNICODE_RANGE_TOKEN () { 14 }  sub UNICODE_RANGE_TOKEN () { 14 }
 sub UNICODE_RANGE_INVALID_TOKEN () { 15 }  
43  sub DELIM_TOKEN () { 16 }  sub DELIM_TOKEN () { 16 }
44  sub PLUS_TOKEN () { 17 }  sub PLUS_TOKEN () { 17 }
45  sub GREATER_TOKEN () { 18 }  sub GREATER_TOKEN () { 18 }
# Line 60  sub CDC_TOKEN () { 35 } Line 63  sub CDC_TOKEN () { 35 }
63  sub COMMENT_TOKEN () { 36 }  sub COMMENT_TOKEN () { 36 }
64  sub COMMENT_INVALID_TOKEN () { 37 }  sub COMMENT_INVALID_TOKEN () { 37 }
65  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
66    sub MINUS_TOKEN () { 39 }
67    sub STAR_TOKEN () { 40 }
68    sub VBAR_TOKEN () { 41 }
69    sub DOT_TOKEN () { 42 }
70    sub COLON_TOKEN () { 43 }
71    sub MATCH_TOKEN () { 44 }
72    sub EXCLAMATION_TOKEN () { 45 }
73    
74  our @TokenName = qw(  our @TokenName = qw(
75    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
76    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
77    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80    COMMENT_INVALID EOF    COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81    );
82    
83    our @EXPORT_OK = qw(
84      IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85      URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86      STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87      UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88      TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89      SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90      RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91      S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92      MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93      EXCLAMATION_TOKEN
94  );  );
95    
96    our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
98  sub new ($) {  sub new ($) {
99    my $self = bless {token => [], get_char => sub { -1 },    my $self = bless {token => [], get_char => sub { -1 },
100                      onerror => sub { }}, shift;                      onerror => sub { }}, shift;
# Line 80  sub init ($) { Line 105  sub init ($) {
105    my $self = shift;    my $self = shift;
106    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
107    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
108      #$self->{t} = {type => token-type, value => value, number => number};
109  } # init  } # init
110    
111  sub get_next_token ($) {  sub get_next_token ($) {
# Line 88  sub get_next_token ($) { Line 114  sub get_next_token ($) {
114      return shift @{$self->{token}};      return shift @{$self->{token}};
115    }    }
116    
   my $current_token;  
117    my $char;    my $char;
118    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
119    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
# Line 103  sub get_next_token ($) { Line 128  sub get_next_token ($) {
128      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
129        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
130          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
131          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
132          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
133          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
134          redo A;          redo A;
135          } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
136            $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
137            $self->{c} = $self->{get_char}->();
138            if ($self->{c} == 0x002B) { # +
139              $self->{c} = $self->{get_char}->();
140              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
141                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
142                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
143                  $self->{c} == 0x003F) { # ?
144                $self->{t}->{value} = chr $self->{c};
145                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
146                $self->{c} = $self->{get_char}->();
147                C: for (2..6) {
148                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
149                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
150                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
151                      $self->{c} == 0x003F) { # ?
152                    $self->{t}->{value} .= chr $self->{c};
153                    $self->{c} = $self->{get_char}->();
154                  } else {
155                    last C;
156                  }
157                } # C
158    
159                if ($self->{c} == 0x002D) { # -
160                  $self->{c} = $self->{get_char}->();
161                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
162                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
163                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
164                    $self->{t}->{value} .= '-' . chr $self->{c};
165                    $self->{c} = $self->{get_char}->();
166                    C: for (2..6) {
167                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
168                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
169                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
170                        $self->{t}->{value} .= chr $self->{c};
171                        $self->{c} = $self->{get_char}->();
172                      } else {
173                        last C;
174                      }
175                    } # C
176                    
177                    #
178                  } else {
179                    my $token = $self->{t};
180                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
181                    $self->{state} = BEFORE_NMSTART_STATE;
182                    # reprocess
183                    return $token;
184                    #redo A;
185                  }
186                }
187    
188                $self->{state} = BEFORE_TOKEN_STATE;
189                # reprocess
190                return $self->{t};
191                #redo A;
192              } else {
193                unshift @{$self->{token}}, {type => PLUS_TOKEN};
194                $self->{state} = BEFORE_TOKEN_STATE;
195                # reprocess
196                return $self->{t};
197                #redo A;
198              }
199            } else {
200              $self->{state} = NAME_STATE;
201              # reprocess
202              redo A;
203            }
204        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
205                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
206                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
207                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
208          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
209          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
210          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
211          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
212          redo A;          redo A;
213        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
214          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
215          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
216          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
217          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
218          redo A;          redo A;
219        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
220          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
221          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
222          $self->{state} = AFTER_AT_STATE;          $self->{state} = AFTER_AT_STATE;
223          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
224          redo A;          redo A;
225        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
226          $current_token = {type => STRING_TOKEN, value => ''};          $self->{t} = {type => STRING_TOKEN, value => ''};
227          $self->{state} = STRING_STATE; $q = $self->{c};          $self->{state} = STRING_STATE; $q = $self->{c};
228          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
229          redo A;          redo A;
230        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
231          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
232          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
233          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
234          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
235          redo A;          redo A;
236        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
237          ## NOTE: |num|.          ## NOTE: |num|.
238          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
239          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
240          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
241          redo A;          redo A;
242        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
243          ## NOTE: |num|.          ## NOTE: |num|.
244          $current_token = {type => NUMBER_TOKEN, value => '0'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
245          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
246          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
247          redo A;          redo A;
# Line 183  sub get_next_token ($) { Line 277  sub get_next_token ($) {
277          } else {          } else {
278            # stay in the state.            # stay in the state.
279            # reprocess            # reprocess
280            return {type => DELIM_STATE, value => '/'};            return {type => DELIM_TOKEN, value => '/'};
281            #redo A;            #redo A;
282          }                  }        
283        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
# Line 191  sub get_next_token ($) { Line 285  sub get_next_token ($) {
285          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
286          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
287            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
288            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
289              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->();
290              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
291                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
292                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->();
293                return {type => CDO_TOKEN};                return {type => CDO_TOKEN};
294                #redo A;                #redo A;
295              } else {              } else {
296                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
297                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
298                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
299                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
300                #reprocess                #reprocess
301                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
302                #redo A;                #redo A;
303              }              }
304            } else {            } else {
305              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
306              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
307              #reprocess              #reprocess
308              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<'};
# Line 221  sub get_next_token ($) { Line 315  sub get_next_token ($) {
315            #redo A;            #redo A;
316          }          }
317        } elsif (my $t = {        } elsif (my $t = {
318                  0x003B => SEMICOLON_TOKEN, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
319                  0x007B => LBRACE_TOKEN, # {                          0x002D => MINUS_TOKEN, # -
320                  0x007D => RBRACE_TOKEN, # }                          0x002E => DOT_TOKEN, # .
321                  0x0028 => LPAREN_TOKEN, # (                          0x003A => COLON_TOKEN, # :
322                  0x0029 => RPAREN_TOKEN, # )                          0x003B => SEMICOLON_TOKEN, # ;
323                  0x005B => LBRACKET_TOKEN, # [                          0x003D => MATCH_TOKEN, # =
324                  0x005D => RBRACKET_TOKEN, # ]                          0x007B => LBRACE_TOKEN, # {
325                            0x007D => RBRACE_TOKEN, # }
326                            0x0028 => LPAREN_TOKEN, # (
327                            0x0029 => RPAREN_TOKEN, # )
328                            0x005B => LBRACKET_TOKEN, # [
329                            0x005D => RBRACKET_TOKEN, # ]
330                 }->{$self->{c}}) {                 }->{$self->{c}}) {
331          # stay in the state          # stay in the state
332          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 280  sub get_next_token ($) { Line 379  sub get_next_token ($) {
379            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
380            return {type => $v};            return {type => $v};
381            #redo A;            #redo A;
382            } elsif ($v = {
383                           0x002A => STAR_TOKEN, # *
384                           0x007C => VBAR_TOKEN, # |
385                          }->{$c}) {
386              # stay in the state.
387              # reprocess
388              return {type => $v};
389              #redo A;
390          } else {          } else {
391            # stay in the state            # stay in the state
392            # reprocess            # reprocess
# Line 321  sub get_next_token ($) { Line 428  sub get_next_token ($) {
428          #redo A;          #redo A;
429        } else {        } else {
430          # stay in the state          # stay in the state
431          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
432          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
433          return $current_token;          return $self->{t};
434          #redo A;          #redo A;
435        }        }
436      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
# Line 333  sub get_next_token ($) { Line 440  sub get_next_token ($) {
440            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
441            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
442            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
443          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
444          $current_token->{type} = DIMENSION_TOKEN          $self->{t}->{type} = DIMENSION_TOKEN
445              if $current_token->{type} == NUMBER_TOKEN;              if $self->{t}->{type} == NUMBER_TOKEN;
446          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
447          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
448          redo A;          redo A;
449        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
 ## TODO: 12-\X, 12-\{nl}  
450          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
451          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
452          redo A;          redo A;
453        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D) { # -
454                 $current_token->{type} == IDENT_TOKEN) {          if ($self->{t}->{type} == IDENT_TOKEN) {
         $self->{c} = $self->{get_char}->();  
         if ($self->{c} == 0x003E) { # >  
           $self->{state} = BEFORE_TOKEN_STATE;  
455            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
456            return {type => CDC_TOKEN};            if ($self->{c} == 0x003E) { # >
457            #redo A;              $self->{state} = BEFORE_TOKEN_STATE;
458                $self->{c} = $self->{get_char}->();
459                return {type => CDC_TOKEN};
460                #redo A;
461              } else {
462                ## NOTE: |-|, |-|, $self->{c}
463                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464                # stay in the state
465                # reconsume
466                return {type => MINUS_TOKEN};
467                #redo A;
468              }
469            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
470              $self->{c} = $self->{get_char}->();
471              if ($self->{c} == 0x003E) { # >
472                unshift @{$self->{token}}, {type => CDC_TOKEN};
473                $self->{t}->{type} = NUMBER_TOKEN;
474                $self->{t}->{value} = '';
475                $self->{state} = BEFORE_TOKEN_STATE;
476                $self->{c} = $self->{get_char}->();
477                return $self->{t};
478                #redo A;
479              } else {
480                ## NOTE: |-|, |-|, $self->{c}
481                my $t = $self->{t};
482                $t->{type} = NUMBER_TOKEN;
483                $t->{value} = '';
484                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485                unshift @{$self->{token}}, {type => MINUS_TOKEN};
486                # stay in the state
487                # reconsume
488                return $t;
489                #redo A;
490              }
491          } else {          } else {
492            ## NOTE: |-|, |-|, $self->{c}            #
           #$current_token = {type => IDENT_TOKEN, value => '-'};  
           # stay in the state  
           # reconsume  
           return {type => DELIM_TOKEN, value => '-'};  
           #redo A;  
493          }          }
494        } else {        } else {
495          if ($current_token->{type} == NUMBER_TOKEN) {          #
496            ## NOTE: |-| after |NUMBER|.        }
497            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};        
498            $self->{state} = BEFORE_TOKEN_STATE;        if ($self->{t}->{type} == DIMENSION_TOKEN) {
499            # reconsume          ## NOTE: |-| after |NUMBER|.
500            $current_token->{value} = $current_token->{number};          unshift @{$self->{token}}, {type => MINUS_TOKEN};
501            delete $current_token->{number};          $self->{state} = BEFORE_TOKEN_STATE;
502            return $current_token;          # reprocess
503          } else {          $self->{t}->{type} = NUMBER_TOKEN;
504            ## NOTE: |-| not followed by |nmstart|.          $self->{t}->{value} = '';
505            $self->{state} = BEFORE_TOKEN_STATE;          return $self->{t};
506            $self->{c} = $self->{get_char}->();        } else {
507            return {type => DELIM_TOKEN, value => '-'};          ## NOTE: |-| not followed by |nmstart|.
508          }          $self->{state} = BEFORE_TOKEN_STATE;
509            # reprocess
510            return {type => MINUS_TOKEN};
511        }        }
512      } elsif ($self->{state} == AFTER_AT_STATE) {      } elsif ($self->{state} == AFTER_AT_STATE) {
513        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
514            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
515            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
516            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
517          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
518          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
519          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
520          redo A;          redo A;
521        } elsif ($self->{c} == 0x002D) { # -        } elsif ($self->{c} == 0x002D) { # -
522          $current_token->{value} .= '-';          $self->{t}->{value} .= '-';
523          $self->{state} = AFTER_AT_HYPHEN_STATE;          $self->{state} = AFTER_AT_HYPHEN_STATE;
524          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
525          redo A;          redo A;
# Line 404  sub get_next_token ($) { Line 537  sub get_next_token ($) {
537            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
538            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
539            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
540          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
541          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
542          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
543          redo A;          redo A;
# Line 417  sub get_next_token ($) { Line 550  sub get_next_token ($) {
550            return {type => DELIM_TOKEN, value => '@'};            return {type => DELIM_TOKEN, value => '@'};
551            #redo A;            #redo A;
552          } else {          } else {
553            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => MINUS_TOKEN};
554            $current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
555            $self->{state} = BEFORE_NMSTART_STATE;            $self->{state} = BEFORE_NMSTART_STATE;
556            # reprocess            # reprocess
557            return {type => DELIM_TOKEN, value => '@'};            return {type => DELIM_TOKEN, value => '@'};
# Line 430  sub get_next_token ($) { Line 563  sub get_next_token ($) {
563          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
564          redo A;          redo A;
565        } else {        } else {
566          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          unshift @{$self->{token}}, {type => MINUS_TOKEN};
567          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
568          # reprocess          # reprocess
569          return {type => DELIM_TOKEN, value => '@'};          return {type => DELIM_TOKEN, value => '@'};
# Line 438  sub get_next_token ($) { Line 571  sub get_next_token ($) {
571      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
572        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
573          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
574          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
575            $self->{t}->{value} = '-';
576            $self->{t}->{type} = DIMENSION_TOKEN;
577          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
578          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
579          redo A;          redo A;
# Line 447  sub get_next_token ($) { Line 582  sub get_next_token ($) {
582                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
583                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
584          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
585          $current_token->{value} = chr $self->{c};          $self->{t}->{value} = chr $self->{c};
586          $current_token->{type} = DIMENSION_TOKEN;          $self->{t}->{type} = DIMENSION_TOKEN;
587          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
588          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
589          redo A;          redo A;
590        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
591          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
592          $current_token->{value} = '';          $self->{t}->{value} = '';
593            $self->{t}->{type} = DIMENSION_TOKEN;
594          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
595          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
596          redo A;          redo A;
597        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
598          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
599          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
600          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
601          return $current_token;          return $self->{t};
602          #redo A;          #redo A;
603        } else {        } else {
604          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
605          # reprocess          # reprocess
606          return $current_token;          return $self->{t};
607          #redo A;          #redo A;
608        }        }
609      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
# Line 478  sub get_next_token ($) { Line 614  sub get_next_token ($) {
614            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
615            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
616            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
617          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
618          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
619          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
620          redo A;          redo A;
# Line 488  sub get_next_token ($) { Line 624  sub get_next_token ($) {
624          redo A;          redo A;
625        } else {        } else {
626          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
627          $self->{c} = $self->{get_char}->();          # reprocess
628          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#'};
629          #redo A;          #redo A;
630        }        }
# Line 500  sub get_next_token ($) { Line 636  sub get_next_token ($) {
636            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
637            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
638            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
639          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
640          # stay in the state          # stay in the state
641          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
642          redo A;          redo A;
# Line 509  sub get_next_token ($) { Line 645  sub get_next_token ($) {
645          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
646          redo A;          redo A;
647        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
648                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
649          my $func_name = $current_token->{value};          my $func_name = $self->{t}->{value};
650          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
651          if ($func_name eq 'url' or $func_name eq 'url-prefix') {          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
652            if ($current_token->{has_escape}) {            if ($self->{t}->{has_escape}) {
653              ## TODO: warn              ## TODO: warn
654            }            }
655            $current_token->{type}            $self->{t}->{type}
656                = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;                = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
657            $current_token->{value} = '';            $self->{t}->{value} = '';
658            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
659            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
660            redo A;            redo A;
661          } else {          } else {
662            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
663            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
664            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
665            return $current_token;            return $self->{t};
666            #redo A;            #redo A;
667          }          }
668        } else {        } else {
669          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
670          # reconsume          # reconsume
671          return $current_token;          return $self->{t};
672          #redo A;          #redo A;
673        }        }
674      } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {      } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
# Line 546  sub get_next_token ($) { Line 682  sub get_next_token ($) {
682          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
683        }        }
684        if ($self->{c} == -1) {        if ($self->{c} == -1) {
685          $current_token->{type} = {          $self->{t}->{type} = {
686              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
687              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
688              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
689              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
690          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
691          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
692          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
693          return $current_token;          return $self->{t};
694          #redo A;          #redo A;
695        } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (        } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
696          ## TODO: Should we consider matches of "(" and ")"?          ## TODO: Should we consider matches of "(" and ")"?
697          $current_token->{type} = {          $self->{t}->{type} = {
698              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
699              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
700              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
701              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
702          }->{$current_token->{type}};          }->{$self->{t}->{type}};
703          $self->{state} = URI_UNQUOTED_STATE;          $self->{state} = URI_UNQUOTED_STATE;
704          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
705          redo A;          redo A;
# Line 574  sub get_next_token ($) { Line 710  sub get_next_token ($) {
710        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
711          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
712          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
713          return $current_token;          return $self->{t};
714          #redo A;          #redo A;
715        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
716          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
717          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
718          redo A;          redo A;
719        } else {        } else {
720          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
721          $self->{state} = URI_UNQUOTED_STATE;          $self->{state} = URI_UNQUOTED_STATE;
722          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
723          redo A;          redo A;
# Line 598  sub get_next_token ($) { Line 734  sub get_next_token ($) {
734          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
735          redo A;          redo A;
736        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
737          $current_token->{type} = {          $self->{t}->{type} = {
738              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
739              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
740              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
741              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
742          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
743          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
744          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
745          return $current_token;          return $self->{t};
746          #redo A;          #redo A;
747        } elsif ($self->{c} < 0x0020 or {        } elsif ($self->{c} < 0x0020 or {
748            0x0022 => 1, # "            0x0022 => 1, # "
# Line 614  sub get_next_token ($) { Line 750  sub get_next_token ($) {
750            0x0028 => 1, # (            0x0028 => 1, # (
751        }->{$self->{c}}) { # C0 or (        }->{$self->{c}}) { # C0 or (
752          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
753          $current_token->{type} = {          $self->{t}->{type} = {
754              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
755              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
756              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
757              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
758          }->{$current_token->{type}};          }->{$self->{t}->{type}};
759          # stay in the state.          # stay in the state.
760          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
761          redo A;          redo A;
762        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
763          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
764          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
765          return $current_token;          return $self->{t};
766          #redo A;          #redo A;
767        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
768          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
769          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
770          redo A;          redo A;
771        } else {        } else {
772          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
773          # stay in the state.          # stay in the state.
774          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
775          redo A;          redo A;
# Line 650  sub get_next_token ($) { Line 786  sub get_next_token ($) {
786          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
787          redo A;          redo A;
788        } elsif ($self->{c} == -1) {        } elsif ($self->{c} == -1) {
789          $current_token->{type} = {          $self->{t}->{type} = {
790              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
791              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
792              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
793              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
794          }->{$current_token->{type}};                  }->{$self->{t}->{type}};        
795          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
796          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
797          return $current_token;          return $self->{t};
798          #redo A;          #redo A;
799        } elsif ($self->{c} == 0x0029) { # )        } elsif ($self->{c} == 0x0029) { # )
800          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
801          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
802          return $current_token;          return $self->{t};
803          #redo A;          #redo A;
804        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
805          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;          $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
# Line 671  sub get_next_token ($) { Line 807  sub get_next_token ($) {
807          redo A;          redo A;
808        } else {        } else {
809          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?          ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
810          $current_token->{type} = {          $self->{t}->{type} = {
811              URI_TOKEN, URI_INVALID_TOKEN,              URI_TOKEN, URI_INVALID_TOKEN,
812              URI_INVALID_TOKEN, URI_INVALID_TOKEN,              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815          }->{$current_token->{type}};          }->{$self->{t}->{type}};
816          # stay in the state.          # stay in the state.
817          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
818          redo A;          redo A;
819        }        }
820      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
821        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
822        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
823          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
824          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 697  sub get_next_token ($) { Line 833  sub get_next_token ($) {
833          redo A;          redo A;
834        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
835          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
836          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
837          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
838          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
839          redo A;          redo A;
840        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
841                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
842          if ($q == 0) {          if ($q == 0) {
843            ## NOTE: In |escape| in ... in |ident|.            #
           $self->{state} = BEFORE_TOKEN_STATE;  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};  
           return $current_token;  
           # reconsume  
           #redo A;  
844          } elsif ($q == 1) {          } elsif ($q == 1) {
845            ## NOTE: In |escape| in |URI|.            ## NOTE: In |escape| in |URI|.
846            $current_token->{type} = {            $self->{t}->{type} = {
847                URI_TOKEN, URI_INVALID_TOKEN,                URI_TOKEN, URI_INVALID_TOKEN,
848                URI_INVALID_TOKEN, URI_INVALID_TOKEN,                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
849                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
850                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
851            }->{$current_token->{type}};            }->{$self->{t}->{type}};
852            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
853            $self->{state} = URI_UNQUOTED_STATE;            $self->{state} = URI_UNQUOTED_STATE;
854            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
855            redo A;            redo A;
856          } else {          } else {
857            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= chr $self->{c};  
858            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
859            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
860            redo A;            redo A;
861          }          }
862        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
863          if ($q == 0) {          if ($q == 0) {
864            ## NOTE: In |escape| in ... in |ident|.            #
           $self->{state} = BEFORE_TOKEN_STATE;  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};  
           return $current_token;  
           # reconsume  
           #redo A;  
865          } elsif ($q == 1) {          } elsif ($q == 1) {
866            $current_token->{type} = {            ## NOTE: In |escape| in |URI|.
867              $self->{t}->{type} = {
868                URI_TOKEN, URI_INVALID_TOKEN,                URI_TOKEN, URI_INVALID_TOKEN,
869                URI_INVALID_TOKEN, URI_INVALID_TOKEN,                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
870                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872            }->{$current_token->{type}};            }->{$self->{t}->{type}};
873            $current_token->{value} .= "\x0D\x0A";            $self->{state} = ESCAPE_BEFORE_LF_STATE;
           $self->{state} = URI_UNQUOTED_STATE;  
874            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
875            redo A;            redo A;
876          } else {          } else {
877            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $current_token->{value} .= "\x0D\x0A";  
878            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
879            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
880            redo A;            redo A;
881          }          }
882          } elsif ($self->{c} == -1) {
883            #
884        } else {        } else {
885          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
886          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
887          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
888              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
889          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
890          redo A;          redo A;
891        }        }
892    
893          if ($q == 0) {
894            if ($self->{t}->{type} == DIMENSION_TOKEN) {
895              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
896                $self->{state} = BEFORE_TOKEN_STATE;
897                # reprocess
898                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899                unshift @{$self->{token}}, {type => MINUS_TOKEN};
900                $self->{t}->{type} = NUMBER_TOKEN;
901                $self->{t}->{value} = '';
902                return $self->{t};
903                #redo A;
904              } elsif (length $self->{t}->{value}) {
905                $self->{state} = BEFORE_TOKEN_STATE;
906                # reprocess
907                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908                return $self->{t};
909                #redo A;
910              } else {
911                $self->{state} = BEFORE_TOKEN_STATE;
912                # reprocess
913                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914                $self->{t}->{type} = NUMBER_TOKEN;
915                $self->{t}->{value} = '';
916                return $self->{t};
917                #redo A;
918              }
919            } else {
920              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
921                $self->{state} = BEFORE_TOKEN_STATE;
922                # reprocess
923                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924                return {type => MINUS_TOKEN};
925                #redo A;
926              } elsif (length $self->{t}->{value}) {
927                $self->{state} = BEFORE_TOKEN_STATE;
928                # reprocess
929                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
930                return $self->{t};
931                #redo A;
932              } else {
933                $self->{state} = BEFORE_TOKEN_STATE;
934                # reprocess
935                return {type => DELIM_TOKEN, value => '\\'};
936                #redo A;
937              }
938            }
939          } elsif ($q == 1) {
940            $self->{state} = URI_UNQUOTED_STATE;
941            $self->{c} = $self->{get_char}->();
942            redo A;
943          } else {
944            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
945            $self->{t}->{type} = {
946              STRING_TOKEN, INVALID_TOKEN,
947              URI_TOKEN, URI_INVALID_TOKEN,
948              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
949            }->{$self->{t}->{type}} || $self->{t}->{type};
950            $self->{state} = BEFORE_TOKEN_STATE;
951            # reprocess
952            return $self->{t};
953            #redo A;
954          }
955      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
956        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
957        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
# Line 776  sub get_next_token ($) { Line 965  sub get_next_token ($) {
965          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
966          redo A;          redo A;
967        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
968          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
969          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
970          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
971          redo A;          redo A;
# Line 784  sub get_next_token ($) { Line 973  sub get_next_token ($) {
973                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
974                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
975                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
976          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
977          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
978              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
979          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 794  sub get_next_token ($) { Line 983  sub get_next_token ($) {
983          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
984          redo A;          redo A;
985        } else {        } else {
986          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
987          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
988              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989          # reconsume          # reconsume
# Line 806  sub get_next_token ($) { Line 995  sub get_next_token ($) {
995            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
996            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
997            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
998          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
999          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1000              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1001          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 816  sub get_next_token ($) { Line 1005  sub get_next_token ($) {
1005          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1006          redo A;          redo A;
1007        } else {        } else {
1008          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
1009          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1010              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1011          # reconsume          # reconsume
1012          redo A;          redo A;
1013        }        }
1014      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
         $current_token->{value} .= chr $char;  
1017          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1018              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1020          redo A;          redo A;
1021        } else {        } else {
         $current_token->{value} .= chr $char;  
1022          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1023              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1024          # reconsume          # reprocess
1025          redo A;          redo A;
1026        }        }
1027      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
# Line 847  sub get_next_token ($) { Line 1034  sub get_next_token ($) {
1034          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1035          redo A;          redo A;
1036        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1037          if ($current_token->{type} == STRING_TOKEN) {          if ($self->{t}->{type} == STRING_TOKEN) {
1038            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
1039            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
1040            return $current_token;            return $self->{t};
1041            #redo A;            #redo A;
1042          } else {          } else {
1043            $self->{state} = URI_AFTER_WSP_STATE;            $self->{state} = URI_AFTER_WSP_STATE;
# Line 861  sub get_next_token ($) { Line 1048  sub get_next_token ($) {
1048                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1049                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1050                 $self->{c} == -1) {                 $self->{c} == -1) {
1051          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1052              STRING_TOKEN, INVALID_TOKEN,
1053              INVALID_TOKEN, INVALID_TOKEN,
1054              URI_TOKEN, URI_INVALID_TOKEN,
1055              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1056              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1057              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1058            }->{$self->{t}->{type}};
1059          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1060          # reconsume          # reconsume
1061          return $current_token;          return $self->{t};
1062          #redo A;          #redo A;
1063        } else {        } else {
1064          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1065          # stay in the state          # stay in the state
1066          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1067          redo A;          redo A;
# Line 875  sub get_next_token ($) { Line 1069  sub get_next_token ($) {
1069      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1070        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1071        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1072          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1073          # stay in the state          # stay in the state
1074          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1075          redo A;          redo A;
# Line 884  sub get_next_token ($) { Line 1078  sub get_next_token ($) {
1078          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1079          redo A;          redo A;
1080        } else {        } else {
1081          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1082          $current_token->{value} = '';          $self->{t}->{value} = '';
1083          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1084          # reprocess          # reprocess
1085          redo A;          redo A;
# Line 893  sub get_next_token ($) { Line 1087  sub get_next_token ($) {
1087      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1088        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1089        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1090          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1091          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1092          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1093          redo A;          redo A;
1094        } else {        } else {
1095          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1096          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1097          $current_token->{value} = '';          $self->{t}->{value} = '';
1098          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1099          # reprocess          # reprocess
1100          return $current_token;          return $self->{t};
1101          #redo A;          #redo A;
1102        }        }
1103      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1104        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1105        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1106          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1107          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1108          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1109          redo A;          redo A;
1110        } else {        } else {
1111          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1112          $self->{c} = $self->{get_char}->();          # reprocess
1113          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN};
1114          #redo A;          #redo A;
1115        }        }
1116      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1117        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1118        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1119          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1120          # stay in the state          # stay in the state
1121          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1122          redo A;          redo A;
1123        } else {        } else {
1124          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1125          $current_token->{value} = '';          $self->{t}->{value} = '';
1126          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1127          # reprocess          # reprocess
1128          redo A;          redo A;
# Line 937  sub get_next_token ($) { Line 1131  sub get_next_token ($) {
1131        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1132      }      }
1133    } # A    } # A
1134    } # get_next_token
1135    
1136    ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  =head1 LICENSE
1137    
1138  } # get_next_token  Copyright 2007 Wakaba <w@suika.fam.cx>
1139    
1140    This library is free software; you can redistribute it
1141    and/or modify it under the same terms as Perl itself.
1142    
1143    =cut
1144    
1145  1;  1;
1146  # $Date$  # $Date$

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.16

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24