/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Fri Aug 17 11:53:52 2007 UTC | revision 1.13 by wakaba, Sat Sep 8 17:43:41 2007 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    
4    sub BEFORE_TOKEN_STATE () { 0 }
5    sub BEFORE_NMSTART_STATE () { 1 }
6    sub NAME_STATE () { 2 }
7    sub ESCAPE_OPEN_STATE () { 3 }
8    sub STRING_STATE () { 4 }
9    sub HASH_OPEN_STATE () { 5 }
10    sub NUMBER_STATE () { 6 }
11    sub NUMBER_FRACTION_STATE () { 7 }
12    sub AFTER_NUMBER_STATE () { 8 }
13    sub URI_BEFORE_WSP_STATE () { 9 }
14    sub ESCAPE_STATE () { 10 }
15    sub ESCAPE_BEFORE_LF_STATE () { 11 }
16    sub ESCAPE_BEFORE_NL_STATE () { 12 }
17    sub NUMBER_DOT_STATE () { 13 }
18    sub NUMBER_DOT_NUMBER_STATE () { 14 }
19    sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
25    sub IDENT_TOKEN () { 1 }
26    sub ATKEYWORD_TOKEN () { 2 }
27    sub HASH_TOKEN () { 3 }
28    sub FUNCTION_TOKEN () { 4 }
29    sub URI_TOKEN () { 5 }
30    sub URI_INVALID_TOKEN () { 6 }
31    sub URI_PREFIX_TOKEN () { 7 }
32    sub URI_PREFIX_INVALID_TOKEN () { 8 }
33    sub STRING_TOKEN () { 9 }
34    sub INVALID_TOKEN () { 10 }
35    sub NUMBER_TOKEN () { 11 }
36    sub DIMENSION_TOKEN () { 12 }
37    sub PERCENTAGE_TOKEN () { 13 }
38    sub UNICODE_RANGE_TOKEN () { 14 }
39    sub DELIM_TOKEN () { 16 }
40    sub PLUS_TOKEN () { 17 }
41    sub GREATER_TOKEN () { 18 }
42    sub COMMA_TOKEN () { 19 }
43    sub TILDE_TOKEN () { 20 }
44    sub DASHMATCH_TOKEN () { 21 }
45    sub PREFIXMATCH_TOKEN () { 22 }
46    sub SUFFIXMATCH_TOKEN () { 23 }
47    sub SUBSTRINGMATCH_TOKEN () { 24 }
48    sub INCLUDES_TOKEN () { 25 }
49    sub SEMICOLON_TOKEN () { 26 }
50    sub LBRACE_TOKEN () { 27 }
51    sub RBRACE_TOKEN () { 28 }
52    sub LPAREN_TOKEN () { 29 }
53    sub RPAREN_TOKEN () { 30 }
54    sub LBRACKET_TOKEN () { 31 }
55    sub RBRACKET_TOKEN () { 32 }
56    sub S_TOKEN () { 33 }
57    sub CDO_TOKEN () { 34 }
58    sub CDC_TOKEN () { 35 }
59    sub COMMENT_TOKEN () { 36 }
60    sub COMMENT_INVALID_TOKEN () { 37 }
61    sub EOF_TOKEN () { 38 }
62    sub MINUS_TOKEN () { 39 }
63    sub STAR_TOKEN () { 40 }
64    sub VBAR_TOKEN () { 41 }
65    sub DOT_TOKEN () { 42 }
66    sub COLON_TOKEN () { 43 }
67    sub MATCH_TOKEN () { 44 }
68    sub EXCLAMATION_TOKEN () { 45 }
69    
70    our @TokenName = qw(
71      0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
72      STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
73      0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
74      PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
75      LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
76      COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
77    );
78    
79  sub new ($) {  sub new ($) {
80    my $self = bless {token => []}, shift;    my $self = bless {token => [], get_char => sub { -1 },
81                        onerror => sub { }}, shift;
82    return $self;    return $self;
83  } # new  } # new
84    
# Line 10  sub init ($) { Line 86  sub init ($) {
86    my $self = shift;    my $self = shift;
87    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
88    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
89      #$self->{t} = {type => token-type, value => value, number => number};
90  } # init  } # init
91    
92  sub get_next_token ($) {  sub get_next_token ($) {
# Line 18  sub get_next_token ($) { Line 95  sub get_next_token ($) {
95      return shift @{$self->{token}};      return shift @{$self->{token}};
96    }    }
97    
   my $current_token;  
98    my $char;    my $char;
99    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
100    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
101    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
102          ## NOTE:
103          ##   0: in |ident|.
104          ##   1: in |URI| outside of |string|.
105          ##   0x0022: in |string1| or |invalid1|.
106          ##   0x0027: in |string2| or |invalid2|.
107    
108    A: {    A: {
109      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
110        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
111          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
112          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
113          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
114          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
115          redo A;          redo A;
116        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
117                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
118            $self->{c} = $self->{get_char}->();
119            if ($self->{c} == 0x002B) { # +
120              $self->{c} = $self->{get_char}->();
121              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
122                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
123                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
124                  $self->{c} == 0x003F) { # ?
125                $self->{t}->{value} = chr $self->{c};
126                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
127                $self->{c} = $self->{get_char}->();
128                C: for (2..6) {
129                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
130                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
131                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
132                      $self->{c} == 0x003F) { # ?
133                    $self->{t}->{value} .= chr $self->{c};
134                    $self->{c} = $self->{get_char}->();
135                  } else {
136                    last C;
137                  }
138                } # C
139    
140                if ($self->{c} == 0x002D) { # -
141                  $self->{c} = $self->{get_char}->();
142                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
145                    $self->{t}->{value} .= '-' . chr $self->{c};
146                    $self->{c} = $self->{get_char}->();
147                    C: for (2..6) {
148                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
149                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
150                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
151                        $self->{t}->{value} .= chr $self->{c};
152                        $self->{c} = $self->{get_char}->();
153                      } else {
154                        last C;
155                      }
156                    } # C
157                    
158                    #
159                  } else {
160                    my $token = $self->{t};
161                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
162                    $self->{state} = BEFORE_NMSTART_STATE;
163                    # reprocess
164                    return $token;
165                    #redo A;
166                  }
167                }
168    
169                $self->{state} = BEFORE_TOKEN_STATE;
170                # reprocess
171                return $self->{t};
172                #redo A;
173              } else {
174                unshift @{$self->{token}}, {type => PLUS_TOKEN};
175                $self->{state} = BEFORE_TOKEN_STATE;
176                # reprocess
177                return $self->{t};
178                #redo A;
179              }
180            } else {
181              $self->{state} = NAME_STATE;
182              # reprocess
183              redo A;
184            }
185          } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
186                   (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
187                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
188                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
189          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
190          $current_token = {type => IDENT_TOKEN, value => chr $self->{char}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
191          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
192          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
193          redo A;          redo A;
194        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
195          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
196          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
197          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
198          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
199          redo A;          redo A;
200        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
201          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
202          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
203          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0022) { # "  
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 1;  
204          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
205          redo A;          redo A;
206        } elsif ($self->{c} == 0x0027) { # '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
207          ## NOTE: |'| in |string2| in |string| in |STRING|, or          $self->{t} = {type => STRING_TOKEN, value => ''};
208          ## |'| in |invalid2| in |invalid| in |INVALID|.          $self->{state} = STRING_STATE; $q = $self->{c};
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
209          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
210          redo A;          redo A;
211        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
212          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
213          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
214          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
215          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
216          redo A;          redo A;
217        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
218          ## NOTE: |num|.          ## NOTE: |num|.
219          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
220          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
221          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
222          redo A;          redo A;
223        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
224          ## NOTE: |num|.          ## NOTE: |num|.
225          $current_token = {type => NUMBER_TOKEN, value => '.'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
226          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
227          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
228          redo A;          redo A;
229          } elsif ($self->{c} == 0x002F) { # /
230            $self->{c} = $self->{get_char}->();
231            if ($self->{c} == 0x002A) { # *
232              C: {
233                $self->{c} = $self->{get_char}->();
234                if ($self->{c} == 0x002A) { # *
235                  D: {
236                    $self->{c} = $self->{get_char}->();
237                    if ($self->{c} == 0x002F) { # /
238                      #
239                    } elsif ($self->{c} == 0x002A) { # *
240                      redo D;
241                    } else {
242                      redo C;
243                    }
244                  } # D
245                } elsif ($self->{c} == -1) {
246                  # stay in the state
247                  # reprocess
248                  return {type => COMMENT_INVALID_TOKEN};
249                  #redo A;
250                } else {
251                  redo C;
252                }
253              } # C
254    
255              # stay in the state.
256              $self->{c} = $self->{get_char}->();
257              redo A;
258            } else {
259              # stay in the state.
260              # reprocess
261              return {type => DELIM_TOKEN, value => '/'};
262              #redo A;
263            }        
264        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
265          ## NOTE: |CDO|          ## NOTE: |CDO|
266          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
267          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
268            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
269            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
270              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->();
271              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
272                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
273                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->();
274                return {type => CDO_TOKEN};                return {type => CDO_TOKEN};
275                #redo A;                #redo A;
276              } else {              } else {
277                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
278                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
279                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
280                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
281                #reprocess                #reprocess
282                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
283                #redo A;                #redo A;
284              }              }
285            } else {            } else {
286              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
287              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
288              #reprocess              #reprocess
289              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<'};
# Line 119  sub get_next_token ($) { Line 295  sub get_next_token ($) {
295            return {type => DELIM_TOKEN, value => '<'};            return {type => DELIM_TOKEN, value => '<'};
296            #redo A;            #redo A;
297          }          }
298        } elsif ({        } elsif (my $t = {
299                  0x003B => 1, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
300                  0x007B => 1, # {                          0x002D => MINUS_TOKEN, # -
301                  0x007D => 1, # }                          0x002E => DOT_TOKEN, # .
302                  0x0028 => 1, # (                          0x003A => COLON_TOKEN, # :
303                  0x0029 => 1, # )                          0x003B => SEMICOLON_TOKEN, # ;
304                  0x005B => 1, # [                          0x003D => MATCH_TOKEN, # =
305                  0x005D => 1, # ]                          0x007B => LBRACE_TOKEN, # {
306                            0x007D => RBRACE_TOKEN, # }
307                            0x0028 => LPAREN_TOKEN, # (
308                            0x0029 => RPAREN_TOKEN, # )
309                            0x005B => LBRACKET_TOKEN, # [
310                            0x005D => RBRACKET_TOKEN, # ]
311                 }->{$self->{c}}) {                 }->{$self->{c}}) {
312          # stay in the state          # stay in the state
313          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
314          return {type => chr $self->{c}};          return {type => $t};
315          # redo A;          # redo A;
316        } elsif ({        } elsif ({
317                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 172  sub get_next_token ($) { Line 353  sub get_next_token ($) {
353                          0x0024 => SUFFIXMATCH_TOKEN, # $                          0x0024 => SUFFIXMATCH_TOKEN, # $
354                          0x002A => SUBSTRINGMATCH_TOKEN, # *                          0x002A => SUBSTRINGMATCH_TOKEN, # *
355                         }->{$self->{c}}) {                         }->{$self->{c}}) {
356            my $c = $self->{c};
357          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
358          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
359            # stay in the state            # stay in the state
360            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
361            return {type => $v};            return {type => $v};
362            #redo A;            #redo A;
363            } elsif ($v = {
364                           0x002A => STAR_TOKEN, # *
365                           0x007C => VBAR_TOKEN, # |
366                          }->{$c}) {
367              # stay in the state.
368              # reprocess
369              return {type => $v};
370              #redo A;
371          } else {          } else {
372            # stay in the state            # stay in the state
373            # reprocess            # reprocess
374            return {type => DELIM_TOKEN, value => chr $self->{c}};            return {type => DELIM_TOKEN, value => chr $c};
375            #redo A;            #redo A;
376          }          }
377        } elsif ($self->{c} == 0x002B) { # +        } elsif ($self->{c} == 0x002B) { # +
# Line 219  sub get_next_token ($) { Line 409  sub get_next_token ($) {
409          #redo A;          #redo A;
410        } else {        } else {
411          # stay in the state          # stay in the state
412          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
413          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
414          return $current_token;          return $self->{t};
415          #redo A;          #redo A;
416        }        }
417      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
418        ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
419        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        ## |FUNCTION|)
420            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
421              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
422            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
423            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
424          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
425            $self->{t}->{type} = DIMENSION_TOKEN
426                if $self->{t}->{type} == NUMBER_TOKEN;
427          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
428          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
429          redo A;          redo A;
# Line 238  sub get_next_token ($) { Line 431  sub get_next_token ($) {
431          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
432          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
433          redo A;          redo A;
434        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D) { # -
435                 $current_token->{type} == IDENT_TOKEN) {          if ($self->{t}->{type} == IDENT_TOKEN) {
436              $self->{c} = $self->{get_char}->();
437              if ($self->{c} == 0x003E) { # >
438                $self->{state} = BEFORE_TOKEN_STATE;
439                $self->{c} = $self->{get_char}->();
440                return {type => CDC_TOKEN};
441                #redo A;
442              } else {
443                ## NOTE: |-|, |-|, $self->{c}
444                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
445                # stay in the state
446                # reconsume
447                return {type => MINUS_TOKEN};
448                #redo A;
449              }
450            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
451              $self->{c} = $self->{get_char}->();
452              if ($self->{c} == 0x003E) { # >
453                unshift @{$self->{token}}, {type => CDC_TOKEN};
454                $self->{t}->{type} = NUMBER_TOKEN;
455                $self->{t}->{value} = '';
456                $self->{state} = BEFORE_TOKEN_STATE;
457                $self->{c} = $self->{get_char}->();
458                return $self->{t};
459                #redo A;
460              } else {
461                ## NOTE: |-|, |-|, $self->{c}
462                my $t = $self->{t};
463                $t->{type} = NUMBER_TOKEN;
464                $t->{value} = '';
465                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
466                unshift @{$self->{token}}, {type => MINUS_TOKEN};
467                # stay in the state
468                # reconsume
469                return $t;
470                #redo A;
471              }
472            } else {
473              #
474            }
475          } else {
476            #
477          }
478          
479          if ($self->{t}->{type} == DIMENSION_TOKEN) {
480            ## NOTE: |-| after |NUMBER|.
481            unshift @{$self->{token}}, {type => MINUS_TOKEN};
482            $self->{state} = BEFORE_TOKEN_STATE;
483            # reprocess
484            $self->{t}->{type} = NUMBER_TOKEN;
485            $self->{t}->{value} = '';
486            return $self->{t};
487          } else {
488            ## NOTE: |-| not followed by |nmstart|.
489            $self->{state} = BEFORE_TOKEN_STATE;
490            # reprocess
491            return {type => MINUS_TOKEN};
492          }
493        } elsif ($self->{state} == AFTER_AT_STATE) {
494          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
495              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
496              $self->{c} == 0x005F or # _
497              $self->{c} > 0x007F) { # nonascii
498            $self->{t}->{value} .= chr $self->{c};
499            $self->{state} = NAME_STATE;
500            $self->{c} = $self->{get_char}->();
501            redo A;
502          } elsif ($self->{c} == 0x002D) { # -
503            $self->{t}->{value} .= '-';
504            $self->{state} = AFTER_AT_HYPHEN_STATE;
505            $self->{c} = $self->{get_char}->();
506            redo A;
507          } elsif ($self->{c} == 0x005C) { # \
508            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
509            $self->{c} = $self->{get_char}->();
510            redo A;
511          } else {
512            $self->{state} = BEFORE_TOKEN_STATE;
513            # reprocess
514            return {type => DELIM_TOKEN, value => '@'};
515          }
516        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
517          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
518              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
519              $self->{c} == 0x005F or # _
520              $self->{c} > 0x007F) { # nonascii
521            $self->{t}->{value} .= chr $self->{c};
522            $self->{state} = NAME_STATE;
523            $self->{c} = $self->{get_char}->();
524            redo A;
525          } elsif ($self->{c} == 0x002D) { # -
526          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
527          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
528              unshift @{$self->{token}}, {type => CDC_TOKEN};
529            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
530            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
531            return {type => CDC_TOKEN};            return {type => DELIM_TOKEN, value => '@'};
532            #redo A;            #redo A;
533          } else {          } else {
534            ## NOTE: |-|, |-|, $self->{c}            unshift @{$self->{token}}, {type => MINUS_TOKEN};
535            #$current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
536            # stay in the state            $self->{state} = BEFORE_NMSTART_STATE;
537            # reconsume            # reprocess
538            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '@'};
539            #redo A;            #redo A;
540          }          }
541          } elsif ($self->{c} == 0x005C) { # \
542            ## TODO: @-\{nl}
543            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
544            $self->{c} = $self->{get_char}->();
545            redo A;
546        } else {        } else {
547          if ($current_token->{type} == NUMBER_TOKEN) {          unshift @{$self->{token}}, {type => MINUS_TOKEN};
548            ## NOTE: |-| after |num|.          $self->{state} = BEFORE_TOKEN_STATE;
549            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          # reprocess
550            $self->{state} = BEFORE_TOKEN_STATE;          return {type => DELIM_TOKEN, value => '@'};
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } else {  
           ## NOTE: |-| not followed by |nmstart|.  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return {type => DELIM_TOKEN, value => '-'};  
         }  
551        }        }
552      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
553        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
554          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
555          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
556            $self->{t}->{value} = '-';
557            $self->{t}->{type} = DIMENSION_TOKEN;
558          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
559          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
560          redo A;          redo A;
561        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
562                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
563                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
564                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
565          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
566          $current_token->{value} = chr $self->{char};          $self->{t}->{value} = chr $self->{c};
567            $self->{t}->{type} = DIMENSION_TOKEN;
568          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
569          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
570          redo A;          redo A;
571        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
572          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
573          $current_token->{value} = '';          $self->{t}->{value} = '';
574            $self->{t}->{type} = DIMENSION_TOKEN;
575          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
576          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
577          redo A;          redo A;
578        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
579          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
580          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
581          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
582          return $current_token;          return $self->{t};
583          #redo A;          #redo A;
584        } else {        } else {
585          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
586          # reprocess          # reprocess
587          return $current_token;          return $self->{t};
588          #redo A;          #redo A;
589        }        }
590      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
591        ## NOTE: The first |nmchar| in |name| in |HASH|.        ## NOTE: The first |nmchar| in |name| in |HASH|.
592        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
593            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
594            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
595            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
596            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
597            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
598          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
599          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
600          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
601          redo A;          redo A;
# Line 326  sub get_next_token ($) { Line 605  sub get_next_token ($) {
605          redo A;          redo A;
606        } else {        } else {
607          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
608          $self->{c} = $self->{get_char}->();          # reprocess
609          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#'};
610          #redo A;          #redo A;
611        }        }
612      } elsif ($self->{state} == NAME_STATE) {      } elsif ($self->{state} == NAME_STATE) {
613        ## NOTE: |nmchar| in (|ident| or |name|).        ## NOTE: |nmchar| in (|ident| or |name|).
614        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
615            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
616            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
617            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
618            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
619            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
620          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
621          # stay in the state          # stay in the state
622          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
623          redo A;          redo A;
624        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
625          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
626          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
627          redo A;          redo A;
628        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
629                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
630          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
631              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
632               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
633              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
634            $current_token->{type} = URI_TOKEN;              ## TODO: warn
635              }
636              $self->{t}->{type}
637                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
638              $self->{t}->{value} = '';
639            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
640            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
641            redo A;            redo A;
642          } else {          } else {
643            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
644            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
645            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
646            return $current_token;            return $self->{t};
647            #redo A;            #redo A;
648          }          }
649        } else {        } else {
650          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
651          # reconsume          # reconsume
652          return $current_token;          return $self->{t};
653            #redo A;
654          }
655        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
656          while ({
657                    0x0020 => 1, # SP
658                    0x0009 => 1, # \t
659                    0x000D => 1, # \r
660                    0x000A => 1, # \n
661                    0x000C => 1, # \f
662                 }->{$self->{c}}) {
663            $self->{c} = $self->{get_char}->();
664          }
665          if ($self->{c} == -1) {
666            $self->{t}->{type} = {
667                URI_TOKEN, URI_INVALID_TOKEN,
668                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
669                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
670                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
671            }->{$self->{t}->{type}};        
672            $self->{state} = BEFORE_TOKEN_STATE;
673            $self->{c} = $self->{get_char}->();
674            return $self->{t};
675            #redo A;
676          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
677            ## TODO: Should we consider matches of "(" and ")"?
678            $self->{t}->{type} = {
679                URI_TOKEN, URI_INVALID_TOKEN,
680                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
681                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
682                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
683            }->{$self->{t}->{type}};
684            $self->{state} = URI_UNQUOTED_STATE;
685            $self->{c} = $self->{get_char}->();
686            redo A;
687          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
688            $self->{state} = STRING_STATE; $q = $self->{c};
689            $self->{c} = $self->{get_char}->();
690            redo A;
691          } elsif ($self->{c} == 0x0029) { # )
692            $self->{state} = BEFORE_TOKEN_STATE;
693            $self->{c} = $self->{get_char}->();
694            return $self->{t};
695            #redo A;
696          } elsif ($self->{c} == 0x005C) { # \
697            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
698            $self->{c} = $self->{get_char}->();
699            redo A;
700          } else {
701            $self->{t}->{value} .= chr $self->{c};
702            $self->{state} = URI_UNQUOTED_STATE;
703            $self->{c} = $self->{get_char}->();
704            redo A;
705          }
706        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
707          if ({
708               0x0020 => 1, # SP
709               0x0009 => 1, # \t
710               0x000D => 1, # \r
711               0x000A => 1, # \n
712               0x000C => 1, # \f
713              }->{$self->{c}}) {
714            $self->{state} = URI_AFTER_WSP_STATE;
715            $self->{c} = $self->{get_char}->();
716            redo A;
717          } elsif ($self->{c} == -1) {
718            $self->{t}->{type} = {
719                URI_TOKEN, URI_INVALID_TOKEN,
720                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
721                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
722                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
723            }->{$self->{t}->{type}};        
724            $self->{state} = BEFORE_TOKEN_STATE;
725            $self->{c} = $self->{get_char}->();
726            return $self->{t};
727            #redo A;
728          } elsif ($self->{c} < 0x0020 or {
729              0x0022 => 1, # "
730              0x0027 => 1, # '
731              0x0028 => 1, # (
732          }->{$self->{c}}) { # C0 or (
733            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
734            $self->{t}->{type} = {
735                URI_TOKEN, URI_INVALID_TOKEN,
736                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
737                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
738                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
739            }->{$self->{t}->{type}};
740            # stay in the state.
741            $self->{c} = $self->{get_char}->();
742            redo A;
743          } elsif ($self->{c} == 0x0029) { # )
744            $self->{state} = BEFORE_TOKEN_STATE;
745            $self->{c} = $self->{get_char}->();
746            return $self->{t};
747          #redo A;          #redo A;
748          } elsif ($self->{c} == 0x005C) { # \
749            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
750            $self->{c} = $self->{get_char}->();
751            redo A;
752          } else {
753            $self->{t}->{value} .= chr $self->{c};
754            # stay in the state.
755            $self->{c} = $self->{get_char}->();
756            redo A;
757          }
758        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
759          if ({
760               0x0020 => 1, # SP
761               0x0009 => 1, # \t
762               0x000D => 1, # \r
763               0x000A => 1, # \n
764               0x000C => 1, # \f
765              }->{$self->{c}}) {
766            # stay in the state.
767            $self->{c} = $self->{get_char}->();
768            redo A;
769          } elsif ($self->{c} == -1) {
770            $self->{t}->{type} = {
771                URI_TOKEN, URI_INVALID_TOKEN,
772                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
773                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
774                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
775            }->{$self->{t}->{type}};        
776            $self->{state} = BEFORE_TOKEN_STATE;
777            $self->{c} = $self->{get_char}->();
778            return $self->{t};
779            #redo A;
780          } elsif ($self->{c} == 0x0029) { # )
781            $self->{state} = BEFORE_TOKEN_STATE;
782            $self->{c} = $self->{get_char}->();
783            return $self->{t};
784            #redo A;
785          } elsif ($self->{c} == 0x005C) { # \
786            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
787            $self->{c} = $self->{get_char}->();
788            redo A;
789          } else {
790            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
791            $self->{t}->{type} = {
792                URI_TOKEN, URI_INVALID_TOKEN,
793                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
794                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
795                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
796            }->{$self->{t}->{type}};
797            # stay in the state.
798            $self->{c} = $self->{get_char}->();
799            redo A;
800        }        }
801      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
802        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
803        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
804          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
805          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 392  sub get_next_token ($) { Line 812  sub get_next_token ($) {
812          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
813          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
814          redo A;          redo A;
815        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
816          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
817          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
818          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
819          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
820          redo A;          redo A;
821        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
822                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
823          if ($q == 0) {          if ($q == 0) {
824            ## NOTE: In |escape| in ... in |ident|.            #
825            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
826            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
827            return $current_token;            $self->{t}->{type} = {
828            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
829            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
830                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
831                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
832              }->{$self->{t}->{type}};
833              $self->{t}->{value} .= chr $self->{c};
834              $self->{state} = URI_UNQUOTED_STATE;
835              $self->{c} = $self->{get_char}->();
836              redo A;
837          } else {          } else {
838            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
839            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
840            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
841            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
842            redo A;            redo A;
843          }          }
844        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
845          if ($q == 0) {          if ($q == 0) {
846            ## NOTE: In |escape| in ... in |ident|.            #
847            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
848            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
849            return $current_token;            $self->{t}->{type} = {
850            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
851            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
852                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
853                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
854              }->{$self->{t}->{type}};
855              $self->{t}->{value} .= "\x0D";
856              $self->{state} = ESCAPE_BEFORE_LF_STATE;
857              $self->{c} = $self->{get_char}->();
858              redo A;
859          } else {          } else {
860            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
861            $current_token->{value} .= "\x0D\x0A";            $self->{t}->{value} .= "\x0D";
862            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
863            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
864            redo A;            redo A;
865          }          }
866          } elsif ($self->{c} == -1) {
867            #
868        } else {        } else {
869          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
870          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
871          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
872                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
873          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
874          redo A;          redo A;
875        }        }
876    
877          if ($q == 0) {
878            if ($self->{t}->{type} == DIMENSION_TOKEN) {
879              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
880                $self->{state} = BEFORE_TOKEN_STATE;
881                # reprocess
882                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
883                unshift @{$self->{token}}, {type => MINUS_TOKEN};
884                $self->{t}->{type} = NUMBER_TOKEN;
885                $self->{t}->{value} = '';
886                return $self->{t};
887                #redo A;
888              } elsif (length $self->{t}->{value}) {
889                $self->{state} = BEFORE_TOKEN_STATE;
890                # reprocess
891                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
892                return $self->{t};
893                #redo A;
894              } else {
895                $self->{state} = BEFORE_TOKEN_STATE;
896                # reprocess
897                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
898                $self->{t}->{type} = NUMBER_TOKEN;
899                $self->{t}->{value} = '';
900                return $self->{t};
901                #redo A;
902              }
903            } else {
904              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
905                $self->{state} = BEFORE_TOKEN_STATE;
906                # reprocess
907                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
908                return {type => MINUS_TOKEN};
909                #redo A;
910              } elsif (length $self->{t}->{value}) {
911                $self->{state} = BEFORE_TOKEN_STATE;
912                # reprocess
913                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
914                return $self->{t};
915                #redo A;
916              } else {
917                $self->{state} = BEFORE_TOKEN_STATE;
918                # reprocess
919                return {type => DELIM_TOKEN, value => '\\'};
920                #redo A;
921              }
922            }
923          } elsif ($q == 1) {
924            $self->{state} = URI_UNQUOTED_STATE;
925            $self->{c} = $self->{get_char}->();
926            redo A;
927          } else {
928            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
929            $self->{t}->{type} = {
930              STRING_TOKEN, INVALID_TOKEN,
931              URI_TOKEN, URI_INVALID_TOKEN,
932              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
933            }->{$self->{t}->{type}} || $self->{t}->{type};
934            $self->{state} = BEFORE_TOKEN_STATE;
935            # reprocess
936            return $self->{t};
937            #redo A;
938          }
939      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
940        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
941        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
# Line 448  sub get_next_token ($) { Line 948  sub get_next_token ($) {
948          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
949          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
950          redo A;          redo A;
951        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
952          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
953          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
954          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
955          redo A;          redo A;
# Line 457  sub get_next_token ($) { Line 957  sub get_next_token ($) {
957                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
958                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
959                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
960          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
961          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
962                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
963          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
964          redo A;          redo A;
965        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 466  sub get_next_token ($) { Line 967  sub get_next_token ($) {
967          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
968          redo A;          redo A;
969        } else {        } else {
970          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
971          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
972                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
973          # reconsume          # reconsume
974          redo A;          redo A;
975        }        }
# Line 477  sub get_next_token ($) { Line 979  sub get_next_token ($) {
979            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
980            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
981            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
982          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
983          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
984                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
985          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
986          redo A;          redo A;
987        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 486  sub get_next_token ($) { Line 989  sub get_next_token ($) {
989          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
990          redo A;          redo A;
991        } else {        } else {
992          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
993          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
994                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
995          # reconsume          # reconsume
996          redo A;          redo A;
997        }        }
998      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
999        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
1000        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
1001          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $self->{c};
1002          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
1003                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1004          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1005          redo A;          redo A;
1006        } else {        } else {
1007          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
1008          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1009          # reconsume          # reprocess
1010          redo A;          redo A;
1011        }        }
1012      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
1013        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
1014        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
1015        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
1016          ## Or, in |URI|.
1017        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
1018          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
1019          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1020          redo A;          redo A;
1021        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1022          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
1023          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
1024          return $current_token;            $self->{c} = $self->{get_char}->();
1025          #redo A;            return $self->{t};
1026              #redo A;
1027            } else {
1028              $self->{state} = URI_AFTER_WSP_STATE;
1029              $self->{c} = $self->{get_char}->();
1030              redo A;
1031            }
1032        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
1033                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1034                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1035                 $self->{c} == -1) {                 $self->{c} == -1) {
1036          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1037              STRING_TOKEN, INVALID_TOKEN,
1038              INVALID_TOKEN, INVALID_TOKEN,
1039              URI_TOKEN, URI_INVALID_TOKEN,
1040              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1041              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1042              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1043            }->{$self->{t}->{type}};
1044          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1045          # reconsume          # reconsume
1046          return $current_token;          return $self->{t};
1047          #redo A;          #redo A;
1048        } else {        } else {
1049          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1050          # stay in the state          # stay in the state
1051          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1052          redo A;          redo A;
# Line 535  sub get_next_token ($) { Line 1054  sub get_next_token ($) {
1054      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1055        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1056        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1057          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1058          # stay in the state          # stay in the state
1059          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1060          redo A;          redo A;
# Line 544  sub get_next_token ($) { Line 1063  sub get_next_token ($) {
1063          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1064          redo A;          redo A;
1065        } else {        } else {
1066          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
1067          $self->{value} = '';          $self->{t}->{value} = '';
1068          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1069          # reprocess          # reprocess
1070          return $current_token;          redo A;
         #redo A;  
1071        }        }
1072      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1073        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1074        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1075          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1076          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1077          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1078          redo A;          redo A;
1079        } else {        } else {
1080          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1081          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
1082          $self->{value} = '';          $self->{t}->{value} = '';
1083          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1084          # reprocess          # reprocess
1085          return $current_token;          return $self->{t};
1086          #redo A;          #redo A;
1087        }        }
1088      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1089        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1090        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1091          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1092          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1093          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1094          redo A;          redo A;
1095        } else {        } else {
1096          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1097          $self->{c} = $self->{get_char}->();          # reprocess
1098          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN};
1099          #redo A;          #redo A;
1100        }        }
1101      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1102        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1103        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1104          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1105          # stay in the state          # stay in the state
1106          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1107          redo A;          redo A;
1108        } else {        } else {
1109          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
1110          $self->{value} = '';          $self->{t}->{value} = '';
1111          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1112          # reprocess          # reprocess
1113          return $current_token;          redo A;
         #redo A;  
1114        }        }
1115      } else {      } else {
1116        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1117      }      }
1118    } # A    } # A
   
   ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  
   
1119  } # get_next_token  } # get_next_token
1120    
1121  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.13

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24