/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Fri Aug 17 11:53:52 2007 UTC | revision 1.5 by wakaba, Sat Sep 8 03:25:05 2007 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    
## Tokenizer state numbers (added in revision 1.5; right-hand column only).
## |init| stores BEFORE_TOKEN_STATE in |$self->{state}| and |get_next_token|
## selects its branch by comparing that field to these values with |==|, so
## every value here must stay distinct.  Each constant is a sub with an empty
## prototype, which lets the code below write the name as a bareword.
4    sub BEFORE_TOKEN_STATE () { 0 }
5    sub BEFORE_NMSTART_STATE () { 1 }
6    sub NAME_STATE () { 2 }
7    sub ESCAPE_OPEN_STATE () { 3 }
8    sub STRING_STATE () { 4 }
9    sub HASH_OPEN_STATE () { 5 }
10    sub NUMBER_STATE () { 6 }
11    sub NUMBER_FRACTION_STATE () { 7 }
12    sub AFTER_NUMBER_STATE () { 8 }
13    sub URI_BEFORE_WSP_STATE () { 9 }
14    sub ESCAPE_STATE () { 10 }
15    sub ESCAPE_BEFORE_LF_STATE () { 11 }
16    sub ESCAPE_BEFORE_NL_STATE () { 12 }
17    sub NUMBER_DOT_STATE () { 13 }
18    sub NUMBER_DOT_NUMBER_STATE () { 14 }
19    sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
## Token type numbers (added in revision 1.5; right-hand column only).
## These are the values placed in the |type| member of the token hashes that
## |get_next_token| builds and returns.  Numbering starts at 1 and runs to 38;
## |@TokenName| is indexed by these numbers, so the two lists must be kept in
## the same order.
## NOTE(review): the numeric ranges of the |*_STATE| and |*_TOKEN| families
## overlap, so using one in place of the other is not detected by Perl.  The
## |/| branch of |get_next_token| (source line 255) returns
## |type => DELIM_STATE| (15, numerically UNICODE_RANGE_INVALID_TOKEN) where
## DELIM_TOKEN (16) looks intended -- confirm and fix there.
25    sub IDENT_TOKEN () { 1 }
26    sub ATKEYWORD_TOKEN () { 2 }
27    sub HASH_TOKEN () { 3 }
28    sub FUNCTION_TOKEN () { 4 }
29    sub URI_TOKEN () { 5 }
30    sub URI_INVALID_TOKEN () { 6 }
31    sub URI_PREFIX_TOKEN () { 7 }
32    sub URI_PREFIX_INVALID_TOKEN () { 8 }
33    sub STRING_TOKEN () { 9 }
34    sub INVALID_TOKEN () { 10 }
35    sub NUMBER_TOKEN () { 11 }
36    sub DIMENSION_TOKEN () { 12 }
37    sub PERCENTAGE_TOKEN () { 13 }
38    sub UNICODE_RANGE_TOKEN () { 14 }
39    sub UNICODE_RANGE_INVALID_TOKEN () { 15 }
40    sub DELIM_TOKEN () { 16 }
41    sub PLUS_TOKEN () { 17 }
42    sub GREATER_TOKEN () { 18 }
43    sub COMMA_TOKEN () { 19 }
44    sub TILDE_TOKEN () { 20 }
45    sub DASHMATCH_TOKEN () { 21 }
46    sub PREFIXMATCH_TOKEN () { 22 }
47    sub SUFFIXMATCH_TOKEN () { 23 }
48    sub SUBSTRINGMATCH_TOKEN () { 24 }
49    sub INCLUDES_TOKEN () { 25 }
50    sub SEMICOLON_TOKEN () { 26 }
51    sub LBRACE_TOKEN () { 27 }
52    sub RBRACE_TOKEN () { 28 }
53    sub LPAREN_TOKEN () { 29 }
54    sub RPAREN_TOKEN () { 30 }
55    sub LBRACKET_TOKEN () { 31 }
56    sub RBRACKET_TOKEN () { 32 }
57    sub S_TOKEN () { 33 }
58    sub CDO_TOKEN () { 34 }
59    sub CDC_TOKEN () { 35 }
60    sub COMMENT_TOKEN () { 36 }
61    sub COMMENT_INVALID_TOKEN () { 37 }
62    sub EOF_TOKEN () { 38 }
63    
## Token names indexed by token type number (added in revision 1.5).
## The leading |0| entry is only a placeholder for index 0, because the
## |*_TOKEN| numbers start at 1: index 1 is IDENT (IDENT_TOKEN) and the last
## entry, index 38, is EOF (EOF_TOKEN).  Any token type added to the constant
## list must be added here at the same position.
## NOTE: Do not put |#| comments inside the |qw( )| list; they would become
## list elements and shift every following index.
64    our @TokenName = qw(
65      0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
66      STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
67      UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH
68      PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
69      LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
70      COMMENT_INVALID EOF
71    );
72    
## Constructor (side-by-side diff: revision 1.1 on the left, 1.5 on the right).
## Both revisions bless a hash holding an empty |token| array, which is the
## queue |get_next_token| shifts already-produced tokens from, into the class
## taken from the first argument, and return the new object.
## Revision 1.5 additionally installs two defaults so the object is usable
## before the caller supplies its own callbacks: |get_char| returns -1 and
## |onerror| does nothing.
## NOTE(review): -1 is the value |get_next_token| tests |$self->{c}| against
## when it reports an unterminated comment, so it presumably means "end of
## input" -- confirm against the contract of caller-supplied |get_char|.
73  sub new ($) {  sub new ($) {
74    my $self = bless {token => []}, shift;    my $self = bless {token => [], get_char => sub { -1 },
75                        onerror => sub { }}, shift;
76    return $self;    return $self;
77  } # new  } # new
78    
# Line 10  sub init ($) { Line 80  sub init ($) {
80    my $self = shift;    my $self = shift;
81    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
82    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
83      #$self->{t} = {type => token-type, value => value, number => number};
84  } # init  } # init
85    
86  sub get_next_token ($) {  sub get_next_token ($) {
# Line 18  sub get_next_token ($) { Line 89  sub get_next_token ($) {
89      return shift @{$self->{token}};      return shift @{$self->{token}};
90    }    }
91    
   my $current_token;  
92    my $char;    my $char;
93    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
94    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
95    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
96          ## NOTE:
97          ##   0: in |ident|.
98          ##   1: in |URI| outside of |string|.
99          ##   0x0022: in |string1| or |invalid1|.
100          ##   0x0027: in |string2| or |invalid2|.
101    
102    A: {    A: {
103      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
104        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
105          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
106          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-'};
107          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
108          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
109          redo A;          redo A;
110        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
111                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
112            $self->{c} = $self->{get_char}->();
113            if ($self->{c} == 0x002B) { # +
114              $self->{c} = $self->{get_char}->();
115              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
116                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
117                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
118                  $self->{c} == 0x003F) { # ?
119                $self->{t}->{value} .= '+' . chr $self->{c};
120                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
121                $self->{c} = $self->{get_char}->();
122                C: for (2..6) {
123                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
124                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
125                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
126                      $self->{c} == 0x003F) { # ?
127                    $self->{t}->{value} .= chr $self->{c};
128                    $self->{c} = $self->{get_char}->();
129                  } else {
130                    last C;
131                  }
132                } # C
133    
134                if ($self->{c} == 0x002D) { # -
135                  $self->{c} = $self->{get_char}->();
136                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
137                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
138                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
139                    $self->{t}->{value} .= '-' . chr $self->{c};
140                    $self->{c} = $self->{get_char}->();
141                    C: for (2..6) {
142                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
143                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
144                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
145                        $self->{t}->{value} .= chr $self->{c};
146                        $self->{c} = $self->{get_char}->();
147                      } else {
148                        last C;
149                      }
150                    } # C
151                    
152                    #
153                  } else {
154                    my $token = $self->{t};
155                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
156                    $self->{state} = BEFORE_NMSTART_STATE;
157                    # reprocess
158                    return $token;
159                    #redo A;
160                  }
161                }
162    
163                $self->{state} = BEFORE_TOKEN_STATE;
164                # reprocess
165                return $self->{t};
166                #redo A;
167              } else {
168                unshift @{$self->{token}}, {type => PLUS_TOKEN};
169                $self->{state} = BEFORE_TOKEN_STATE;
170                # reprocess
171                return $self->{t};
172                #redo A;
173              }
174            } else {
175              $self->{state} = NAME_STATE;
176              # reprocess
177              redo A;
178            }
179          } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
180                   (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
181                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
182                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
183          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
184          $current_token = {type => IDENT_TOKEN, value => chr $self->{char}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
185          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
186          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
187          redo A;          redo A;
188        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
189          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
190          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
191          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
192          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
193          redo A;          redo A;
194        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
195          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
196          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
197          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
198          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
199          redo A;          redo A;
200        } elsif ($self->{c} == 0x0022) { # "        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
201          ## NOTE: |"| in |string1| in |string| in |STRING|, or          $self->{t} = {type => STRING_TOKEN, value => ''};
202          ## |"| in |invalid1| in |invalid| in |INVALID|.          $self->{state} = STRING_STATE; $q = $self->{c};
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 1;  
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0027) { # '  
         ## NOTE: |'| in |string2| in |string| in |STRING|, or  
         ## |'| in |invalid2| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
203          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
204          redo A;          redo A;
205        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
206          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
207          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
208          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
209          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
210          redo A;          redo A;
211        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
212          ## NOTE: |num|.          ## NOTE: |num|.
213          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
214          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
215          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
216          redo A;          redo A;
217        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
218          ## NOTE: |num|.          ## NOTE: |num|.
219          $current_token = {type => NUMBER_TOKEN, value => '.'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
220          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
221          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
222          redo A;          redo A;
223          } elsif ($self->{c} == 0x002F) { # /
224            $self->{c} = $self->{get_char}->();
225            if ($self->{c} == 0x002A) { # *
226              C: {
227                $self->{c} = $self->{get_char}->();
228                if ($self->{c} == 0x002A) { # *
229                  D: {
230                    $self->{c} = $self->{get_char}->();
231                    if ($self->{c} == 0x002F) { # /
232                      #
233                    } elsif ($self->{c} == 0x002A) { # *
234                      redo D;
235                    } else {
236                      redo C;
237                    }
238                  } # D
239                } elsif ($self->{c} == -1) {
240                  # stay in the state
241                  # reprocess
242                  return {type => COMMENT_INVALID_TOKEN};
243                  #redo A;
244                } else {
245                  redo C;
246                }
247              } # C
248    
249              # stay in the state.
250              $self->{c} = $self->{get_char}->();
251              redo A;
252            } else {
253              # stay in the state.
254              # reprocess
255              return {type => DELIM_STATE, value => '/'};
256              #redo A;
257            }        
258        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
259          ## NOTE: |CDO|          ## NOTE: |CDO|
260          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 100  sub get_next_token ($) { Line 270  sub get_next_token ($) {
270              } else {              } else {
271                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
272                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
273                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
274                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
275                #reprocess                #reprocess
276                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
# Line 119  sub get_next_token ($) { Line 289  sub get_next_token ($) {
289            return {type => DELIM_TOKEN, value => '<'};            return {type => DELIM_TOKEN, value => '<'};
290            #redo A;            #redo A;
291          }          }
292        } elsif ({        } elsif (my $t = {
293                  0x003B => 1, # ;                  0x003B => SEMICOLON_TOKEN, # ;
294                  0x007B => 1, # {                  0x007B => LBRACE_TOKEN, # {
295                  0x007D => 1, # }                  0x007D => RBRACE_TOKEN, # }
296                  0x0028 => 1, # (                  0x0028 => LPAREN_TOKEN, # (
297                  0x0029 => 1, # )                  0x0029 => RPAREN_TOKEN, # )
298                  0x005B => 1, # [                  0x005B => LBRACKET_TOKEN, # [
299                  0x005D => 1, # ]                  0x005D => RBRACKET_TOKEN, # ]
300                 }->{$self->{c}}) {                 }->{$self->{c}}) {
301          # stay in the state          # stay in the state
302          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
303          return {type => chr $self->{c}};          return {type => $t};
304          # redo A;          # redo A;
305        } elsif ({        } elsif ({
306                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 172  sub get_next_token ($) { Line 342  sub get_next_token ($) {
342                          0x0024 => SUFFIXMATCH_TOKEN, # $                          0x0024 => SUFFIXMATCH_TOKEN, # $
343                          0x002A => SUBSTRINGMATCH_TOKEN, # *                          0x002A => SUBSTRINGMATCH_TOKEN, # *
344                         }->{$self->{c}}) {                         }->{$self->{c}}) {
345            my $c = $self->{c};
346          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
347          if ($self->{c} == 0x003D) { # =          if ($self->{c} == 0x003D) { # =
348            # stay in the state            # stay in the state
# Line 181  sub get_next_token ($) { Line 352  sub get_next_token ($) {
352          } else {          } else {
353            # stay in the state            # stay in the state
354            # reprocess            # reprocess
355            return {type => DELIM_TOKEN, value => chr $self->{c}};            return {type => DELIM_TOKEN, value => chr $c};
356            #redo A;            #redo A;
357          }          }
358        } elsif ($self->{c} == 0x002B) { # +        } elsif ($self->{c} == 0x002B) { # +
# Line 219  sub get_next_token ($) { Line 390  sub get_next_token ($) {
390          #redo A;          #redo A;
391        } else {        } else {
392          # stay in the state          # stay in the state
393          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
394          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
395          return $current_token;          return $self->{t};
396          #redo A;          #redo A;
397        }        }
398      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
399        ## NOTE: |nmstart| in |ident| in (|IDENT| or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
400        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        ## |FUNCTION|)
401            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
402              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
403            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
404            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
405          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
406            $self->{t}->{type} = DIMENSION_TOKEN
407                if $self->{t}->{type} == NUMBER_TOKEN;
408          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
409          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
410          redo A;          redo A;
411        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
412    ## TODO: 12-\X, 12-\{nl}
413          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
414          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
415          redo A;          redo A;
416        } elsif ($self->{c} == 0x002D and # -        } elsif ($self->{c} == 0x002D and # -
417                 $current_token->{type} == IDENT_TOKEN) {                 $self->{t}->{type} == IDENT_TOKEN) {
418          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
419          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
420            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
# Line 248  sub get_next_token ($) { Line 423  sub get_next_token ($) {
423            #redo A;            #redo A;
424          } else {          } else {
425            ## NOTE: |-|, |-|, $self->{c}            ## NOTE: |-|, |-|, $self->{c}
426            #$current_token = {type => IDENT_TOKEN, value => '-'};            #$self->{t} = {type => IDENT_TOKEN, value => '-'};
427            # stay in the state            # stay in the state
428            # reconsume            # reconsume
429            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
430            #redo A;            #redo A;
431          }          }
432        } else {        } else {
433          if ($current_token->{type} == NUMBER_TOKEN) {          if ($self->{t}->{type} == NUMBER_TOKEN) {
434            ## NOTE: |-| after |num|.            ## NOTE: |-| after |NUMBER|.
435            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
436            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
437            $self->{c} = $self->{get_char}->();            # reconsume
438            return $current_token;            $self->{t}->{value} = $self->{t}->{number};
439          } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {            delete $self->{t}->{number};
440            ## NOTE: |-| after |@|.            return $self->{t};
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
441          } else {          } else {
442            ## NOTE: |-| not followed by |nmstart|.            ## NOTE: |-| not followed by |nmstart|.
443            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
# Line 274  sub get_next_token ($) { Line 445  sub get_next_token ($) {
445            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '-'};
446          }          }
447        }        }
448        } elsif ($self->{state} == AFTER_AT_STATE) {
449          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
450              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
451              $self->{c} == 0x005F or # _
452              $self->{c} > 0x007F) { # nonascii
453            $self->{t}->{value} .= chr $self->{c};
454            $self->{state} = NAME_STATE;
455            $self->{c} = $self->{get_char}->();
456            redo A;
457          } elsif ($self->{c} == 0x002D) { # -
458            $self->{t}->{value} .= '-';
459            $self->{state} = AFTER_AT_HYPHEN_STATE;
460            $self->{c} = $self->{get_char}->();
461            redo A;
462          } elsif ($self->{c} == 0x005C) { # \
463            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
464            $self->{c} = $self->{get_char}->();
465            redo A;
466          } else {
467            $self->{state} = BEFORE_TOKEN_STATE;
468            # reprocess
469            return {type => DELIM_TOKEN, value => '@'};
470          }
471        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
472          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
473              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
474              $self->{c} == 0x005F or # _
475              $self->{c} > 0x007F) { # nonascii
476            $self->{t}->{value} .= chr $self->{c};
477            $self->{state} = NAME_STATE;
478            $self->{c} = $self->{get_char}->();
479            redo A;
480          } elsif ($self->{c} == 0x002D) { # -
481            $self->{c} = $self->{get_char}->();
482            if ($self->{c} == 0x003E) { # >
483              unshift @{$self->{token}}, {type => CDC_TOKEN};
484              $self->{state} = BEFORE_TOKEN_STATE;
485              $self->{c} = $self->{get_char}->();
486              return {type => DELIM_TOKEN, value => '@'};
487              #redo A;
488            } else {
489              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
490              $self->{t} = {type => IDENT_TOKEN, value => '-'};
491              $self->{state} = BEFORE_NMSTART_STATE;
492              # reprocess
493              return {type => DELIM_TOKEN, value => '@'};
494              #redo A;
495            }
496          } elsif ($self->{c} == 0x005C) { # \
497            ## TODO: @-\{nl}
498            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
499            $self->{c} = $self->{get_char}->();
500            redo A;
501          } else {
502            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
503            $self->{state} = BEFORE_TOKEN_STATE;
504            # reprocess
505            return {type => DELIM_TOKEN, value => '@'};
506          }
507      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
508        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
509          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
510          $current_token->{value} = '-';          $self->{t}->{value} = '-';
511          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
512          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
513          redo A;          redo A;
514        } elsif ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
515                 (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
516                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
517                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
518          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
519          $current_token->{value} = chr $self->{char};          $self->{t}->{value} = chr $self->{c};
520            $self->{t}->{type} = DIMENSION_TOKEN;
521          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
522          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
523          redo A;          redo A;
524        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
525          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
526          $current_token->{value} = '';          $self->{t}->{value} = '';
527          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
528          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
529          redo A;          redo A;
530        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
531          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
532          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
533          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
534          return $current_token;          return $self->{t};
535          #redo A;          #redo A;
536        } else {        } else {
537          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
538          # reprocess          # reprocess
539          return $current_token;          return $self->{t};
540          #redo A;          #redo A;
541        }        }
542      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
543        ## NOTE: The first |nmchar| in |name| in |HASH|.        ## NOTE: The first |nmchar| in |name| in |HASH|.
544        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
545            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
546            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
547            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
548            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
549            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
550          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
551          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
552          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
553          redo A;          redo A;
# Line 332  sub get_next_token ($) { Line 563  sub get_next_token ($) {
563        }        }
564      } elsif ($self->{state} == NAME_STATE) {      } elsif ($self->{state} == NAME_STATE) {
565        ## NOTE: |nmchar| in (|ident| or |name|).        ## NOTE: |nmchar| in (|ident| or |name|).
566        if ((0x0041 <= $self->{c} or $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
567            (0x0061 <= $self->{c} or $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
568            (0x0030 <= $self->{c} or $self->{c} <= 0x0039) or # 0..9            (0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
569            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
570            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
571            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
572          $current_token->{value} .= chr $self->{char};          $self->{t}->{value} .= chr $self->{c};
573          # stay in the state          # stay in the state
574          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
575          redo A;          redo A;
576        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
577          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
578          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
579          redo A;          redo A;
580        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
581                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
582          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
583              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
584               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
585              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
586            $current_token->{type} = URI_TOKEN;              ## TODO: warn
587              }
588              $self->{t}->{type}
589                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
590              $self->{t}->{value} = '';
591            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
592            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
593            redo A;            redo A;
594          } else {          } else {
595            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
596            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
597            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
598            return $current_token;            return $self->{t};
599            #redo A;            #redo A;
600          }          }
601        } else {        } else {
602          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
603          # reconsume          # reconsume
604          return $current_token;          return $self->{t};
605          #redo A;          #redo A;
606        }        }
607        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
608          while ({
609                    0x0020 => 1, # SP
610                    0x0009 => 1, # \t
611                    0x000D => 1, # \r
612                    0x000A => 1, # \n
613                    0x000C => 1, # \f
614                 }->{$self->{c}}) {
615            $self->{c} = $self->{get_char}->();
616          }
617          if ($self->{c} == -1) {
618            $self->{t}->{type} = {
619                URI_TOKEN, URI_INVALID_TOKEN,
620                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
621                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
622                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
623            }->{$self->{t}->{type}};        
624            $self->{state} = BEFORE_TOKEN_STATE;
625            $self->{c} = $self->{get_char}->();
626            return $self->{t};
627            #redo A;
628          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
629            ## TODO: Should we consider matches of "(" and ")"?
630            $self->{t}->{type} = {
631                URI_TOKEN, URI_INVALID_TOKEN,
632                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
633                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
634                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
635            }->{$self->{t}->{type}};
636            $self->{state} = URI_UNQUOTED_STATE;
637            $self->{c} = $self->{get_char}->();
638            redo A;
639          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
640            $self->{state} = STRING_STATE; $q = $self->{c};
641            $self->{c} = $self->{get_char}->();
642            redo A;
643          } elsif ($self->{c} == 0x0029) { # )
644            $self->{state} = BEFORE_TOKEN_STATE;
645            $self->{c} = $self->{get_char}->();
646            return $self->{t};
647            #redo A;
648          } elsif ($self->{c} == 0x005C) { # \
649            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
650            $self->{c} = $self->{get_char}->();
651            redo A;
652          } else {
653            $self->{t}->{value} .= chr $self->{c};
654            $self->{state} = URI_UNQUOTED_STATE;
655            $self->{c} = $self->{get_char}->();
656            redo A;
657          }
658        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
659          if ({
660               0x0020 => 1, # SP
661               0x0009 => 1, # \t
662               0x000D => 1, # \r
663               0x000A => 1, # \n
664               0x000C => 1, # \f
665              }->{$self->{c}}) {
666            $self->{state} = URI_AFTER_WSP_STATE;
667            $self->{c} = $self->{get_char}->();
668            redo A;
669          } elsif ($self->{c} == -1) {
670            $self->{t}->{type} = {
671                URI_TOKEN, URI_INVALID_TOKEN,
672                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
673                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
674                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
675            }->{$self->{t}->{type}};        
676            $self->{state} = BEFORE_TOKEN_STATE;
677            $self->{c} = $self->{get_char}->();
678            return $self->{t};
679            #redo A;
680          } elsif ($self->{c} < 0x0020 or {
681              0x0022 => 1, # "
682              0x0027 => 1, # '
683              0x0028 => 1, # (
684          }->{$self->{c}}) { # C0 or (
685            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
686            $self->{t}->{type} = {
687                URI_TOKEN, URI_INVALID_TOKEN,
688                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
689                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
690                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
691            }->{$self->{t}->{type}};
692            # stay in the state.
693            $self->{c} = $self->{get_char}->();
694            redo A;
695          } elsif ($self->{c} == 0x0029) { # )
696            $self->{state} = BEFORE_TOKEN_STATE;
697            $self->{c} = $self->{get_char}->();
698            return $self->{t};
699            #redo A;
700          } elsif ($self->{c} == 0x005C) { # \
701            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
702            $self->{c} = $self->{get_char}->();
703            redo A;
704          } else {
705            $self->{t}->{value} .= chr $self->{c};
706            # stay in the state.
707            $self->{c} = $self->{get_char}->();
708            redo A;
709          }
710        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
711          if ({
712               0x0020 => 1, # SP
713               0x0009 => 1, # \t
714               0x000D => 1, # \r
715               0x000A => 1, # \n
716               0x000C => 1, # \f
717              }->{$self->{c}}) {
718            # stay in the state.
719            $self->{c} = $self->{get_char}->();
720            redo A;
721          } elsif ($self->{c} == -1) {
722            $self->{t}->{type} = {
723                URI_TOKEN, URI_INVALID_TOKEN,
724                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
725                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
726                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
727            }->{$self->{t}->{type}};        
728            $self->{state} = BEFORE_TOKEN_STATE;
729            $self->{c} = $self->{get_char}->();
730            return $self->{t};
731            #redo A;
732          } elsif ($self->{c} == 0x0029) { # )
733            $self->{state} = BEFORE_TOKEN_STATE;
734            $self->{c} = $self->{get_char}->();
735            return $self->{t};
736            #redo A;
737          } elsif ($self->{c} == 0x005C) { # \
738            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
739            $self->{c} = $self->{get_char}->();
740            redo A;
741          } else {
742            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
743            $self->{t}->{type} = {
744                URI_TOKEN, URI_INVALID_TOKEN,
745                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
746                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
747                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
748            }->{$self->{t}->{type}};
749            # stay in the state.
750            $self->{c} = $self->{get_char}->();
751            redo A;
752          }
753      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
754        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
755        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
756          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
757          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 392  sub get_next_token ($) { Line 764  sub get_next_token ($) {
764          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
765          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
766          redo A;          redo A;
767        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
768          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
769          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 - 0xA;
770          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
# Line 404  sub get_next_token ($) { Line 776  sub get_next_token ($) {
776            ## NOTE: In |escape| in ... in |ident|.            ## NOTE: In |escape| in ... in |ident|.
777            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
778            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
779            return $current_token;            return $self->{t};
780            # reconsume            # reconsume
781            #redo A;            #redo A;
782            } elsif ($q == 1) {
783              ## NOTE: In |escape| in |URI|.
784              $self->{t}->{type} = {
785                  URI_TOKEN, URI_INVALID_TOKEN,
786                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
787                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
788                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
789              }->{$self->{t}->{type}};
790              $self->{t}->{value} .= chr $self->{c};
791              $self->{state} = URI_UNQUOTED_STATE;
792              $self->{c} = $self->{get_char}->();
793              redo A;
794          } else {          } else {
795            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
796            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
797            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
798            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
799            redo A;            redo A;
# Line 419  sub get_next_token ($) { Line 803  sub get_next_token ($) {
803            ## NOTE: In |escape| in ... in |ident|.            ## NOTE: In |escape| in ... in |ident|.
804            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
805            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
806            return $current_token;            return $self->{t};
807            # reconsume            # reconsume
808            #redo A;            #redo A;
809            } elsif ($q == 1) {
810              $self->{t}->{type} = {
811                  URI_TOKEN, URI_INVALID_TOKEN,
812                  URI_INVALID_TOKEN, URI_INVALID_TOKEN,
813                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
814                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
815              }->{$self->{t}->{type}};
816              $self->{t}->{value} .= "\x0D\x0A";
817              $self->{state} = URI_UNQUOTED_STATE;
818              $self->{c} = $self->{get_char}->();
819              redo A;
820          } else {          } else {
821            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
822            $current_token->{value} .= "\x0D\x0A";            $self->{t}->{value} .= "\x0D\x0A";
823            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
824            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
825            redo A;            redo A;
826          }          }
827        } else {        } else {
828          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
829          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
830          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
831                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
832          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
833          redo A;          redo A;
834        }        }
# Line 448  sub get_next_token ($) { Line 844  sub get_next_token ($) {
844          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
845          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
846          redo A;          redo A;
847        } elsif (0x0061 <= $self->{c} or $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
848          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;
849          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
850          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 457  sub get_next_token ($) { Line 853  sub get_next_token ($) {
853                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
854                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
855                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
856          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
857          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
858                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
859          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
860          redo A;          redo A;
861        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 466  sub get_next_token ($) { Line 863  sub get_next_token ($) {
863          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
864          redo A;          redo A;
865        } else {        } else {
866          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
867          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
868                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
869          # reconsume          # reconsume
870          redo A;          redo A;
871        }        }
# Line 477  sub get_next_token ($) { Line 875  sub get_next_token ($) {
875            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
876            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
877            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
878          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
879          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
880                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
881          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
882          redo A;          redo A;
883        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 486  sub get_next_token ($) { Line 885  sub get_next_token ($) {
885          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
886          redo A;          redo A;
887        } else {        } else {
888          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
889          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
890                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
891          # reconsume          # reconsume
892          redo A;          redo A;
893        }        }
894      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
895        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
896        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
897          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
898          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
899                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
900          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
901          redo A;          redo A;
902        } else {        } else {
903          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
904          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
905                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
906          # reconsume          # reconsume
907          redo A;          redo A;
908        }        }
# Line 508  sub get_next_token ($) { Line 910  sub get_next_token ($) {
910        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
911        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
912        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
913          ## Or, in |URI|.
914        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
915          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
916          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
917          redo A;          redo A;
918        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
919          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
920          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
921          return $current_token;            $self->{c} = $self->{get_char}->();
922          #redo A;            return $self->{t};
923              #redo A;
924            } else {
925              $self->{state} = URI_AFTER_WSP_STATE;
926              $self->{c} = $self->{get_char}->();
927              redo A;
928            }
929        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
930                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
931                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
932                 $self->{c} == -1) {                 $self->{c} == -1) {
933          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = INVALID_TOKEN;
934          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
935          # reconsume          # reconsume
936          return $current_token;          return $self->{t};
937          #redo A;          #redo A;
938        } else {        } else {
939          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
940          # stay in the state          # stay in the state
941          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
942          redo A;          redo A;
# Line 535  sub get_next_token ($) { Line 944  sub get_next_token ($) {
944      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
945        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
946        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
947          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
948          # stay in the state          # stay in the state
949          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
950          redo A;          redo A;
# Line 544  sub get_next_token ($) { Line 953  sub get_next_token ($) {
953          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
954          redo A;          redo A;
955        } else {        } else {
956          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
957          $self->{value} = '';          $self->{t}->{value} = '';
958          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
959          # reprocess          # reprocess
960          return $current_token;          redo A;
         #redo A;  
961        }        }
962      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
963        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
964        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
965          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
966          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
967          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
968          redo A;          redo A;
969        } else {        } else {
970          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};
971          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
972          $self->{value} = '';          $self->{t}->{value} = '';
973          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
974          # reprocess          # reprocess
975          return $current_token;          return $self->{t};
976          #redo A;          #redo A;
977        }        }
978      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
979        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
980        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
981          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
982          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
983          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
984          redo A;          redo A;
# Line 583  sub get_next_token ($) { Line 991  sub get_next_token ($) {
991      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
992        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
993        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
994          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
995          # stay in the state          # stay in the state
996          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
997          redo A;          redo A;
998        } else {        } else {
999          $self->{number} = $self->{value};          $self->{t}->{number} = $self->{t}->{value};
1000          $self->{value} = '';          $self->{t}->{value} = '';
1001          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1002          # reprocess          # reprocess
1003          return $current_token;          redo A;
         #redo A;  
1004        }        }
1005      } else {      } else {
1006        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1007      }      }
1008    } # A    } # A
   
   ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  
   
1009  } # get_next_token  } # get_next_token
1010    
1011  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.5

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24