/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.2 by wakaba, Sat Sep 8 01:31:44 2007 UTC | revision 1.12 by wakaba, Sat Sep 8 15:43:12 2007 UTC
# Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 } Line 17  sub ESCAPE_BEFORE_NL_STATE () { 12 }
17  sub NUMBER_DOT_STATE () { 13 }  sub NUMBER_DOT_STATE () { 13 }
18  sub NUMBER_DOT_NUMBER_STATE () { 14 }  sub NUMBER_DOT_NUMBER_STATE () { 14 }
19  sub DELIM_STATE () { 15 }  sub DELIM_STATE () { 15 }
20    sub URI_UNQUOTED_STATE () { 16 }
21    sub URI_AFTER_WSP_STATE () { 17 }
22    sub AFTER_AT_STATE () { 18 }
23    sub AFTER_AT_HYPHEN_STATE () { 19 }
24    
25  sub IDENT_TOKEN () { 1 }  sub IDENT_TOKEN () { 1 }
26  sub ATKEYWORD_TOKEN () { 2 }  sub ATKEYWORD_TOKEN () { 2 }
# Line 32  sub NUMBER_TOKEN () { 11 } Line 36  sub NUMBER_TOKEN () { 11 }
36  sub DIMENSION_TOKEN () { 12 }  sub DIMENSION_TOKEN () { 12 }
37  sub PERCENTAGE_TOKEN () { 13 }  sub PERCENTAGE_TOKEN () { 13 }
38  sub UNICODE_RANGE_TOKEN () { 14 }  sub UNICODE_RANGE_TOKEN () { 14 }
 sub UNICODE_RANGE_INVALID_TOKEN () { 15 }  
39  sub DELIM_TOKEN () { 16 }  sub DELIM_TOKEN () { 16 }
40  sub PLUS_TOKEN () { 17 }  sub PLUS_TOKEN () { 17 }
41  sub GREATER_TOKEN () { 18 }  sub GREATER_TOKEN () { 18 }
# Line 58  sub COMMENT_INVALID_TOKEN () { 37 } Line 61  sub COMMENT_INVALID_TOKEN () { 37 }
61  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
62    
63  our @TokenName = qw(  our @TokenName = qw(
64    0 IDENT ATKWTWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
65    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE    STRING INVALID NUMBER DIMENSION PERCENTAGE UNICODE_RANGE
66    UNICODE_RANGE_INVALID DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
67    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
68    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
69    COMMENT_INVALID EOF    COMMENT_INVALID EOF
# Line 76  sub init ($) { Line 79  sub init ($) {
79    my $self = shift;    my $self = shift;
80    $self->{state} = BEFORE_TOKEN_STATE;    $self->{state} = BEFORE_TOKEN_STATE;
81    $self->{c} = $self->{get_char}->();    $self->{c} = $self->{get_char}->();
82      #$self->{t} = {type => token-type, value => value, number => number};
83  } # init  } # init
84    
85  sub get_next_token ($) {  sub get_next_token ($) {
# Line 84  sub get_next_token ($) { Line 88  sub get_next_token ($) {
88      return shift @{$self->{token}};      return shift @{$self->{token}};
89    }    }
90    
   my $current_token;  
91    my $char;    my $char;
92    my $num; # |{num}|, if any.    my $num; # |{num}|, if any.
93    my $i; # |$i + 1|th character in |unicode| in |escape|.    my $i; # |$i + 1|th character in |unicode| in |escape|.
94    my $q; # |$q == 0 ? "in |ident|" : "in |string$q| or in |invalid$q|"|    my $q;
95          ## NOTE:
96          ##   0: in |ident|.
97          ##   1: in |URI| outside of |string|.
98          ##   0x0022: in |string1| or |invalid1|.
99          ##   0x0027: in |string2| or |invalid2|.
100    
101    A: {    A: {
102      if ($self->{state} == BEFORE_TOKEN_STATE) {      if ($self->{state} == BEFORE_TOKEN_STATE) {
103        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
104          ## NOTE: |-| in |ident| in |IDENT|          ## NOTE: |-| in |ident| in |IDENT|
105          $current_token = {type => IDENT_TOKEN, value => '-'};          $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
106          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
107          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
108          redo A;          redo A;
109          } elsif ($self->{c} == 0x0055 or $self->{c} == 0x0075) { # U or u
110            $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
111            $self->{c} = $self->{get_char}->();
112            if ($self->{c} == 0x002B) { # +
113              $self->{c} = $self->{get_char}->();
114              if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
115                  (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
116                  (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
117                  $self->{c} == 0x003F) { # ?
118                $self->{t}->{value} = chr $self->{c};
119                $self->{t}->{type} = UNICODE_RANGE_TOKEN;
120                $self->{c} = $self->{get_char}->();
121                C: for (2..6) {
122                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
123                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
124                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
125                      $self->{c} == 0x003F) { # ?
126                    $self->{t}->{value} .= chr $self->{c};
127                    $self->{c} = $self->{get_char}->();
128                  } else {
129                    last C;
130                  }
131                } # C
132    
133                if ($self->{c} == 0x002D) { # -
134                  $self->{c} = $self->{get_char}->();
135                  if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
136                      (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
137                      (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
138                    $self->{t}->{value} .= '-' . chr $self->{c};
139                    $self->{c} = $self->{get_char}->();
140                    C: for (2..6) {
141                      if ((0x0030 <= $self->{c} and $self->{c} <= 0x0039) or # 0..9
142                          (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
143                          (0x0061 <= $self->{c} and $self->{c} <= 0x0066)) { # a..f
144                        $self->{t}->{value} .= chr $self->{c};
145                        $self->{c} = $self->{get_char}->();
146                      } else {
147                        last C;
148                      }
149                    } # C
150                    
151                    #
152                  } else {
153                    my $token = $self->{t};
154                    $self->{t} = {type => IDENT_TOKEN, value => '-'};
155                    $self->{state} = BEFORE_NMSTART_STATE;
156                    # reprocess
157                    return $token;
158                    #redo A;
159                  }
160                }
161    
162                $self->{state} = BEFORE_TOKEN_STATE;
163                # reprocess
164                return $self->{t};
165                #redo A;
166              } else {
167                unshift @{$self->{token}}, {type => PLUS_TOKEN};
168                $self->{state} = BEFORE_TOKEN_STATE;
169                # reprocess
170                return $self->{t};
171                #redo A;
172              }
173            } else {
174              $self->{state} = NAME_STATE;
175              # reprocess
176              redo A;
177            }
178        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        } elsif ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
179                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z                 (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
180                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
181                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
182          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
183          $current_token = {type => IDENT_TOKEN, value => chr $self->{c}};          $self->{t} = {type => IDENT_TOKEN, value => chr $self->{c}};
184          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
185          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
186          redo A;          redo A;
187        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
188          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
189          $current_token = {type => IDENT_TOKEN, value => ''};          $self->{t} = {type => IDENT_TOKEN, value => ''};
190          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
191          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
192          redo A;          redo A;
193        } elsif ($self->{c} == 0x0040) { # @        } elsif ($self->{c} == 0x0040) { # @
194          ## NOTE: |@| in |ATKEYWORD|          ## NOTE: |@| in |ATKEYWORD|
195          $current_token = {type => ATKEYWORD_TOKEN, value => ''};          $self->{t} = {type => ATKEYWORD_TOKEN, value => ''};
196          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = AFTER_AT_STATE;
         $self->{c} = $self->{get_char}->();  
         redo A;  
       } elsif ($self->{c} == 0x0022) { # "  
         ## NOTE: |"| in |string1| in |string| in |STRING|, or  
         ## |"| in |invalid1| in |invalid| in |INVALID|.  
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 1;  
197          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
198          redo A;          redo A;
199        } elsif ($self->{c} == 0x0027) { # '        } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
200          ## NOTE: |'| in |string2| in |string| in |STRING|, or          $self->{t} = {type => STRING_TOKEN, value => ''};
201          ## |'| in |invalid2| in |invalid| in |INVALID|.          $self->{state} = STRING_STATE; $q = $self->{c};
         $current_token = {type => STRING_TOKEN, value => ''};  
         $self->{state} = STRING_STATE; $q = 2;  
202          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
203          redo A;          redo A;
204        } elsif ($self->{c} == 0x0023) { # #        } elsif ($self->{c} == 0x0023) { # #
205          ## NOTE: |#| in |HASH|.          ## NOTE: |#| in |HASH|.
206          $current_token = {type => HASH_TOKEN, value => ''};          $self->{t} = {type => HASH_TOKEN, value => ''};
207          $self->{state} = HASH_OPEN_STATE;          $self->{state} = HASH_OPEN_STATE;
208          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
209          redo A;          redo A;
210        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        } elsif (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
211          ## NOTE: |num|.          ## NOTE: |num|.
212          $current_token = {type => NUMBER_TOKEN, value => chr $self->{c}};          $self->{t} = {type => NUMBER_TOKEN, value => chr $self->{c}};
213          $self->{state} = NUMBER_STATE;          $self->{state} = NUMBER_STATE;
214          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
215          redo A;          redo A;
216        } elsif ($self->{c} == 0x002E) { # .        } elsif ($self->{c} == 0x002E) { # .
217          ## NOTE: |num|.          ## NOTE: |num|.
218          $current_token = {type => NUMBER_TOKEN, value => '0'};          $self->{t} = {type => NUMBER_TOKEN, value => '0'};
219          $self->{state} = NUMBER_FRACTION_STATE;          $self->{state} = NUMBER_FRACTION_STATE;
220          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
221          redo A;          redo A;
222          } elsif ($self->{c} == 0x002F) { # /
223            $self->{c} = $self->{get_char}->();
224            if ($self->{c} == 0x002A) { # *
225              C: {
226                $self->{c} = $self->{get_char}->();
227                if ($self->{c} == 0x002A) { # *
228                  D: {
229                    $self->{c} = $self->{get_char}->();
230                    if ($self->{c} == 0x002F) { # /
231                      #
232                    } elsif ($self->{c} == 0x002A) { # *
233                      redo D;
234                    } else {
235                      redo C;
236                    }
237                  } # D
238                } elsif ($self->{c} == -1) {
239                  # stay in the state
240                  # reprocess
241                  return {type => COMMENT_INVALID_TOKEN};
242                  #redo A;
243                } else {
244                  redo C;
245                }
246              } # C
247    
248              # stay in the state.
249              $self->{c} = $self->{get_char}->();
250              redo A;
251            } else {
252              # stay in the state.
253              # reprocess
254              return {type => DELIM_TOKEN, value => '/'};
255              #redo A;
256            }        
257        } elsif ($self->{c} == 0x003C) { # <        } elsif ($self->{c} == 0x003C) { # <
258          ## NOTE: |CDO|          ## NOTE: |CDO|
259          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
260          if ($self->{c} == 0x0021) { # !          if ($self->{c} == 0x0021) { # !
261            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
262            if ($self->{c} == 0x002C) { # -            if ($self->{c} == 0x002D) { # -
263              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->();
264              if ($self->{c} == 0x002C) { # -              if ($self->{c} == 0x002D) { # -
265                $self->{state} = BEFORE_TOKEN_STATE;                $self->{state} = BEFORE_TOKEN_STATE;
266                $self->{c} = $self->{get_char}->();                $self->{c} = $self->{get_char}->();
267                return {type => CDO_TOKEN};                return {type => CDO_TOKEN};
# Line 166  sub get_next_token ($) { Line 269  sub get_next_token ($) {
269              } else {              } else {
270                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};
271                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
272                $current_token = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
273                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
274                #reprocess                #reprocess
275                return {type => DELIM_TOKEN, value => '<'};                return {type => DELIM_TOKEN, value => '<'};
# Line 286  sub get_next_token ($) { Line 389  sub get_next_token ($) {
389          #redo A;          #redo A;
390        } else {        } else {
391          # stay in the state          # stay in the state
392          $current_token = {type => DELIM_TOKEN, value => chr $self->{c}};          $self->{t} = {type => DELIM_TOKEN, value => chr $self->{c}};
393          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
394          return $current_token;          return $self->{t};
395          #redo A;          #redo A;
396        }        }
397      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {      } elsif ($self->{state} == BEFORE_NMSTART_STATE) {
398        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or |ATKEYWORD|)        ## NOTE: |nmstart| in |ident| in (|IDENT|, |DIMENSION|, or
399          ## |FUNCTION|)
400          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
401              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
402              $self->{c} == 0x005F or # _
403              $self->{c} > 0x007F) { # nonascii
404            $self->{t}->{value} .= chr $self->{c};
405            $self->{t}->{type} = DIMENSION_TOKEN
406                if $self->{t}->{type} == NUMBER_TOKEN;
407            $self->{state} = NAME_STATE;
408            $self->{c} = $self->{get_char}->();
409            redo A;
410          } elsif ($self->{c} == 0x005C) { # \
411            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
412            $self->{c} = $self->{get_char}->();
413            redo A;
414          } elsif ($self->{c} == 0x002D) { # -
415            if ($self->{t}->{type} == IDENT_TOKEN) {
416              $self->{c} = $self->{get_char}->();
417              if ($self->{c} == 0x003E) { # >
418                $self->{state} = BEFORE_TOKEN_STATE;
419                $self->{c} = $self->{get_char}->();
420                return {type => CDC_TOKEN};
421                #redo A;
422              } else {
423                ## NOTE: |-|, |-|, $self->{c}
424                #$self->{t} = {type => IDENT_TOKEN, value => '-'};
425                # stay in the state
426                # reconsume
427                return {type => DELIM_TOKEN, value => '-'};
428                #redo A;
429              }
430            } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
431              $self->{c} = $self->{get_char}->();
432              if ($self->{c} == 0x003E) { # >
433                unshift @{$self->{token}}, {type => CDC_TOKEN};
434                $self->{t}->{type} = NUMBER_TOKEN;
435                $self->{t}->{value} = '';
436                $self->{state} = BEFORE_TOKEN_STATE;
437                $self->{c} = $self->{get_char}->();
438                return $self->{t};
439                #redo A;
440              } else {
441                ## NOTE: |-|, |-|, $self->{c}
442                my $t = $self->{t};
443                $t->{type} = NUMBER_TOKEN;
444                $t->{value} = '';
445                $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
446                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
447                # stay in the state
448                # reconsume
449                return $t;
450                #redo A;
451              }
452            } else {
453              #
454            }
455          } else {
456            #
457          }
458          
459          if ($self->{t}->{type} == DIMENSION_TOKEN) {
460            ## NOTE: |-| after |NUMBER|.
461            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
462            $self->{state} = BEFORE_TOKEN_STATE;
463            # reprocess
464            $self->{t}->{type} = NUMBER_TOKEN;
465            $self->{t}->{value} = '';
466            return $self->{t};
467          } else {
468            ## NOTE: |-| not followed by |nmstart|.
469            $self->{state} = BEFORE_TOKEN_STATE;
470            # reprocess
471            return {type => DELIM_TOKEN, value => '-'};
472          }
473        } elsif ($self->{state} == AFTER_AT_STATE) {
474        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
475            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z            (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
476            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
477            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
478          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
         $current_token->{type} = DIMENSION_TOKEN  
             if $current_token->{type} == NUMBER_TOKEN;  
479          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
480          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
481          redo A;          redo A;
482          } elsif ($self->{c} == 0x002D) { # -
483            $self->{t}->{value} .= '-';
484            $self->{state} = AFTER_AT_HYPHEN_STATE;
485            $self->{c} = $self->{get_char}->();
486            redo A;
487        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
 ## TODO: 12-\X, 12-\{nl}  
488          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
489          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
490          redo A;          redo A;
491        } elsif ($self->{c} == 0x002D and # -        } else {
492                 $current_token->{type} == IDENT_TOKEN) {          $self->{state} = BEFORE_TOKEN_STATE;
493            # reprocess
494            return {type => DELIM_TOKEN, value => '@'};
495          }
496        } elsif ($self->{state} == AFTER_AT_HYPHEN_STATE) {
497          if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
498              (0x0061 <= $self->{c} and $self->{c} <= 0x007A) or # a..z
499              $self->{c} == 0x005F or # _
500              $self->{c} > 0x007F) { # nonascii
501            $self->{t}->{value} .= chr $self->{c};
502            $self->{state} = NAME_STATE;
503            $self->{c} = $self->{get_char}->();
504            redo A;
505          } elsif ($self->{c} == 0x002D) { # -
506          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
507          if ($self->{c} == 0x003E) { # >          if ($self->{c} == 0x003E) { # >
508              unshift @{$self->{token}}, {type => CDC_TOKEN};
509            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
510            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
511            return {type => CDC_TOKEN};            return {type => DELIM_TOKEN, value => '@'};
512            #redo A;            #redo A;
513          } else {          } else {
514            ## NOTE: |-|, |-|, $self->{c}            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
515            #$current_token = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
516            # stay in the state            $self->{state} = BEFORE_NMSTART_STATE;
517            # reconsume            # reprocess
518            return {type => DELIM_TOKEN, value => '-'};            return {type => DELIM_TOKEN, value => '@'};
519            #redo A;            #redo A;
520          }          }
521          } elsif ($self->{c} == 0x005C) { # \
522            ## TODO: @-\{nl}
523            $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
524            $self->{c} = $self->{get_char}->();
525            redo A;
526        } else {        } else {
527          if ($current_token->{type} == NUMBER_TOKEN) {          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
528            ## NOTE: |-| after |num|.          $self->{state} = BEFORE_TOKEN_STATE;
529            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          # reprocess
530            $self->{state} = BEFORE_TOKEN_STATE;          return {type => DELIM_TOKEN, value => '@'};
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == ATKEYWORD_TOKEN) {  
           ## NOTE: |-| after |@|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '@'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
         } elsif ($current_token->{type} == NUMBER_TOKEN) {  
           ## NOTE: |-| after |NUMBER|.  
           unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};  
           $self->{state} = BEFORE_TOKEN_STATE;  
           # reconsume  
           $current_token->{value} = $current_token->{number};  
           delete $current_token->{number};  
           return $current_token;  
         } else {  
           ## NOTE: |-| not followed by |nmstart|.  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return {type => DELIM_TOKEN, value => '-'};  
         }  
531        }        }
532      } elsif ($self->{state} == AFTER_NUMBER_STATE) {      } elsif ($self->{state} == AFTER_NUMBER_STATE) {
533        if ($self->{c} == 0x002D) { # -        if ($self->{c} == 0x002D) { # -
534          ## NOTE: |-| in |ident|.          ## NOTE: |-| in |ident|.
535          $current_token->{value} = '-';          $self->{t}->{hyphen} = 1;
536            $self->{t}->{value} = '-';
537            $self->{t}->{type} = DIMENSION_TOKEN;
538          $self->{state} = BEFORE_NMSTART_STATE;          $self->{state} = BEFORE_NMSTART_STATE;
539          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
540          redo A;          redo A;
# Line 364  sub get_next_token ($) { Line 543  sub get_next_token ($) {
543                 $self->{c} == 0x005F or # _                 $self->{c} == 0x005F or # _
544                 $self->{c} > 0x007F) { # nonascii                 $self->{c} > 0x007F) { # nonascii
545          ## NOTE: |nmstart| in |ident|.          ## NOTE: |nmstart| in |ident|.
546          $current_token->{value} = chr $self->{c};          $self->{t}->{value} = chr $self->{c};
547          $current_token->{type} = DIMENSION_TOKEN;          $self->{t}->{type} = DIMENSION_TOKEN;
548          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
549          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
550          redo A;          redo A;
551        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
552          ## NOTE: |nmstart| in |ident| in |IDENT|          ## NOTE: |nmstart| in |ident| in |IDENT|
553          $current_token->{value} = '';          $self->{t}->{value} = '';
554            $self->{t}->{type} = DIMENSION_TOKEN;
555          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
556          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
557          redo A;          redo A;
558        } elsif ($self->{c} == 0x0025) { # %        } elsif ($self->{c} == 0x0025) { # %
559          $current_token->{type} = PERCENTAGE_TOKEN;          $self->{t}->{type} = PERCENTAGE_TOKEN;
560          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
561          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
562          return $current_token;          return $self->{t};
563          #redo A;          #redo A;
564        } else {        } else {
565          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
566          # reprocess          # reprocess
567          return $current_token;          return $self->{t};
568          #redo A;          #redo A;
569        }        }
570      } elsif ($self->{state} == HASH_OPEN_STATE) {      } elsif ($self->{state} == HASH_OPEN_STATE) {
# Line 395  sub get_next_token ($) { Line 575  sub get_next_token ($) {
575            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
576            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
577            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
578          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
579          $self->{state} = NAME_STATE;          $self->{state} = NAME_STATE;
580          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
581          redo A;          redo A;
# Line 405  sub get_next_token ($) { Line 585  sub get_next_token ($) {
585          redo A;          redo A;
586        } else {        } else {
587          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
588          $self->{c} = $self->{get_char}->();          # reprocess
589          return {type => DELIM_TOKEN, value => '#'};          return {type => DELIM_TOKEN, value => '#'};
590          #redo A;          #redo A;
591        }        }
# Line 417  sub get_next_token ($) { Line 597  sub get_next_token ($) {
597            $self->{c} == 0x005F or # _            $self->{c} == 0x005F or # _
598            $self->{c} == 0x002D or # -            $self->{c} == 0x002D or # -
599            $self->{c} > 0x007F) { # nonascii            $self->{c} > 0x007F) { # nonascii
600          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
601          # stay in the state          # stay in the state
602          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
603          redo A;          redo A;
604        } elsif ($self->{c} == 0x005C) { # \        } elsif ($self->{c} == 0x005C) { # \
605          $self->{state} = ESCAPE_OPEN_STATE; # $q = 0;          $self->{state} = ESCAPE_OPEN_STATE; $q = 0;
606          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
607          redo A;          redo A;
608        } elsif ($self->{c} == 0x0028 and # (        } elsif ($self->{c} == 0x0028 and # (
609                 $current_token->{type} == IDENT_TOKEN) { # (                 $self->{t}->{type} == IDENT_TOKEN) { # (
610          if (not $current_token->{has_escape} and          my $func_name = $self->{t}->{value};
611              {url => 1, Url => 1, uRl => 1, urL => 1,          $func_name =~ tr/A-Z/a-z/; ## TODO: Unicode or ASCII case-insensitive?
612               URl => 1, UrL => 1, uRL => 1, URL => 1}          if ($func_name eq 'url' or $func_name eq 'url-prefix') {
613              ->{$current_token->{value}}) {            if ($self->{t}->{has_escape}) {
614            $current_token->{type} = URI_TOKEN;              ## TODO: warn
615              }
616              $self->{t}->{type}
617                  = $func_name eq 'url' ? URI_TOKEN : URI_PREFIX_TOKEN;
618              $self->{t}->{value} = '';
619            $self->{state} = URI_BEFORE_WSP_STATE;            $self->{state} = URI_BEFORE_WSP_STATE;
620            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
   
           ## NOTE: This version of the tokenizer does not support the |URI|  
           ## token type.  Note that browsers disagree in how to tokenize  
           ## |url| function.  
           $current_token->{type} = FUNCTION_TOKEN;  
           $self->{state} = BEFORE_TOKEN_STATE;  
           $self->{c} = $self->{get_char}->();  
           return $current_token;  
   
621            redo A;            redo A;
622          } else {          } else {
623            $current_token->{type} = FUNCTION_TOKEN;            $self->{t}->{type} = FUNCTION_TOKEN;
624            $self->{state} = BEFORE_TOKEN_STATE;            $self->{state} = BEFORE_TOKEN_STATE;
625            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
626            return $current_token;            return $self->{t};
627            #redo A;            #redo A;
628          }          }
629        } else {        } else {
630          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
631          # reconsume          # reconsume
632          return $current_token;          return $self->{t};
633          #redo A;          #redo A;
634        }        }
635        } elsif ($self->{state} == URI_BEFORE_WSP_STATE) {
636          while ({
637                    0x0020 => 1, # SP
638                    0x0009 => 1, # \t
639                    0x000D => 1, # \r
640                    0x000A => 1, # \n
641                    0x000C => 1, # \f
642                 }->{$self->{c}}) {
643            $self->{c} = $self->{get_char}->();
644          }
645          if ($self->{c} == -1) {
646            $self->{t}->{type} = {
647                URI_TOKEN, URI_INVALID_TOKEN,
648                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
649                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
650                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
651            }->{$self->{t}->{type}};        
652            $self->{state} = BEFORE_TOKEN_STATE;
653            $self->{c} = $self->{get_char}->();
654            return $self->{t};
655            #redo A;
656          } elsif ($self->{c} < 0x0020 or $self->{c} == 0x0028) { # C0 or (
657            ## TODO: Should we consider matches of "(" and ")"?
658            $self->{t}->{type} = {
659                URI_TOKEN, URI_INVALID_TOKEN,
660                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
661                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
662                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
663            }->{$self->{t}->{type}};
664            $self->{state} = URI_UNQUOTED_STATE;
665            $self->{c} = $self->{get_char}->();
666            redo A;
667          } elsif ($self->{c} == 0x0022 or $self->{c} == 0x0027) { # " or '
668            $self->{state} = STRING_STATE; $q = $self->{c};
669            $self->{c} = $self->{get_char}->();
670            redo A;
671          } elsif ($self->{c} == 0x0029) { # )
672            $self->{state} = BEFORE_TOKEN_STATE;
673            $self->{c} = $self->{get_char}->();
674            return $self->{t};
675            #redo A;
676          } elsif ($self->{c} == 0x005C) { # \
677            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
678            $self->{c} = $self->{get_char}->();
679            redo A;
680          } else {
681            $self->{t}->{value} .= chr $self->{c};
682            $self->{state} = URI_UNQUOTED_STATE;
683            $self->{c} = $self->{get_char}->();
684            redo A;
685          }
686        } elsif ($self->{state} == URI_UNQUOTED_STATE) {
687          if ({
688               0x0020 => 1, # SP
689               0x0009 => 1, # \t
690               0x000D => 1, # \r
691               0x000A => 1, # \n
692               0x000C => 1, # \f
693              }->{$self->{c}}) {
694            $self->{state} = URI_AFTER_WSP_STATE;
695            $self->{c} = $self->{get_char}->();
696            redo A;
697          } elsif ($self->{c} == -1) {
698            $self->{t}->{type} = {
699                URI_TOKEN, URI_INVALID_TOKEN,
700                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
701                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
702                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
703            }->{$self->{t}->{type}};        
704            $self->{state} = BEFORE_TOKEN_STATE;
705            $self->{c} = $self->{get_char}->();
706            return $self->{t};
707            #redo A;
708          } elsif ($self->{c} < 0x0020 or {
709              0x0022 => 1, # "
710              0x0027 => 1, # '
711              0x0028 => 1, # (
712          }->{$self->{c}}) { # C0 or (
713            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
714            $self->{t}->{type} = {
715                URI_TOKEN, URI_INVALID_TOKEN,
716                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
717                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
718                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
719            }->{$self->{t}->{type}};
720            # stay in the state.
721            $self->{c} = $self->{get_char}->();
722            redo A;
723          } elsif ($self->{c} == 0x0029) { # )
724            $self->{state} = BEFORE_TOKEN_STATE;
725            $self->{c} = $self->{get_char}->();
726            return $self->{t};
727            #redo A;
728          } elsif ($self->{c} == 0x005C) { # \
729            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
730            $self->{c} = $self->{get_char}->();
731            redo A;
732          } else {
733            $self->{t}->{value} .= chr $self->{c};
734            # stay in the state.
735            $self->{c} = $self->{get_char}->();
736            redo A;
737          }
738        } elsif ($self->{state} == URI_AFTER_WSP_STATE) {
739          if ({
740               0x0020 => 1, # SP
741               0x0009 => 1, # \t
742               0x000D => 1, # \r
743               0x000A => 1, # \n
744               0x000C => 1, # \f
745              }->{$self->{c}}) {
746            # stay in the state.
747            $self->{c} = $self->{get_char}->();
748            redo A;
749          } elsif ($self->{c} == -1) {
750            $self->{t}->{type} = {
751                URI_TOKEN, URI_INVALID_TOKEN,
752                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
753                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
754                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
755            }->{$self->{t}->{type}};        
756            $self->{state} = BEFORE_TOKEN_STATE;
757            $self->{c} = $self->{get_char}->();
758            return $self->{t};
759            #redo A;
760          } elsif ($self->{c} == 0x0029) { # )
761            $self->{state} = BEFORE_TOKEN_STATE;
762            $self->{c} = $self->{get_char}->();
763            return $self->{t};
764            #redo A;
765          } elsif ($self->{c} == 0x005C) { # \
766            $self->{state} = ESCAPE_OPEN_STATE; $q = 1;
767            $self->{c} = $self->{get_char}->();
768            redo A;
769          } else {
770            ## TODO: Should we consider matches of "(" and ")", '"', or "'"?
771            $self->{t}->{type} = {
772                URI_TOKEN, URI_INVALID_TOKEN,
773                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
774                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
775                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
776            }->{$self->{t}->{type}};
777            # stay in the state.
778            $self->{c} = $self->{get_char}->();
779            redo A;
780          }
781      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {      } elsif ($self->{state} == ESCAPE_OPEN_STATE) {
782        $current_token->{has_escape} = 1;        $self->{t}->{has_escape} = 1;
783        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) { # 0..9
784          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
785          $char = $self->{c} - 0x0030;          $char = $self->{c} - 0x0030;
# Line 473  sub get_next_token ($) { Line 794  sub get_next_token ($) {
794          redo A;          redo A;
795        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
796          ## NOTE: second character of |unicode| in |escape|.          ## NOTE: second character of |unicode| in |escape|.
797          $char = $self->{c} - 0x0061 - 0xA;          $char = $self->{c} - 0x0061 + 0xA;
798          $self->{state} = ESCAPE_STATE; $i = 2;          $self->{state} = ESCAPE_STATE; $i = 2;
799          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
800          redo A;          redo A;
801        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
802                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
803          if ($q == 0) {          if ($q == 0) {
804            ## NOTE: In |escape| in ... in |ident|.            #
805            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
806            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
807            return $current_token;            $self->{t}->{type} = {
808            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
809            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
810                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
811                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
812              }->{$self->{t}->{type}};
813              $self->{t}->{value} .= chr $self->{c};
814              $self->{state} = URI_UNQUOTED_STATE;
815              $self->{c} = $self->{get_char}->();
816              redo A;
817          } else {          } else {
818            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
819            $current_token->{value} .= chr $self->{c};            $self->{t}->{value} .= chr $self->{c};
820            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
821            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
822            redo A;            redo A;
823          }          }
824        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
825          if ($q == 0) {          if ($q == 0) {
826            ## NOTE: In |escape| in ... in |ident|.            #
827            $self->{state} = BEFORE_TOKEN_STATE;          } elsif ($q == 1) {
828            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};            ## NOTE: In |escape| in |URI|.
829            return $current_token;            $self->{t}->{type} = {
830            # reconsume                URI_TOKEN, URI_INVALID_TOKEN,
831            #redo A;                URI_INVALID_TOKEN, URI_INVALID_TOKEN,
832                  URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
833                  URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
834              }->{$self->{t}->{type}};
835              $self->{t}->{value} .= "\x0D";
836              $self->{state} = ESCAPE_BEFORE_LF_STATE;
837              $self->{c} = $self->{get_char}->();
838              redo A;
839          } else {          } else {
840            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
841            $current_token->{value} .= "\x0D\x0A";            $self->{t}->{value} .= "\x0D";
842            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
843            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
844            redo A;            redo A;
845          }          }
846          } elsif ($self->{c} == -1) {
847            #
848        } else {        } else {
849          ## NOTE: second character of |escape|.          ## NOTE: second character of |escape|.
850          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
851          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
852                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
853            $self->{c} = $self->{get_char}->();
854            redo A;
855          }
856    
857          if ($q == 0) {
858            if ($self->{t}->{type} == DIMENSION_TOKEN) {
859              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
860                $self->{state} = BEFORE_TOKEN_STATE;
861                # reprocess
862                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
863                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};
864                $self->{t}->{type} = NUMBER_TOKEN;
865                $self->{t}->{value} = '';
866                return $self->{t};
867                #redo A;
868              } elsif (length $self->{t}->{value}) {
869                $self->{state} = BEFORE_TOKEN_STATE;
870                # reprocess
871                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
872                return $self->{t};
873                #redo A;
874              } else {
875                $self->{state} = BEFORE_TOKEN_STATE;
876                # reprocess
877                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
878                $self->{t}->{type} = NUMBER_TOKEN;
879                $self->{t}->{value} = '';
880                return $self->{t};
881                #redo A;
882              }
883            } else {
884              if ($self->{t}->{hyphen} and $self->{t}->{value} eq '-') {
885                $self->{state} = BEFORE_TOKEN_STATE;
886                # reprocess
887                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
888                return {type => DELIM_TOKEN, value => '-'};
889                #redo A;
890              } elsif (length $self->{t}->{value}) {
891                $self->{state} = BEFORE_TOKEN_STATE;
892                # reprocess
893                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
894                return $self->{t};
895                #redo A;
896              } else {
897                $self->{state} = BEFORE_TOKEN_STATE;
898                # reprocess
899                return {type => DELIM_TOKEN, value => '\\'};
900                #redo A;
901              }
902            }
903          } elsif ($q == 1) {
904            $self->{state} = URI_UNQUOTED_STATE;
905          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
906          redo A;          redo A;
907          } else {
908            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
909            $self->{t}->{type} = {
910              STRING_TOKEN, INVALID_TOKEN,
911              URI_TOKEN, URI_INVALID_TOKEN,
912              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
913            }->{$self->{t}->{type}} || $self->{t}->{type};
914            $self->{state} = BEFORE_TOKEN_STATE;
915            # reprocess
916            return $self->{t};
917            #redo A;
918        }        }
919      } elsif ($self->{state} == ESCAPE_STATE) {      } elsif ($self->{state} == ESCAPE_STATE) {
920        ## NOTE: third..seventh character of |unicode| in |escape|.        ## NOTE: third..seventh character of |unicode| in |escape|.
# Line 528  sub get_next_token ($) { Line 929  sub get_next_token ($) {
929          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
930          redo A;          redo A;
931        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f        } elsif (0x0061 <= $self->{c} and $self->{c} <= 0x0066) { # a..f
932          $char = $char * 0x10 + $self->{c} - 0x0061 - 0xA;          $char = $char * 0x10 + $self->{c} - 0x0061 + 0xA;
933          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;          $self->{state} = ++$i == 7 ? ESCAPE_BEFORE_NL_STATE : ESCAPE_STATE;
934          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
935          redo A;          redo A;
# Line 536  sub get_next_token ($) { Line 937  sub get_next_token ($) {
937                 $self->{c} == 0x000A or # \n                 $self->{c} == 0x000A or # \n
938                 $self->{c} == 0x0009 or # \t                 $self->{c} == 0x0009 or # \t
939                 $self->{c} == 0x000C) { # \f                 $self->{c} == 0x000C) { # \f
940          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
941          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
942                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
943          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
944          redo A;          redo A;
945        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 545  sub get_next_token ($) { Line 947  sub get_next_token ($) {
947          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
948          redo A;          redo A;
949        } else {        } else {
950          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
951          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
952                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
953          # reconsume          # reconsume
954          redo A;          redo A;
955        }        }
# Line 556  sub get_next_token ($) { Line 959  sub get_next_token ($) {
959            $self->{c} == 0x000A or # \n            $self->{c} == 0x000A or # \n
960            $self->{c} == 0x0009 or # \t            $self->{c} == 0x0009 or # \t
961            $self->{c} == 0x000C) { # \f            $self->{c} == 0x000C) { # \f
962          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
963          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
964                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
965          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
966          redo A;          redo A;
967        } elsif ($self->{c} == 0x000D) { # \r        } elsif ($self->{c} == 0x000D) { # \r
# Line 565  sub get_next_token ($) { Line 969  sub get_next_token ($) {
969          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
970          redo A;          redo A;
971        } else {        } else {
972          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $char;
973          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
974                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
975          # reconsume          # reconsume
976          redo A;          redo A;
977        }        }
978      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
979        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.
980        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
981          $current_token->{value} .= chr $char;          $self->{t}->{value} .= chr $self->{c};
982          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;          $self->{state} = $q == 0 ? NAME_STATE :
983                $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
984          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
985          redo A;          redo A;
986        } else {        } else {
987          $current_token->{value} .= chr $char;          $self->{state} = $q == 0 ? NAME_STATE :
988          $self->{state} = $q == 0 ? NAME_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
989          # reconsume          # reprocess
990          redo A;          redo A;
991        }        }
992      } elsif ($self->{state} == STRING_STATE) {      } elsif ($self->{state} == STRING_STATE) {
993        ## NOTE: A character in |string$Q| in |string| in |STRING|, or        ## NOTE: A character in |string$Q| in |string| in |STRING|, or
994        ## a character in |invalid$Q| in |invalid| in |INVALID|,        ## a character in |invalid$Q| in |invalid| in |INVALID|,
995        ## where |$Q = $q == 0x0022 ? 1 : 2|.        ## where |$Q = $q == 0x0022 ? 1 : 2|.
996          ## Or, in |URI|.
997        if ($self->{c} == 0x005C) { # \        if ($self->{c} == 0x005C) { # \
998          $self->{state} = ESCAPE_OPEN_STATE;          $self->{state} = ESCAPE_OPEN_STATE;
999          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1000          redo A;          redo A;
1001        } elsif ($self->{c} == $q) { # " | '        } elsif ($self->{c} == $q) { # " | '
1002          $self->{state} = BEFORE_TOKEN_STATE;          if ($self->{t}->{type} == STRING_TOKEN) {
1003          $self->{c} = $self->{get_char}->();            $self->{state} = BEFORE_TOKEN_STATE;
1004          return $current_token;            $self->{c} = $self->{get_char}->();
1005          #redo A;            return $self->{t};
1006              #redo A;
1007            } else {
1008              $self->{state} = URI_AFTER_WSP_STATE;
1009              $self->{c} = $self->{get_char}->();
1010              redo A;
1011            }
1012        } elsif ($self->{c} == 0x000A or # \n        } elsif ($self->{c} == 0x000A or # \n
1013                 $self->{c} == 0x000D or # \r                 $self->{c} == 0x000D or # \r
1014                 $self->{c} == 0x000C or # \f                 $self->{c} == 0x000C or # \f
1015                 $self->{c} == -1) {                 $self->{c} == -1) {
1016          $current_token->{type} = INVALID_TOKEN;          $self->{t}->{type} = {
1017              STRING_TOKEN, INVALID_TOKEN,
1018              INVALID_TOKEN, INVALID_TOKEN,
1019              URI_TOKEN, URI_INVALID_TOKEN,
1020              URI_INVALID_TOKEN, URI_INVALID_TOKEN,
1021              URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
1022              URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
1023            }->{$self->{t}->{type}};
1024          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1025          # reconsume          # reconsume
1026          return $current_token;          return $self->{t};
1027          #redo A;          #redo A;
1028        } else {        } else {
1029          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1030          # stay in the state          # stay in the state
1031          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1032          redo A;          redo A;
# Line 614  sub get_next_token ($) { Line 1034  sub get_next_token ($) {
1034      } elsif ($self->{state} == NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_STATE) {
1035        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.        ## NOTE: 2nd, 3rd, or ... character in |num| before |.|.
1036        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1037          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1038          # stay in the state          # stay in the state
1039          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1040          redo A;          redo A;
# Line 623  sub get_next_token ($) { Line 1043  sub get_next_token ($) {
1043          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1044          redo A;          redo A;
1045        } else {        } else {
1046          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1047          $current_token->{value} = '';          $self->{t}->{value} = '';
1048          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1049          # reprocess          # reprocess
1050          redo A;          redo A;
# Line 632  sub get_next_token ($) { Line 1052  sub get_next_token ($) {
1052      } elsif ($self->{state} == NUMBER_DOT_STATE) {      } elsif ($self->{state} == NUMBER_DOT_STATE) {
1053        ## NOTE: The character immediately following |.| in |num|.        ## NOTE: The character immediately following |.| in |num|.
1054        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1055          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1056          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1057          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1058          redo A;          redo A;
1059        } else {        } else {
1060          unshift @{$self->{token}}, {type => DELIM_STATE, value => '.'};          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};
1061          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1062          $current_token->{value} = '';          $self->{t}->{value} = '';
1063          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1064          # reprocess          # reprocess
1065          return $current_token;          return $self->{t};
1066          #redo A;          #redo A;
1067        }        }
1068      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {      } elsif ($self->{state} == NUMBER_FRACTION_STATE) {
1069        ## NOTE: The character immediately following |.| at the beginning of |num|.        ## NOTE: The character immediately following |.| at the beginning of |num|.
1070        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1071          $current_token->{value} .= '.' . chr $self->{c};          $self->{t}->{value} .= '.' . chr $self->{c};
1072          $self->{state} = NUMBER_DOT_NUMBER_STATE;          $self->{state} = NUMBER_DOT_NUMBER_STATE;
1073          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1074          redo A;          redo A;
1075        } else {        } else {
1076          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1077          $self->{c} = $self->{get_char}->();          # reprocess
1078          return {type => DELIM_TOKEN, value => '.'};          return {type => DELIM_TOKEN, value => '.'};
1079          #redo A;          #redo A;
1080        }        }
1081      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
1082        ## NOTE: |[0-9]| in |num| after |.|.        ## NOTE: |[0-9]| in |num| after |.|.
1083        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {        if (0x0030 <= $self->{c} and $self->{c} <= 0x0039) {
1084          $current_token->{value} .= chr $self->{c};          $self->{t}->{value} .= chr $self->{c};
1085          # stay in the state          # stay in the state
1086          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1087          redo A;          redo A;
1088        } else {        } else {
1089          $current_token->{number} = $current_token->{value};          $self->{t}->{number} = $self->{t}->{value};
1090          $current_token->{value} = '';          $self->{t}->{value} = '';
1091          $self->{state} = AFTER_NUMBER_STATE;          $self->{state} = AFTER_NUMBER_STATE;
1092          # reprocess          # reprocess
1093          redo A;          redo A;
# Line 676  sub get_next_token ($) { Line 1096  sub get_next_token ($) {
1096        die "$0: Unknown state |$self->{state}|";        die "$0: Unknown state |$self->{state}|";
1097      }      }
1098    } # A    } # A
   
   ## TODO: |URI|, |UNICODE-RANGE|, |COMMENT|  
   
1099  } # get_next_token  } # get_next_token
1100    
1101  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.12

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24