/[suikacvs]/markup/html/whatpm/Whatpm/CSS/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/CSS/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.11 by wakaba, Sat Sep 8 15:20:41 2007 UTC revision 1.17 by wakaba, Sun Jan 20 04:02:25 2008 UTC
# Line 1  Line 1 
1  package Whatpm::CSS::Tokenizer;  package Whatpm::CSS::Tokenizer;
2  use strict;  use strict;
3    our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    require Exporter;
6    push our @ISA, 'Exporter';
7    
8  sub BEFORE_TOKEN_STATE () { 0 }  sub BEFORE_TOKEN_STATE () { 0 }
9  sub BEFORE_NMSTART_STATE () { 1 }  sub BEFORE_NMSTART_STATE () { 1 }
# Line 59  sub CDC_TOKEN () { 35 } Line 63  sub CDC_TOKEN () { 35 }
63  sub COMMENT_TOKEN () { 36 }  sub COMMENT_TOKEN () { 36 }
64  sub COMMENT_INVALID_TOKEN () { 37 }  sub COMMENT_INVALID_TOKEN () { 37 }
65  sub EOF_TOKEN () { 38 }  sub EOF_TOKEN () { 38 }
66    sub MINUS_TOKEN () { 39 }
67    sub STAR_TOKEN () { 40 }
68    sub VBAR_TOKEN () { 41 }
69    sub DOT_TOKEN () { 42 }
70    sub COLON_TOKEN () { 43 }
71    sub MATCH_TOKEN () { 44 }
72    sub EXCLAMATION_TOKEN () { 45 }
73    
74  our @TokenName = qw(  our @TokenName = qw(
75    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID    0 IDENT ATKEYWORD HASH FUNCTION URI URI_INVALID URI_PREFIX URI_PREFIX_INVALID
# Line 66  our @TokenName = qw( Line 77  our @TokenName = qw(
77    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH    0 DELIM PLUS GREATER COMMA TILDE DASHMATCH
78    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON    PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH INCLUDES SEMICOLON
79    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT    LBRACE RBRACE LPAREN RPAREN LBRACKET RBRACKET S CDO CDC COMMENT
80    COMMENT_INVALID EOF    COMMENT_INVALID EOF MINUS STAR VBAR DOT COLON MATCH EXCLAMATION
81    );
82    
83    our @EXPORT_OK = qw(
84      IDENT_TOKEN ATKEYWORD_TOKEN HASH_TOKEN FUNCTION_TOKEN URI_TOKEN
85      URI_INVALID_TOKEN URI_PREFIX_TOKEN URI_PREFIX_INVALID_TOKEN
86      STRING_TOKEN INVALID_TOKEN NUMBER_TOKEN DIMENSION_TOKEN PERCENTAGE_TOKEN
87      UNICODE_RANGE_TOKEN DELIM_TOKEN PLUS_TOKEN GREATER_TOKEN COMMA_TOKEN
88      TILDE_TOKEN DASHMATCH_TOKEN PREFIXMATCH_TOKEN SUFFIXMATCH_TOKEN
89      SUBSTRINGMATCH_TOKEN INCLUDES_TOKEN SEMICOLON_TOKEN LBRACE_TOKEN
90      RBRACE_TOKEN LPAREN_TOKEN RPAREN_TOKEN LBRACKET_TOKEN RBRACKET_TOKEN
91      S_TOKEN CDO_TOKEN CDC_TOKEN COMMENT_TOKEN COMMENT_INVALID_TOKEN EOF_TOKEN
92      MINUS_TOKEN STAR_TOKEN VBAR_TOKEN DOT_TOKEN COLON_TOKEN MATCH_TOKEN
93      EXCLAMATION_TOKEN
94  );  );
95    
96    our %EXPORT_TAGS = ('token' => [@EXPORT_OK]);
97    
98  sub new ($) {  sub new ($) {
99    my $self = bless {token => [], get_char => sub { -1 },    my $self = bless {token => [], get_char => sub { -1 }}, shift;
                     onerror => sub { }}, shift;  
100    return $self;    return $self;
101  } # new  } # new
102    
# Line 115  sub get_next_token ($) { Line 140  sub get_next_token ($) {
140                (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F                (0x0041 <= $self->{c} and $self->{c} <= 0x0046) or # A..F
141                (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f                (0x0061 <= $self->{c} and $self->{c} <= 0x0066) or # a..f
142                $self->{c} == 0x003F) { # ?                $self->{c} == 0x003F) { # ?
143              $self->{t}->{value} .= '+' . chr $self->{c};              $self->{t}->{value} = chr $self->{c};
144              $self->{t}->{type} = UNICODE_RANGE_TOKEN;              $self->{t}->{type} = UNICODE_RANGE_TOKEN;
145              $self->{c} = $self->{get_char}->();              $self->{c} = $self->{get_char}->();
146              C: for (2..6) {              C: for (2..6) {
# Line 267  sub get_next_token ($) { Line 292  sub get_next_token ($) {
292                return {type => CDO_TOKEN};                return {type => CDO_TOKEN};
293                #redo A;                #redo A;
294              } else {              } else {
295                unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};                unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
296                ## NOTE: |-| in |ident| in |IDENT|                ## NOTE: |-| in |ident| in |IDENT|
297                $self->{t} = {type => IDENT_TOKEN, value => '-'};                $self->{t} = {type => IDENT_TOKEN, value => '-'};
298                $self->{state} = BEFORE_NMSTART_STATE;                $self->{state} = BEFORE_NMSTART_STATE;
# Line 276  sub get_next_token ($) { Line 301  sub get_next_token ($) {
301                #redo A;                #redo A;
302              }              }
303            } else {            } else {
304              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '!'};              unshift @{$self->{token}}, {type => EXCLAMATION_TOKEN};
305              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
306              #reprocess              #reprocess
307              return {type => DELIM_TOKEN, value => '<'};              return {type => DELIM_TOKEN, value => '<'};
# Line 289  sub get_next_token ($) { Line 314  sub get_next_token ($) {
314            #redo A;            #redo A;
315          }          }
316        } elsif (my $t = {        } elsif (my $t = {
317                  0x003B => SEMICOLON_TOKEN, # ;                          0x0021 => EXCLAMATION_TOKEN, # !
318                  0x007B => LBRACE_TOKEN, # {                          0x002D => MINUS_TOKEN, # -
319                  0x007D => RBRACE_TOKEN, # }                          0x002E => DOT_TOKEN, # .
320                  0x0028 => LPAREN_TOKEN, # (                          0x003A => COLON_TOKEN, # :
321                  0x0029 => RPAREN_TOKEN, # )                          0x003B => SEMICOLON_TOKEN, # ;
322                  0x005B => LBRACKET_TOKEN, # [                          0x003D => MATCH_TOKEN, # =
323                  0x005D => RBRACKET_TOKEN, # ]                          0x007B => LBRACE_TOKEN, # {
324                            0x007D => RBRACE_TOKEN, # }
325                            0x0028 => LPAREN_TOKEN, # (
326                            0x0029 => RPAREN_TOKEN, # )
327                            0x005B => LBRACKET_TOKEN, # [
328                            0x005D => RBRACKET_TOKEN, # ]
329                 }->{$self->{c}}) {                 }->{$self->{c}}) {
330            my ($l, $c) = ($self->{line}, $self->{column});
331          # stay in the state          # stay in the state
332          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->($self);
333          return {type => $t};          return {type => $t, line => $l, column => $c};
334          # redo A;          # redo A;
335        } elsif ({        } elsif ({
336                  0x0020 => 1, # SP                  0x0020 => 1, # SP
# Line 348  sub get_next_token ($) { Line 379  sub get_next_token ($) {
379            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
380            return {type => $v};            return {type => $v};
381            #redo A;            #redo A;
382            } elsif ($v = {
383                           0x002A => STAR_TOKEN, # *
384                           0x007C => VBAR_TOKEN, # |
385                          }->{$c}) {
386              # stay in the state.
387              # reprocess
388              return {type => $v};
389              #redo A;
390          } else {          } else {
391            # stay in the state            # stay in the state
392            # reprocess            # reprocess
# Line 424  sub get_next_token ($) { Line 463  sub get_next_token ($) {
463              #$self->{t} = {type => IDENT_TOKEN, value => '-'};              #$self->{t} = {type => IDENT_TOKEN, value => '-'};
464              # stay in the state              # stay in the state
465              # reconsume              # reconsume
466              return {type => DELIM_TOKEN, value => '-'};              return {type => MINUS_TOKEN};
467              #redo A;              #redo A;
468            }            }
469          } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {          } elsif ($self->{t}->{type} == DIMENSION_TOKEN) {
# Line 443  sub get_next_token ($) { Line 482  sub get_next_token ($) {
482              $t->{type} = NUMBER_TOKEN;              $t->{type} = NUMBER_TOKEN;
483              $t->{value} = '';              $t->{value} = '';
484              $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};              $self->{t} = {type => IDENT_TOKEN, value => '-', hyphen => 1};
485              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};              unshift @{$self->{token}}, {type => MINUS_TOKEN};
486              # stay in the state              # stay in the state
487              # reconsume              # reconsume
488              return $t;              return $t;
# Line 458  sub get_next_token ($) { Line 497  sub get_next_token ($) {
497                
498        if ($self->{t}->{type} == DIMENSION_TOKEN) {        if ($self->{t}->{type} == DIMENSION_TOKEN) {
499          ## NOTE: |-| after |NUMBER|.          ## NOTE: |-| after |NUMBER|.
500          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          unshift @{$self->{token}}, {type => MINUS_TOKEN};
501          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
502          # reprocess          # reprocess
503          $self->{t}->{type} = NUMBER_TOKEN;          $self->{t}->{type} = NUMBER_TOKEN;
# Line 468  sub get_next_token ($) { Line 507  sub get_next_token ($) {
507          ## NOTE: |-| not followed by |nmstart|.          ## NOTE: |-| not followed by |nmstart|.
508          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
509          # reprocess          # reprocess
510          return {type => DELIM_TOKEN, value => '-'};          return {type => MINUS_TOKEN};
511        }        }
512      } elsif ($self->{state} == AFTER_AT_STATE) {      } elsif ($self->{state} == AFTER_AT_STATE) {
513        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z        if ((0x0041 <= $self->{c} and $self->{c} <= 0x005A) or # A..Z
# Line 511  sub get_next_token ($) { Line 550  sub get_next_token ($) {
550            return {type => DELIM_TOKEN, value => '@'};            return {type => DELIM_TOKEN, value => '@'};
551            #redo A;            #redo A;
552          } else {          } else {
553            unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};            unshift @{$self->{token}}, {type => MINUS_TOKEN};
554            $self->{t} = {type => IDENT_TOKEN, value => '-'};            $self->{t} = {type => IDENT_TOKEN, value => '-'};
555            $self->{state} = BEFORE_NMSTART_STATE;            $self->{state} = BEFORE_NMSTART_STATE;
556            # reprocess            # reprocess
# Line 524  sub get_next_token ($) { Line 563  sub get_next_token ($) {
563          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
564          redo A;          redo A;
565        } else {        } else {
566          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};          unshift @{$self->{token}}, {type => MINUS_TOKEN};
567          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
568          # reprocess          # reprocess
569          return {type => DELIM_TOKEN, value => '@'};          return {type => DELIM_TOKEN, value => '@'};
# Line 816  sub get_next_token ($) { Line 855  sub get_next_token ($) {
855            redo A;            redo A;
856          } else {          } else {
857            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $self->{t}->{value} .= chr $self->{c};  
858            $self->{state} = STRING_STATE;            $self->{state} = STRING_STATE;
859            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
860            redo A;            redo A;
# Line 832  sub get_next_token ($) { Line 870  sub get_next_token ($) {
870                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_TOKEN, URI_PREFIX_INVALID_TOKEN,
871                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,                URI_PREFIX_INVALID_TOKEN, URI_PREFIX_INVALID_TOKEN,
872            }->{$self->{t}->{type}};            }->{$self->{t}->{type}};
           $self->{t}->{value} .= "\x0D";  
873            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
874            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
875            redo A;            redo A;
876          } else {          } else {
877            ## Note: In |nl| in ... in |string| or |ident|.            ## Note: In |nl| in ... in |string| or |ident|.
           $self->{t}->{value} .= "\x0D";  
878            $self->{state} = ESCAPE_BEFORE_LF_STATE;            $self->{state} = ESCAPE_BEFORE_LF_STATE;
879            $self->{c} = $self->{get_char}->();            $self->{c} = $self->{get_char}->();
880            redo A;            redo A;
# Line 860  sub get_next_token ($) { Line 896  sub get_next_token ($) {
896              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
897              # reprocess              # reprocess
898              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
899              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '-'};              unshift @{$self->{token}}, {type => MINUS_TOKEN};
900              $self->{t}->{type} = NUMBER_TOKEN;              $self->{t}->{type} = NUMBER_TOKEN;
901              $self->{t}->{value} = '';              $self->{t}->{value} = '';
902              return $self->{t};              return $self->{t};
# Line 885  sub get_next_token ($) { Line 921  sub get_next_token ($) {
921              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
922              # reprocess              # reprocess
923              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};              unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '\\'};
924              return {type => DELIM_TOKEN, value => '-'};              return {type => MINUS_TOKEN};
925              #redo A;              #redo A;
926            } elsif (length $self->{t}->{value}) {            } elsif (length $self->{t}->{value}) {
927              $self->{state} = BEFORE_TOKEN_STATE;              $self->{state} = BEFORE_TOKEN_STATE;
# Line 976  sub get_next_token ($) { Line 1012  sub get_next_token ($) {
1012          redo A;          redo A;
1013        }        }
1014      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {      } elsif ($self->{state} == ESCAPE_BEFORE_LF_STATE) {
1015        ## NOTE: |\n| in |\r\n| in |unicode| in |escape|.        ## NOTE: |\n| in |\r\n| in |nl| in |escape|.
1016        if ($self->{c} == 0x000A) { # \n        if ($self->{c} == 0x000A) { # \n
         $self->{t}->{value} .= chr $self->{c};  
1017          $self->{state} = $q == 0 ? NAME_STATE :          $self->{state} = $q == 0 ? NAME_STATE :
1018              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;              $q == 1 ? URI_UNQUOTED_STATE : STRING_STATE;
1019          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
# Line 1057  sub get_next_token ($) { Line 1092  sub get_next_token ($) {
1092          $self->{c} = $self->{get_char}->();          $self->{c} = $self->{get_char}->();
1093          redo A;          redo A;
1094        } else {        } else {
1095          unshift @{$self->{token}}, {type => DELIM_TOKEN, value => '.'};          unshift @{$self->{token}}, {type => DOT_TOKEN};
1096          $self->{t}->{number} = $self->{t}->{value};          $self->{t}->{number} = $self->{t}->{value};
1097          $self->{t}->{value} = '';          $self->{t}->{value} = '';
1098          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
# Line 1075  sub get_next_token ($) { Line 1110  sub get_next_token ($) {
1110        } else {        } else {
1111          $self->{state} = BEFORE_TOKEN_STATE;          $self->{state} = BEFORE_TOKEN_STATE;
1112          # reprocess          # reprocess
1113          return {type => DELIM_TOKEN, value => '.'};          return {type => DOT_TOKEN};
1114          #redo A;          #redo A;
1115        }        }
1116      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {      } elsif ($self->{state} == NUMBER_DOT_NUMBER_STATE) {
# Line 1098  sub get_next_token ($) { Line 1133  sub get_next_token ($) {
1133    } # A    } # A
1134  } # get_next_token  } # get_next_token
1135    
1136    sub serialize_token ($$) {
1137      shift;
1138      my $t = shift;
1139    
1140      ## NOTE: This function is not intended for roundtrip-able serialization.
1141    
1142      if ($t->{type} == IDENT_TOKEN) {
1143        return $t->{value};
1144      } elsif ($t->{type} == ATKEYWORD_TOKEN) {
1145        return '@' . $t->{value};
1146      } elsif ($t->{type} == HASH_TOKEN) {
1147        return '#' . $t->{value};
1148      } elsif ($t->{type} == FUNCTION_TOKEN) {
1149        return $t->{value} . '(';
1150      } elsif ($t->{type} == URI_TOKEN) {
1151        return 'url(' . $t->{value} . ')';
1152      } elsif ($t->{type} == URI_INVALID_TOKEN) {
1153        return 'url(' . $t->{value};
1154      } elsif ($t->{type} == URI_PREFIX_TOKEN) {
1155        return 'url-prefix(' . $t->{value} . ')';
1156      } elsif ($t->{type} == URI_PREFIX_INVALID_TOKEN) {
1157        return 'url-prefix(' . $t->{value};
1158      } elsif ($t->{type} == STRING_TOKEN) {
1159        return '"' . $t->{value} . '"';
1160      } elsif ($t->{type} == INVALID_TOKEN) {
1161        return '"' . $t->{value};
1162      } elsif ($t->{type} == NUMBER_TOKEN) {
1163        return $t->{number};
1164      } elsif ($t->{type} == DIMENSION_TOKEN) {
1165        return $t->{number} . $t->{value};
1166      } elsif ($t->{type} == PERCENTAGE_TOKEN) {
1167        return $t->{number} . '%';
1168      } elsif ($t->{type} == UNICODE_RANGE_TOKEN) {
1169        return 'U+' . $t->{value};
1170      } elsif ($t->{type} == DELIM_TOKEN) {
1171        return $t->{value};
1172      } elsif ($t->{type} == PLUS_TOKEN) {
1173        return '+';
1174      } elsif ($t->{type} == GREATER_TOKEN) {
1175        return '>';
1176      } elsif ($t->{type} == COMMA_TOKEN) {
1177        return ',';
1178      } elsif ($t->{type} == TILDE_TOKEN) {
1179        return '~';
1180      } elsif ($t->{type} == DASHMATCH_TOKEN) {
1181        return '|=';
1182      } elsif ($t->{type} == PREFIXMATCH_TOKEN) {
1183        return '^=';
1184      } elsif ($t->{type} == SUFFIXMATCH_TOKEN) {
1185        return '$=';
1186      } elsif ($t->{type} == SUBSTRINGMATCH_TOKEN) {
1187        return '*=';
1188      } elsif ($t->{type} == INCLUDES_TOKEN) {
1189        return '~=';
1190      } elsif ($t->{type} == SEMICOLON_TOKEN) {
1191        return ';';
1192      } elsif ($t->{type} == LBRACE_TOKEN) {
1193        return '{';
1194      } elsif ($t->{type} == RBRACE_TOKEN) {
1195        return '}';
1196      } elsif ($t->{type} == LPAREN_TOKEN) {
1197        return '(';
1198      } elsif ($t->{type} == RPAREN_TOKEN) {
1199        return ')';
1200      } elsif ($t->{type} == LBRACKET_TOKEN) {
1201        return '[';
1202      } elsif ($t->{type} == RBRACKET_TOKEN) {
1203        return ']';
1204      } elsif ($t->{type} == S_TOKEN) {
1205        return ' ';
1206      } elsif ($t->{type} == CDO_TOKEN) {
1207        return '<!--';
1208      } elsif ($t->{type} == CDC_TOKEN) {
1209        return '-->';
1210      } elsif ($t->{type} == COMMENT_TOKEN) {
1211        return '/**/';
1212      } elsif ($t->{type} == COMMENT_INVALID_TOKEN) {
1213        return '/*';
1214      } elsif ($t->{type} == EOF_TOKEN) {
1215        return '{EOF}';
1216      } elsif ($t->{type} == MINUS_TOKEN) {
1217        return '-';
1218      } elsif ($t->{type} == STAR_TOKEN) {
1219        return '*';
1220      } elsif ($t->{type} == VBAR_TOKEN) {
1221        return '|';
1222      } elsif ($t->{type} == COLON_TOKEN) {
1223        return ':';
1224      } elsif ($t->{type} == MATCH_TOKEN) {
1225        return '=';
1226      } elsif ($t->{type} == EXCLAMATION_TOKEN) {
1227        return '!';
1228      } else {
1229        return '{'.$t->{type}.'}';
1230      }
1231    } # serialize_token
1232    
1233    =head1 LICENSE
1234    
1235    Copyright 2007 Wakaba <w@suika.fam.cx>
1236    
1237    This library is free software; you can redistribute it
1238    and/or modify it under the same terms as Perl itself.
1239    
1240    =cut
1241    
1242  1;  1;
1243  # $Date$  # $Date$

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.17

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24