/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC | revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18      );
19      
20      our %EXPORT_TAGS = (
21        token => [qw(
22          DOCTYPE_TOKEN
23          COMMENT_TOKEN
24          START_TAG_TOKEN
25          END_TAG_TOKEN
26          END_OF_FILE_TOKEN
27          CHARACTER_TOKEN
28          PI_TOKEN
29          ABORT_TOKEN
30        )],
31      );
32    }
33    
34    ## Token types
35    
36    sub DOCTYPE_TOKEN () { 1 }
37    sub COMMENT_TOKEN () { 2 }
38    sub START_TAG_TOKEN () { 3 }
39    sub END_TAG_TOKEN () { 4 }
40    sub END_OF_FILE_TOKEN () { 5 }
41    sub CHARACTER_TOKEN () { 6 }
42    sub PI_TOKEN () { 7 } # XML5
43    sub ABORT_TOKEN () { 8 } # Not a token actually
44    
45  package Whatpm::HTML;  package Whatpm::HTML;
46    
47    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49  ## Content model flags  ## Content model flags
50    
51  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
 ## Token types  
   
 sub DOCTYPE_TOKEN () { 1 }  
 sub COMMENT_TOKEN () { 2 }  
 sub START_TAG_TOKEN () { 3 }  
 sub END_TAG_TOKEN () { 4 }  
 sub END_OF_FILE_TOKEN () { 5 }  
 sub CHARACTER_TOKEN () { 6 }  
   
117  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
118  ## list and descriptions)  ## list and descriptions)
119    
# Line 142  sub _initialize_tokenizer ($) { Line 175  sub _initialize_tokenizer ($) {
175    #$self->{level}    #$self->{level}
176    #$self->{set_nc}    #$self->{set_nc}
177    #$self->{parse_error}    #$self->{parse_error}
178      #$self->{is_xml} (if XML)
179    
180    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
181    #$self->{s_kwd}; # state keyword - initialized when used    #$self->{s_kwd}; # state keyword - initialized when used
# Line 507  sub _get_next_token ($) { Line 541  sub _get_next_token ($) {
541                        
542            $self->{ct}            $self->{ct}
543              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
544                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
545                 line => $self->{line_prev},                 line => $self->{line_prev},
546                 column => $self->{column_prev}};                 column => $self->{column_prev}};
547            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 626  sub _get_next_token ($) { Line 660  sub _get_next_token ($) {
660                    
661          $self->{ct}          $self->{ct}
662              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
663                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
664                 line => $l, column => $c};                 line => $l, column => $c};
665          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
666                    
# Line 817  sub _get_next_token ($) { Line 851  sub _get_next_token ($) {
851        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
852                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
853                    
854          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
855                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
856            # start tag or end tag            # start tag or end tag
857          ## Stay in this state          ## Stay in this state
858                    
# Line 939  sub _get_next_token ($) { Line 974  sub _get_next_token ($) {
974                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
975                    
976          $self->{ca}          $self->{ca}
977              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
978                 value => '',                 value => '',
979                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
980          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1100  sub _get_next_token ($) { Line 1135  sub _get_next_token ($) {
1135        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
1136                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1137                    
1138          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
1139                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1140          ## Stay in the state          ## Stay in the state
1141                    
1142      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 1244  sub _get_next_token ($) { Line 1280  sub _get_next_token ($) {
1280                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
1281                    
1282          $self->{ca}          $self->{ca}
1283              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1284                 value => '',                 value => '',
1285                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
1286          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1977  sub _get_next_token ($) { Line 2013  sub _get_next_token ($) {
2013      }      }
2014        
2015          redo A;          redo A;
2016        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2017                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2018                    $self->{is_xml}) and
2019                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
2020                                                    
2021          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.4

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24