/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory | Revision Log | View Patch

revision 1.1 by wakaba, Tue Oct 14 02:27:58 2008 UTC | revision 1.4 by wakaba, Tue Oct 14 11:46:57 2008 UTC
# Line 2  package Whatpm::HTML::Tokenizer; Line 2  package Whatpm::HTML::Tokenizer;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5    BEGIN {
6      require Exporter;
7      push our @ISA, 'Exporter';
8    
9      our @EXPORT_OK = qw(
10        DOCTYPE_TOKEN
11        COMMENT_TOKEN
12        START_TAG_TOKEN
13        END_TAG_TOKEN
14        END_OF_FILE_TOKEN
15        CHARACTER_TOKEN
16        PI_TOKEN
17        ABORT_TOKEN
18      );
19      
20      our %EXPORT_TAGS = (
21        token => [qw(
22          DOCTYPE_TOKEN
23          COMMENT_TOKEN
24          START_TAG_TOKEN
25          END_TAG_TOKEN
26          END_OF_FILE_TOKEN
27          CHARACTER_TOKEN
28          PI_TOKEN
29          ABORT_TOKEN
30        )],
31      );
32    }
33    
34    ## Token types
35    
36    sub DOCTYPE_TOKEN () { 1 }
37    sub COMMENT_TOKEN () { 2 }
38    sub START_TAG_TOKEN () { 3 }
39    sub END_TAG_TOKEN () { 4 }
40    sub END_OF_FILE_TOKEN () { 5 }
41    sub CHARACTER_TOKEN () { 6 }
42    sub PI_TOKEN () { 7 } # XML5
43    sub ABORT_TOKEN () { 8 } # Not a token actually
44    
45  package Whatpm::HTML;  package Whatpm::HTML;
46    
47    BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49  ## Content model flags  ## Content model flags
50    
51  sub CM_ENTITY () { 0b001 } # & markup in data  sub CM_ENTITY () { 0b001 } # & markup in data
# Line 72  sub HEXREF_HEX_STATE () { 48 } Line 114  sub HEXREF_HEX_STATE () { 48 }
114  sub ENTITY_NAME_STATE () { 49 }  sub ENTITY_NAME_STATE () { 49 }
115  sub PCDATA_STATE () { 50 } # "data state" in the spec  sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
 ## Token types  
   
 sub DOCTYPE_TOKEN () { 1 }  
 sub COMMENT_TOKEN () { 2 }  
 sub START_TAG_TOKEN () { 3 }  
 sub END_TAG_TOKEN () { 4 }  
 sub END_OF_FILE_TOKEN () { 5 }  
 sub CHARACTER_TOKEN () { 6 }  
   
117  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
118  ## list and descriptions)  ## list and descriptions)
119    
# Line 142  sub _initialize_tokenizer ($) { Line 175  sub _initialize_tokenizer ($) {
175    #$self->{level}    #$self->{level}
176    #$self->{set_nc}    #$self->{set_nc}
177    #$self->{parse_error}    #$self->{parse_error}
178      #$self->{is_xml} (if XML)
179    
180    $self->{state} = DATA_STATE; # MUST    $self->{state} = DATA_STATE; # MUST
181    #$self->{s_kwd}; # state keyword - initialized when used    #$self->{s_kwd}; # state keyword - initialized when used
# Line 407  sub _get_next_token ($) { Line 441  sub _get_next_token ($) {
441            !!!cp (19);            !!!cp (19);
442            $self->{ct}            $self->{ct}
443              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
444                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
445                 line => $self->{line_prev},                 line => $self->{line_prev},
446                 column => $self->{column_prev}};                 column => $self->{column_prev}};
447            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
# Line 496  sub _get_next_token ($) { Line 530  sub _get_next_token ($) {
530          !!!cp (29);          !!!cp (29);
531          $self->{ct}          $self->{ct}
532              = {type => END_TAG_TOKEN,              = {type => END_TAG_TOKEN,
533                 tag_name => chr ($self->{nc} + 0x0020),                 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
534                 line => $l, column => $c};                 line => $l, column => $c};
535          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
536          !!!next-input-character;          !!!next-input-character;
# Line 627  sub _get_next_token ($) { Line 661  sub _get_next_token ($) {
661        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
662                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
663          !!!cp (38);          !!!cp (38);
664          $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);          $self->{ct}->{tag_name}
665                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
666            # start tag or end tag            # start tag or end tag
667          ## Stay in this state          ## Stay in this state
668          !!!next-input-character;          !!!next-input-character;
# Line 699  sub _get_next_token ($) { Line 734  sub _get_next_token ($) {
734                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
735          !!!cp (49);          !!!cp (49);
736          $self->{ca}          $self->{ca}
737              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
738                 value => '',                 value => '',
739                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
740          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 800  sub _get_next_token ($) { Line 835  sub _get_next_token ($) {
835        } elsif (0x0041 <= $self->{nc} and        } elsif (0x0041 <= $self->{nc} and
836                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
837          !!!cp (63);          !!!cp (63);
838          $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);          $self->{ca}->{name}
839                .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
840          ## Stay in the state          ## Stay in the state
841          !!!next-input-character;          !!!next-input-character;
842          redo A;          redo A;
# Line 884  sub _get_next_token ($) { Line 920  sub _get_next_token ($) {
920                 $self->{nc} <= 0x005A) { # A..Z                 $self->{nc} <= 0x005A) { # A..Z
921          !!!cp (76);          !!!cp (76);
922          $self->{ca}          $self->{ca}
923              = {name => chr ($self->{nc} + 0x0020),              = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
924                 value => '',                 value => '',
925                 line => $self->{line}, column => $self->{column}};                 line => $self->{line}, column => $self->{column}};
926          $self->{state} = ATTRIBUTE_NAME_STATE;          $self->{state} = ATTRIBUTE_NAME_STATE;
# Line 1357  sub _get_next_token ($) { Line 1393  sub _get_next_token ($) {
1393          $self->{s_kwd} = chr $self->{nc};          $self->{s_kwd} = chr $self->{nc};
1394          !!!next-input-character;          !!!next-input-character;
1395          redo A;          redo A;
1396        } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and        } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1397                 $self->{open_elements}->[-1]->[1] & FOREIGN_EL and                   $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1398                    $self->{is_xml}) and
1399                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
1400          !!!cp (135.4);                          !!!cp (135.4);                
1401          $self->{state} = MD_CDATA_STATE;          $self->{state} = MD_CDATA_STATE;

Legend:
Removed from v.1.1  
Changed lines
  Added in v.1.4

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24