/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.181 by wakaba, Mon Sep 15 02:54:12 2008 UTC revision 1.182 by wakaba, Mon Sep 15 07:19:03 2008 UTC
# Line 3  use strict; Line 3  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4  use Error qw(:try);  use Error qw(:try);
5    
6    ## NOTE: This module don't check all HTML5 parse errors; character
7    ## encoding related parse errors are expected to be handled by relevant
8    ## modules.
9    ## Parse errors for control characters that are not allowed in HTML5
10    ## documents, for surrogate code points, and for noncharacter code
11    ## points, as well as U+FFFD substitions for characters whose code points
12    ## is higher than U+10FFFF may be detected by combining the parser with
13    ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its
14    ## usage example, see |t/HTML-tree.t| in the Whatpm package or the
15    ## WebHACC::Language::HTML module in the WebHACC package).
16    
17  ## ISSUE:  ## ISSUE:
18  ## var doc = implementation.createDocument (null, null, null);  ## var doc = implementation.createDocument (null, null, null);
19  ## doc.write ('');  ## doc.write ('');
# Line 571  sub parse_byte_stream ($$$$;$$) { Line 582  sub parse_byte_stream ($$$$;$$) {
582    my $wrapped_char_stream = $get_wrapper->($char_stream);    my $wrapped_char_stream = $get_wrapper->($char_stream);
583    $wrapped_char_stream->onerror ($char_onerror);    $wrapped_char_stream->onerror ($char_onerror);
584    
585    my @args = @_; shift @args; # $s    my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
586    my $return;    my $return;
587    try {    try {
588      $return = $self->parse_char_stream ($wrapped_char_stream, @args);        $return = $self->parse_char_stream ($wrapped_char_stream, @args);  
# Line 621  sub parse_char_string ($$$;$$) { Line 632  sub parse_char_string ($$$;$$) {
632    my $s = ref $_[0] ? $_[0] : \($_[0]);    my $s = ref $_[0] ? $_[0] : \($_[0]);
633    require Whatpm::Charset::DecodeHandle;    require Whatpm::Charset::DecodeHandle;
634    my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);    my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
   if ($_[3]) {  
     $input = $_[3]->($input);  
   }  
635    return $self->parse_char_stream ($input, @_[1..$#_]);    return $self->parse_char_stream ($input, @_[1..$#_]);
636  } # parse_char_string  } # parse_char_string
637  *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.  *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.
638    
639  my $disallowed_control_chars =  sub parse_char_stream ($$$;$$) {
 {  
  0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,  
  0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,  
  0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,  
  0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,  
  0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,  
  0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,  
  0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,  
  0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,  
  0x10FFFE => 1, 0x10FFFF => 1,  
 };  
 $disallowed_control_chars->{$_} = 1  
     for 0x0001 .. 0x0008, 0x000E .. 0x001F, 0x007F .. 0x009F,  
         0xD800 .. 0xDFFF, 0xFDD0 .. 0xFDDF;  
 ## ISSUE: U+FDE0-U+FDEF are not excluded  
   
 sub parse_char_stream ($$$;$) {  
640    my $self = ref $_[0] ? shift : shift->new;    my $self = ref $_[0] ? shift : shift->new;
641    my $input = $_[0];    my $input = $_[0];
642    $self->{document} = $_[1];    $self->{document} = $_[1];
# Line 658  sub parse_char_stream ($$$;$) { Line 649  sub parse_char_stream ($$$;$) {
649        if defined $self->{input_encoding};        if defined $self->{input_encoding};
650  ## TODO: |{input_encoding}| is needless?  ## TODO: |{input_encoding}| is needless?
651    
   my $i = 0;  
652    $self->{line_prev} = $self->{line} = 1;    $self->{line_prev} = $self->{line} = 1;
653    $self->{column_prev} = -1;    $self->{column_prev} = -1;
654    $self->{column} = 0;    $self->{column} = 0;
# Line 675  sub parse_char_stream ($$$;$) { Line 665  sub parse_char_stream ($$$;$) {
665        $self->{char_buffer_pos} = 0;        $self->{char_buffer_pos} = 0;
666    
667        my $count = $input->manakai_read_until        my $count = $input->manakai_read_until
668           ($self->{char_buffer},           ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
           qr/(?![\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/,  
           $self->{char_buffer_pos});  
669        if ($count) {        if ($count) {
670          $self->{line_prev} = $self->{line};          $self->{line_prev} = $self->{line};
671          $self->{column_prev} = $self->{column};          $self->{column_prev} = $self->{column};
# Line 713  sub parse_char_stream ($$$;$) { Line 701  sub parse_char_stream ($$$;$) {
701        $self->{next_char} = 0x000A; # LF # MUST        $self->{next_char} = 0x000A; # LF # MUST
702        $self->{line}++;        $self->{line}++;
703        $self->{column} = 0;        $self->{column} = 0;
     } elsif ($self->{next_char} > 0x10FFFF) {  
       !!!cp ('j3');  
       $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST  
704      } elsif ($self->{next_char} == 0x0000) { # NULL      } elsif ($self->{next_char} == 0x0000) { # NULL
705        !!!cp ('j4');        !!!cp ('j4');
706        !!!parse-error (type => 'NULL');        !!!parse-error (type => 'NULL');
707        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
     } elsif ($disallowed_control_chars->{$self->{next_char}}) {  
       !!!cp ('j5');  
       if ($self->{next_char} < 0x10000) {  
         !!!parse-error (type => 'control char',  
                         text => (sprintf 'U+%04X', $self->{next_char}));  
       } else {  
         !!!parse-error (type => 'control char',  
                         text => (sprintf 'U-%08X', $self->{next_char}));  
       }  
708      }      }
709    };    };
710    
# Line 736  sub parse_char_stream ($$$;$) { Line 712  sub parse_char_stream ($$$;$) {
712      #my ($scalar, $specials_range, $offset) = @_;      #my ($scalar, $specials_range, $offset) = @_;
713      return 0 if defined $self->{next_next_char};      return 0 if defined $self->{next_next_char};
714    
715      my $pattern = qr/(?![$_[1]\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/;      my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
716      my $offset = $_[2] || 0;      my $offset = $_[2] || 0;
717    
718      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 780  sub parse_char_stream ($$$;$) { Line 756  sub parse_char_stream ($$$;$) {
756      $onerror->(line => $self->{line}, column => $self->{column}, @_);      $onerror->(line => $self->{line}, column => $self->{column}, @_);
757    };    };
758    
759      my $char_onerror = sub {
760        my (undef, $type, %opt) = @_;
761        !!!parse-error (layer => 'encode',
762                        line => $self->{line}, column => $self->{column} + 1,
763                        %opt, type => $type);
764      }; # $char_onerror
765    
766      if ($_[3]) {
767        $input = $_[3]->($input);
768        $input->onerror ($char_onerror);
769      } else {
770        $input->onerror ($char_onerror) unless defined $input->onerror;
771      }
772    
773    $self->_initialize_tokenizer;    $self->_initialize_tokenizer;
774    $self->_initialize_tree_constructor;    $self->_initialize_tree_constructor;
775    $self->_construct_tree;    $self->_construct_tree;
# Line 7835  sub set_inner_html ($$$$;$) { Line 7825  sub set_inner_html ($$$$;$) {
7825          $self->{char_buffer_pos} = 0;          $self->{char_buffer_pos} = 0;
7826                    
7827          my $count = $input->manakai_read_until          my $count = $input->manakai_read_until
7828              ($self->{char_buffer},              ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/,
7829               qr/(?![\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/,               $self->{char_buffer_pos});
                $self->{char_buffer_pos});  
7830          if ($count) {          if ($count) {
7831            $self->{line_prev} = $self->{line};            $self->{line_prev} = $self->{line};
7832            $self->{column_prev} = $self->{column};            $self->{column_prev} = $self->{column};
# Line 7873  sub set_inner_html ($$$$;$) { Line 7862  sub set_inner_html ($$$$;$) {
7862          $p->{line}++;          $p->{line}++;
7863          $p->{column} = 0;          $p->{column} = 0;
7864          !!!cp ('i2');          !!!cp ('i2');
       } elsif ($self->{next_char} > 0x10FFFF) {  
         $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST  
         !!!cp ('i3');  
7865        } elsif ($self->{next_char} == 0x0000) { # NULL        } elsif ($self->{next_char} == 0x0000) { # NULL
7866          !!!cp ('i4');          !!!cp ('i4');
7867          !!!parse-error (type => 'NULL');          !!!parse-error (type => 'NULL');
7868          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
       } elsif ($disallowed_control_chars->{$self->{next_char}}) {  
         !!!cp ('i4.1');  
         if ($self->{next_char} < 0x10000) {  
           !!!parse-error (type => 'control char',  
                           text => (sprintf 'U+%04X', $self->{next_char}));  
         } else {  
           !!!parse-error (type => 'control char',  
                           text => (sprintf 'U-%08X', $self->{next_char}));  
         }  
7869        }        }
7870      };      };
7871    
# Line 7896  sub set_inner_html ($$$$;$) { Line 7873  sub set_inner_html ($$$$;$) {
7873        #my ($scalar, $specials_range, $offset) = @_;        #my ($scalar, $specials_range, $offset) = @_;
7874        return 0 if defined $p->{next_next_char};        return 0 if defined $p->{next_next_char};
7875    
7876        my $pattern = qr/(?![$_[1]\x{FDD0}-\x{FDDF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}])[\x20-\x7E\xA0-\x{D7FF}\x{E000}-\x{10FFFD}]/;        my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
7877        my $offset = $_[2] || 0;        my $offset = $_[2] || 0;
7878                
7879        if ($p->{char_buffer_pos} < length $p->{char_buffer}) {        if ($p->{char_buffer_pos} < length $p->{char_buffer}) {

Legend:
Removed from v.1.181  
changed lines
  Added in v.1.182

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24