Whatpm/XML/Parser.pm.src

package Whatpm::XML::Parser;
use strict;

push our @ISA, 'Whatpm::HTML';
use Whatpm::HTML::Tokenizer qw/:token/;

## Parses a character string as XML.
##
## Arguments: ($self, $s, $doc, $onerror, $get_wrapper).  |$s| is either
## the string itself or a reference to it.  The string is wrapped into a
## |Whatpm::Charset::DecodeHandle::CharString| input handle and all
## remaining arguments are handed over to |parse_char_stream| unchanged.
##
## Returns whatever |parse_char_stream| returns.
sub parse_char_string ($$$;$$) {
  my $invocant = shift;
  ## From here on, @_ is ($s, $doc, $onerror, $get_wrapper).

  ## A plain string is referenced in place (through the alias in @_)
  ## rather than copied.
  my $string_ref;
  if (ref $_[0]) {
    $string_ref = $_[0];
  } else {
    $string_ref = \($_[0]);
  }

  require Whatpm::Charset::DecodeHandle;
  my $stream = Whatpm::Charset::DecodeHandle::CharString->new ($string_ref);

  return $invocant->parse_char_stream ($stream, @_[1..$#_]);
} # parse_char_string

## Parses a character stream as XML and builds the tree into a document.
##
## The invocant is either a parser object or a class name; for a class
## name a parser is created with |new|.
##
## Arguments (after the invocant):
##   $_[0]  Input handle.  |read|, |manakai_read_until| and |onerror|
##          are invoked on the handle that is actually read from.
##   $_[1]  Document that receives the parsed tree.  Its child node
##          list is emptied first.
##   $_[2]  Optional error handler, invoked with key/value pairs
##          (|line|, |column|, |type|, |token|, ...).  When it is false,
##          errors are reported by |warn|.
##   $_[3]  Optional code reference that receives the input handle and
##          returns the handle to read from instead.
##
## Returns the document.
sub parse_char_stream ($$$;$$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## The callbacks installed below are stored on the parser itself
  ## (|read_until|, |parse_error|) or on the input handle, which in turn
  ## is held by the |set_nc| callback.  If they captured |$self|
  ## directly, the parser would reference itself and would never be
  ## freed together with its document, so they only capture a weakened
  ## reference.
  require Scalar::Util;
  Scalar::Util::weaken (my $weak_self = $self);

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};
## TODO: |{input_encoding}| is needless?

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = -1;
  $self->{column} = 0;

  ## Reads the next input character into |$self->{nc}| (-1 at the end
  ## of the input) and maintains the line/column counters.  The parser
  ## is received as the first argument.
  $self->{set_nc} = sub {
    my $self = shift;

    my $char = '';
    if (defined $self->{next_nc}) {
      ## A character has been read ahead while handling a CR.
      $char = $self->{next_nc};
      delete $self->{next_nc};
      $self->{nc} = ord $char;
    } else {
      $self->{char_buffer} = '';
      $self->{char_buffer_pos} = 0;

      ## Fast path: read a run of characters that need no special
      ## handling into the buffer and return the first of them.
      my $count = $input->manakai_read_until
         ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
      if ($count) {
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column};
        $self->{column}++;
        $self->{nc}
            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
        return;
      }

      if ($input->read ($char, 1)) {
        $self->{nc} = ord $char;
      } else {
        $self->{nc} = -1;
        return;
      }
    }

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;
    
    if ($self->{nc} == 0x000A) { # LF
      !!!cp ('j1');
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{nc} == 0x000D) { # CR
      !!!cp ('j2');
## TODO: support for abort/streaming
      ## CR and CR LF are both reported as a single LF; a character
      ## other than LF that follows the CR is kept for the next call.
      my $next = '';
      if ($input->read ($next, 1) and $next ne "\x0A") {
        $self->{next_nc} = $next;
      }
      $self->{nc} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{nc} == 0x0000) { # NULL
      !!!cp ('j4');
      !!!parse-error (type => 'NULL');
      $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  ## Reads as many characters as possible that are neither in the given
  ## range of special characters nor NULL, LF or CR, stores them into
  ## the given scalar at the given offset and returns their number.
  $self->{read_until} = sub {
    #my ($scalar, $specials_range, $offset) = @_;
    my $self = $weak_self;
    return 0 if defined $self->{next_nc};

    my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
    my $offset = $_[2] || 0;

    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      ## Unconsumed characters are left in the buffer; serve them
      ## before touching the input handle.
      pos ($self->{char_buffer}) = $self->{char_buffer_pos};
      if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
        substr ($_[0], $offset)
            = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
        my $count = $+[0] - $-[0];
        if ($count) {
          $self->{column} += $count;
          $self->{char_buffer_pos} += $count;
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column} - 1;
          $self->{nc} = -1;
        }
        return $count;
      } else {
        return 0;
      }
    } else {
      my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
      if ($count) {
        $self->{column} += $count;
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column} - 1;
        $self->{nc} = -1;
      }
      return $count;
    }
  }; # $self->{read_until}

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  ## The current position is supplied as the default; arguments given
  ## by the caller come later in the list.
  $self->{parse_error} = sub {
    $onerror->(line => $weak_self->{line}, column => $weak_self->{column},
               @_);
  };

  ## Error handler for the input handle: reports the error as a parse
  ## error of the |encode| layer.
  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    my $self = $weak_self;
    !!!parse-error (layer => 'encode',
                    line => $self->{line}, column => $self->{column} + 1,
                    %opt, type => $type);
  }; # $char_onerror

  if ($_[3]) {
    $input = $_[3]->($input);
    $input->onerror ($char_onerror);
  } else {
    $input->onerror ($char_onerror) unless defined $input->onerror;
  }

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream

## Creates a new XML parser object.
##
## The object is a hash blessed into the given class with:
##   |level|      map from error level names to their short codes;
##   |set_nc|     default callback that sets |{nc}| to -1;
##   |parse_error|, |change_encoding|, |application_cache_selection|
##                callbacks that do nothing by default;
##   |is_xml|     set to 1.
##
## Returns the parser object.
sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;

  ## The |set_nc| callback is stored in the object it modifies.  It
  ## must not hold a strong reference to that object; otherwise the
  ## object would reference itself and would never be freed.
  require Scalar::Util;
  Scalar::Util::weaken (my $weak_self = $self);
  $self->{set_nc} = sub {
    $weak_self->{nc} = -1;
  };
  $self->{parse_error} = sub {
    # 
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };

  $self->{is_xml} = 1;

  return $self;
} # new

## Prepares the document for tree construction: turns off strict error
## checking and the |strict-document-children| configuration parameter,
## marks the document as non-HTML and records line 1, column 1 as its
## source position.
##
## NOTE: $self->{document} MUST be specified before this method is called
sub _initialize_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST

  my $config = $doc->dom_config;
  $config->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
      = 0;

  $doc->manakai_is_html (0);

  $doc->set_user_data (manakai_source_line => 1);
  $doc->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor

## Restores the document settings after tree construction: turns strict
## error checking and the |strict-document-children| configuration
## parameter back on.
sub _terminate_tree_constructor ($) {
  my ($self) = @_;
  my $doc = $self->{document};

  $doc->strict_error_checking (1);

  ## TODO: Turn mutation events on
  my $config = $doc->dom_config;
  $config->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
      = 1;
} # _terminate_tree_constructor

## Tree construction stage


## NOTE: Differences from the XML5 draft are marked as "XML5:".

## XML5: No namespace support.

## XML5: XML5 has "empty tag token".  In this implementation, it is
## represented as a start tag token with $self->{self_closing} flag
## set to true.

## XML5: XML5 has "short end tag token".  In this implementation, it
## is represented as an end tag token whose $token->{tag_name} is set
## to an empty string.

## XML5: Start, main, end phases.  In this implementation, they are
## represented by insertion modes.

## Insertion modes
## Nothing but comments, processing instructions and white space has
## been seen so far; a DOCTYPE is still acceptable.
sub INITIAL_IM             () { 0 }

## Waiting for the start tag of the root element.
sub BEFORE_ROOT_ELEMENT_IM () { 1 }

## At least one element is open.
sub IN_ELEMENT_IM          () { 2 }

## The root element has been closed.
sub AFTER_ROOT_ELEMENT_IM  () { 3 }

{
my $token; ## TODO: change to $self->{t}

## Runs the tree construction stage: fetches the first token, resets
## the tree constructor state and then keeps handing control to the
## handler of the current insertion mode until the current token is an
## abort token.
sub _construct_tree ($) {
  my $self = shift;

  !!!next-token;

  delete $self->{tainted};
  $self->{open_elements} = [];
  $self->{insertion_mode} = INITIAL_IM;

  ## The handlers consume tokens and change the insertion mode; each of
  ## them returns when the mode changes or the parser is aborted.
  do {
    my $mode = $self->{insertion_mode};

    if ($mode == INITIAL_IM) {
      $self->_tree_initial;
    } elsif ($mode == BEFORE_ROOT_ELEMENT_IM) {
      $self->_tree_before_root_element;
    } elsif ($mode == IN_ELEMENT_IM) {
      $self->_tree_in_element;
    } elsif ($mode == AFTER_ROOT_ELEMENT_IM) {
      $self->_tree_after_root_element;
    } else {
      die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
    }
  } until $token->{type} == ABORT_TOKEN;
} # _construct_tree

## Handler of the "initial" insertion mode.
##
## Consumes tokens until a DOCTYPE, a start tag or the end of file
## switches the parser to the "before root element" mode, or until an
## abort token is seen.  Comments, processing instructions and
## non-white-space text are appended to the document; end tags are
## reported and ignored.
sub _tree_initial ($) {
  my ($self) = @_;

  ## Every branch below ends with either |return| or |next B|, so the
  ## tests are written as independent guards.
  B: while (1) {
    my $type = $token->{type};

    if ($type == DOCTYPE_TOKEN) {
      ## XML5: No "DOCTYPE" token.

      my $name = $token->{name};
      $name = '' unless defined $name;
      my $doctype
          = $self->{document}->create_document_type_definition ($name);

      ## NOTE: Default value for both |public_id| and |system_id|
      ## attributes are empty strings, so that nothing is set when the
      ## identifiers are missing.
      my $public_id = $token->{public_identifier};
      my $system_id = $token->{system_identifier};
      $doctype->public_id ($public_id) if defined $public_id;
      $doctype->system_id ($system_id) if defined $system_id;

      ## TODO: internal_subset

      $self->{document}->append_child ($doctype);

      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      !!!next-token;
      return;
    }

    if ($type == START_TAG_TOKEN or $type == END_OF_FILE_TOKEN) {
      ## The token is left in place and reprocessed in the new mode.
      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      return;
    }

    if ($type == COMMENT_TOKEN) {
      my $node = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($node);

      ## Stay in the mode.
      !!!next-token;
      next B;
    }

    if ($type == PI_TOKEN) {
      my $node = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($node);

      ## Stay in the mode.
      !!!next-token;
      next B;
    }

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is dropped until the first
      ## non-white-space text has been seen.
      $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//
          unless $self->{tainted};

      if (length $token->{data}) {
        ## XML5: Ignore the token.

        ## The error is reported for the first such text only.
        unless ($self->{tainted}) {
          !!!parse-error (type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      !!!next-token;
      next B;
    }

    if ($type == END_TAG_TOKEN) {
      !!!parse-error (type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.

      ## Stay in the mode.
      !!!next-token;
      next B;
    }

    return if $type == ABORT_TOKEN;

    die "$0: XML parser initial: Unknown token type $token->{type}";
  } # B
} # _tree_initial

## Handler of the "before root element" insertion mode.
##
## Consumes tokens until a start tag creates an element as a child of
## the document (switching to the "in element" mode, or to the "after
## root element" mode if the tag is self-closing), until the end of
## file is reached, or until an abort token is seen.  Comments,
## processing instructions and non-white-space text are appended to the
## document; end tags and DOCTYPEs are reported and ignored.
sub _tree_before_root_element ($) {
  my $self = shift;

  B: while (1) {
    if ($token->{type} == START_TAG_TOKEN) {
      ## Split the qualified name into prefix and local name; a name
      ## without colon has no prefix.
      my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
      ($prefix, $ln) = (undef, $prefix) unless defined $ln;
      my $ns; ## TODO:
      my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
      $el->set_user_data (manakai_source_line => $token->{line});
      $el->set_user_data (manakai_source_column => $token->{column});

      for my $attr_name (keys %{$token->{attributes}}) {
        my $ns; ## TODO
        my ($p, $l) = split /:/, $attr_name, 2;
        ($p, $l) = (undef, $p) unless defined $l;
        my $attr_t = $token->{attributes}->{$attr_name};
        my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
        $attr->value ($attr_t->{value});
        $attr->set_user_data (manakai_source_line => $attr_t->{line});
        $attr->set_user_data (manakai_source_column => $attr_t->{column});
        $el->set_attribute_node_ns ($attr);
      }

      $self->{document}->append_child ($el);

      if ($self->{self_closing}) {
        ## Empty element tag: the element is not kept open.
        !!!ack ('ack');
        $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      } else {
        ## The original tag name is kept for matching the end tag.
        push @{$self->{open_elements}}, [$el, $token->{tag_name}];
        $self->{insertion_mode} = IN_ELEMENT_IM;
      }

      #delete $self->{tainted};

      !!!next-token;
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($pi);

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is dropped until the first
      ## non-white-space text has been seen.
      if (not $self->{tainted} and
          $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        #
      }
      
      if (length $token->{data}) {
        ## XML5: Ignore the token.

        ## The error is reported for the first such text only.
        unless ($self->{tainted}) {
          !!!parse-error (type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      !!!parse-error (type => 'no root element',
                      token => $token);
      
      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      ## Reprocess.
      return;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      !!!parse-error (type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      ## Name the mode that actually failed.
      die "$0: XML parser before root element: Unknown token type $token->{type}";
    }
  } # B
} # _tree_before_root_element

## Handler of the "in element" insertion mode.
##
## Consumes tokens while at least one element is open.  Text, comments,
## processing instructions and new elements are appended to the
## innermost open element.  The handler returns after switching to the
## "after root element" mode (when the last open element is closed or
## at the end of file) or when an abort token is seen.
sub _tree_in_element ($) {
  my $self = shift;
  
  B: while (1) {
    if ($token->{type} == CHARACTER_TOKEN) {
      $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == START_TAG_TOKEN) {
      ## Split the qualified name into prefix and local name; a name
      ## without colon has no prefix.
      my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
      ($prefix, $ln) = (undef, $prefix) unless defined $ln;
      my $ns; ## TODO:
      my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
      $el->set_user_data (manakai_source_line => $token->{line});
      $el->set_user_data (manakai_source_column => $token->{column});

      for my $attr_name (keys %{$token->{attributes}}) {
        my $ns; ## TODO
        my ($p, $l) = split /:/, $attr_name, 2;
        ($p, $l) = (undef, $p) unless defined $l;
        my $attr_t = $token->{attributes}->{$attr_name};
        my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
        $attr->value ($attr_t->{value});
        $attr->set_user_data (manakai_source_line => $attr_t->{line});
        $attr->set_user_data (manakai_source_column => $attr_t->{column});
        $el->set_attribute_node_ns ($attr);
      }

      $self->{open_elements}->[-1]->[0]->append_child ($el);

      if ($self->{self_closing}) {
        ## Empty element tag: the element is not kept open.
        !!!ack ('ack');
      } else {
        ## The original tag name is kept for matching the end tag.
        push @{$self->{open_elements}}, [$el, $token->{tag_name}];
      }
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      if ($token->{tag_name} eq '') {
        ## Short end tag token.
        pop @{$self->{open_elements}};
      } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
        pop @{$self->{open_elements}};
      } else {
        !!!parse-error (type => 'unmatched end tag',
                        text => $token->{tag_name},
                        token => $token);
        
        ## Has an element in scope
        ## The innermost open element with the same tag name, if any,
        ## is closed together with everything opened after it.
        INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
          if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
            splice @{$self->{open_elements}}, $i;
            last INSCOPE;
          }
        } # INSCOPE
      }
      
      unless (@{$self->{open_elements}}) {
        ## The root element has been closed.
        $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
        !!!next-token;
        return;
      } else {
        ## Stay in the state.
        !!!next-token;
        redo B;
      }
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{open_elements}->[-1]->[0]->append_child ($comment);
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{open_elements}->[-1]->[0]->append_child ($pi);

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      !!!parse-error (type => 'in body:#eof',
                      token => $token);
      
      ## NOTE(review): Unlike the "before root element" mode, which
      ## reprocesses the end-of-file token, this branch requests another
      ## token after the end of file.  Presumably the tokenizer keeps
      ## returning end-of-file tokens -- TODO confirm.
      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      !!!next-token;
      return;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      ## Name the mode that actually failed.
      die "$0: XML parser in element: Unknown token type $token->{type}";
    }
  } # B
} # _tree_in_element

## Handler of the "after root element" insertion mode.
##
## Consumes tokens after the root element has been closed.  Comments,
## processing instructions and non-white-space text are appended to the
## document; end tags and DOCTYPEs are reported and ignored.  A start
## tag is reported and then reprocessed in the "before root element"
## mode.  At the end of file the current token is replaced by an abort
## token, which stops the tree construction.
sub _tree_after_root_element ($) {
  my $self = shift;

  B: while (1) {
    if ($token->{type} == START_TAG_TOKEN) {
      !!!parse-error (type => 'second root element',
                      token => $token);

      ## XML5: Ignore the token.

      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      ## Reprocess.
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($pi);

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      ## Leading white space is dropped until the first
      ## non-white-space text has been seen.
      if (not $self->{tainted} and
          $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        #
      }
      
      if (length $token->{data}) {
        ## XML5: Ignore the token.

        ## The error is reported for the first such text only.
        unless ($self->{tainted}) {
          !!!parse-error (type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      ## Stop parsing.

      ## TODO: implement "stop parsing".

      ## The abort token makes |_construct_tree| leave its loop.
      $token = {type => ABORT_TOKEN};
      return;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      !!!parse-error (type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.

      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      !!!next-token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      ## Name the mode that actually failed.
      die "$0: XML parser after root element: Unknown token type $token->{type}";
    }
  } # B
} # _tree_after_root_element

}

1;
1	wakaba	1.1	package Whatpm::XML::Parser;
2			use strict;
3
4			push our @ISA, 'Whatpm::HTML';
5			use Whatpm::HTML::Tokenizer qw/:token/;
6
7			sub parse_char_string ($$$;$$) {
8			#my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9			my $self = shift;
10			my $s = ref $_[0] ? $_[0] : \($_[0]);
11			require Whatpm::Charset::DecodeHandle;
12			my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13			return $self->parse_char_stream ($input, @_[1..$#_]);
14			} # parse_char_string
15
16			sub parse_char_stream ($$$;$$) {
17			my $self = ref $_[0] ? shift : shift->new;
18			my $input = $_[0];
19			$self->{document} = $_[1];
20			@{$self->{document}->child_nodes} = ();
21
22			## NOTE: \|set_inner_html\| copies most of this method's code
23
24			$self->{confident} = 1 unless exists $self->{confident};
25			$self->{document}->input_encoding ($self->{input_encoding})
26			if defined $self->{input_encoding};
27			## TODO: \|{input_encoding}\| is needless?
28
29			$self->{line_prev} = $self->{line} = 1;
30			$self->{column_prev} = -1;
31			$self->{column} = 0;
32			$self->{set_nc} = sub {
33			my $self = shift;
34
35			my $char = '';
36			if (defined $self->{next_nc}) {
37			$char = $self->{next_nc};
38			delete $self->{next_nc};
39			$self->{nc} = ord $char;
40			} else {
41			$self->{char_buffer} = '';
42			$self->{char_buffer_pos} = 0;
43
44			my $count = $input->manakai_read_until
45			($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46			if ($count) {
47			$self->{line_prev} = $self->{line};
48			$self->{column_prev} = $self->{column};
49			$self->{column}++;
50			$self->{nc}
51			= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52			return;
53			}
54
55			if ($input->read ($char, 1)) {
56			$self->{nc} = ord $char;
57			} else {
58			$self->{nc} = -1;
59			return;
60			}
61			}
62
63			($self->{line_prev}, $self->{column_prev})
64			= ($self->{line}, $self->{column});
65			$self->{column}++;
66
67			if ($self->{nc} == 0x000A) { # LF
68			!!!cp ('j1');
69			$self->{line}++;
70			$self->{column} = 0;
71			} elsif ($self->{nc} == 0x000D) { # CR
72			!!!cp ('j2');
73			## TODO: support for abort/streaming
74			my $next = '';
75			if ($input->read ($next, 1) and $next ne "\x0A") {
76			$self->{next_nc} = $next;
77			}
78			$self->{nc} = 0x000A; # LF # MUST
79			$self->{line}++;
80			$self->{column} = 0;
81			} elsif ($self->{nc} == 0x0000) { # NULL
82			!!!cp ('j4');
83			!!!parse-error (type => 'NULL');
84			$self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85			}
86			};
87
88			$self->{read_until} = sub {
89			#my ($scalar, $specials_range, $offset) = @_;
90			return 0 if defined $self->{next_nc};
91
92			my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93			my $offset = $_[2] \|\| 0;
94
95			if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96			pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97			if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98			substr ($_[0], $offset)
99			= substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100			my $count = $+[0] - $-[0];
101			if ($count) {
102			$self->{column} += $count;
103			$self->{char_buffer_pos} += $count;
104			$self->{line_prev} = $self->{line};
105			$self->{column_prev} = $self->{column} - 1;
106			$self->{nc} = -1;
107			}
108			return $count;
109			} else {
110			return 0;
111			}
112			} else {
113			my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114			if ($count) {
115			$self->{column} += $count;
116			$self->{line_prev} = $self->{line};
117			$self->{column_prev} = $self->{column} - 1;
118			$self->{nc} = -1;
119			}
120			return $count;
121			}
122			}; # $self->{read_until}
123
124			my $onerror = $_[2] \|\| sub {
125			my (%opt) = @_;
126			my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127			my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128			warn "Parse error ($opt{type}) at line $line column $column\n";
129			};
130			$self->{parse_error} = sub {
131			$onerror->(line => $self->{line}, column => $self->{column}, @_);
132			};
133
134			my $char_onerror = sub {
135			my (undef, $type, %opt) = @_;
136			!!!parse-error (layer => 'encode',
137			line => $self->{line}, column => $self->{column} + 1,
138			%opt, type => $type);
139			}; # $char_onerror
140
141			if ($_[3]) {
142			$input = $_[3]->($input);
143			$input->onerror ($char_onerror);
144			} else {
145			$input->onerror ($char_onerror) unless defined $input->onerror;
146			}
147
148			$self->_initialize_tokenizer;
149			$self->_initialize_tree_constructor;
150			$self->_construct_tree;
151			$self->_terminate_tree_constructor;
152
153			delete $self->{parse_error}; # remove loop
154
155			return $self->{document};
156			} # parse_char_stream
157
158			sub new ($) {
159			my $class = shift;
160			my $self = bless {
161			level => {must => 'm',
162			should => 's',
163			warn => 'w',
164			info => 'i',
165			uncertain => 'u'},
166			}, $class;
167			$self->{set_nc} = sub {
168			$self->{nc} = -1;
169			};
170			$self->{parse_error} = sub {
171			#
172			};
173			$self->{change_encoding} = sub {
174			# if ($_[0] is a supported encoding) {
175			# run "change the encoding" algorithm;
176			# throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177			# }
178			};
179			$self->{application_cache_selection} = sub {
180			#
181			};
182	wakaba	1.2
183			$self->{is_xml} = 1;
184
185	wakaba	1.1	return $self;
186			} # new
187
188			sub _initialize_tree_constructor ($) {
189			my $self = shift;
190			## NOTE: $self->{document} MUST be specified before this method is called
191			$self->{document}->strict_error_checking (0);
192			## TODO: Turn mutation events off # MUST
193			$self->{document}->dom_config
194			->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195			= 0;
196			$self->{document}->manakai_is_html (0);
197			$self->{document}->set_user_data (manakai_source_line => 1);
198			$self->{document}->set_user_data (manakai_source_column => 1);
199			} # _initialize_tree_constructor
200
201			sub _terminate_tree_constructor ($) {
202			my $self = shift;
203			$self->{document}->strict_error_checking (1);
204			$self->{document}->dom_config
205			->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206			= 1;
207			## TODO: Turn mutation events on
208			} # _terminate_tree_constructor
209
210			## Tree construction stage
211
212
213			## NOTE: Differences from the XML5 draft are marked as "XML5:".
214
215			## XML5: No namespace support.
216
217			## XML5: XML5 has "empty tag token". In this implementation, it is
218	wakaba	1.3	## represented as a start tag token with $self->{self_closing} flag
219	wakaba	1.1	## set to true.
220
221			## XML5: XML5 has "short end tag token". In this implementation, it
222			## is represented as an end tag token with $token->{tag_name} flag set
223			## to an empty string.
224
225			## XML5: Start, main, end phases. In this implementation, they are
226			## represented by insertion modes.
227
228			## Insertion modes
229			sub INITIAL_IM () { 0 }
230			sub BEFORE_ROOT_ELEMENT_IM () { 1 }
231			sub IN_ELEMENT_IM () { 2 }
232			sub AFTER_ROOT_ELEMENT_IM () { 3 }
233
234			{
235			my $token; ## TODO: change to $self->{t}
236
237			sub _construct_tree ($) {
238			my ($self) = @_;
239
240			!!!next-token;
241
242			delete $self->{tainted};
243			$self->{open_elements} = [];
244			$self->{insertion_mode} = INITIAL_IM;
245
246			while (1) {
247			if ($self->{insertion_mode} == IN_ELEMENT_IM) {
248			$self->_tree_in_element;
249			} elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
250			$self->_tree_after_root_element;
251			} elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
252			$self->_tree_before_root_element;
253			} elsif ($self->{insertion_mode} == INITIAL_IM) {
254			$self->_tree_initial;
255			} else {
256			die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
257			}
258
259			last if $token->{type} == ABORT_TOKEN;
260			}
261			} # _construct_tree
262
263			sub _tree_initial ($) {
264			my $self = shift;
265
266			B: while (1) {
267			if ($token->{type} == DOCTYPE_TOKEN) {
268			## XML5: No "DOCTYPE" token.
269
270			my $doctype = $self->{document}->create_document_type_definition
271			(defined $token->{name} ? $token->{name} : '');
272
273			## NOTE: Default value for both \|public_id\| and \|system_id\| attributes
274			## are empty strings, so that we don't set any value in missing cases.
275			$doctype->public_id ($token->{public_identifier})
276			if defined $token->{public_identifier};
277			$doctype->system_id ($token->{system_identifier})
278			if defined $token->{system_identifier};
279
280			## TODO: internal_subset
281
282			$self->{document}->append_child ($doctype);
283
284			$self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
285			!!!next-token;
286			return;
287			} elsif ($token->{type} == START_TAG_TOKEN or
288			$token->{type} == END_OF_FILE_TOKEN) {
289			$self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
290			## Reprocess.
291			return;
292			} elsif ($token->{type} == COMMENT_TOKEN) {
293			my $comment = $self->{document}->create_comment ($token->{data});
294			$self->{document}->append_child ($comment);
295
296			## Stay in the mode.
297			!!!next-token;
298			next B;
299			} elsif ($token->{type} == PI_TOKEN) {
300			my $pi = $self->{document}->create_processing_instruction
301			($token->{target}, $token->{data});
302			$self->{document}->append_child ($pi);
303
304			## Stay in the mode.
305			!!!next-token;
306			next B;
307			} elsif ($token->{type} == CHARACTER_TOKEN) {
308			if (not $self->{tainted} and
309			$token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
310			#
311			}
312
313			if (length $token->{data}) {
314			## XML5: Ignore the token.
315
316			unless ($self->{tainted}) {
317			!!!parse-error (type => 'text outside of root element',
318			token => $token);
319			$self->{tainted} = 1;
320			}
321
322			$self->{document}->manakai_append_text ($token->{data});
323			}
324
325			## Stay in the mode.
326			!!!next-token;
327			next B;
328			} elsif ($token->{type} == END_TAG_TOKEN) {
329			!!!parse-error (type => 'unmatched end tag',
330			text => $token->{tag_name},
331			token => $token);
332			## Ignore the token.
333
334			## Stay in the mode.
335			!!!next-token;
336			next B;
337			} elsif ($token->{type} == ABORT_TOKEN) {
338			return;
339			} else {
340			die "$0: XML parser initial: Unknown token type $token->{type}";
341			}
342			} # B
343			} # _tree_initial
344
## Tree construction: "before root element" insertion mode.
##
## Handles the current |$token| and the tokens following it until the
## insertion mode changes or the tokenizer reports an abort:
##
##   - start tag: an element (with its attributes) is created, appended
##     to the document, and the mode becomes |AFTER_ROOT_ELEMENT_IM| if
##     |$self->{self_closing}| is set, |IN_ELEMENT_IM| otherwise;
##   - comment / processing instruction: appended to the document;
##   - character data: leading white space is stripped while the parser
##     is not tainted; any remaining text is reported (once) and
##     appended to the document;
##   - end of file: reported; the mode becomes |AFTER_ROOT_ELEMENT_IM|
##     and the token is left to be reprocessed by the caller;
##   - end tag / DOCTYPE: reported and ignored.
##
## Dies if the token type is not one of the known types.
sub _tree_before_root_element ($) {
  my $self = shift;

  B: while (1) {
    my $type = $token->{type};

    if ($type == START_TAG_TOKEN) {
      ## A name without a colon has no prefix, only a local name.
      my ($pfx, $local) = split /:/, $token->{tag_name}, 2;
      unless (defined $local) {
        $local = $pfx;
        $pfx = undef;
      }

      my $root_ns; ## TODO:
      my $root = $self->{document}->create_element_ns
          ($root_ns, [$pfx, $local]);
      $root->set_user_data (manakai_source_line => $token->{line});
      $root->set_user_data (manakai_source_column => $token->{column});

      ## Copy every attribute of the token onto the new element.
      for my $qname (keys %{$token->{attributes}}) {
        my $attr_ns; ## TODO
        my ($attr_pfx, $attr_local) = split /:/, $qname, 2;
        unless (defined $attr_local) {
          $attr_local = $attr_pfx;
          $attr_pfx = undef;
        }

        my $src = $token->{attributes}->{$qname};
        my $node = $self->{document}->create_attribute_ns
            ($attr_ns, [$attr_pfx, $attr_local]);
        $node->value ($src->{value});
        $node->set_user_data (manakai_source_line => $src->{line});
        $node->set_user_data (manakai_source_column => $src->{column});
        $root->set_attribute_node_ns ($node);
      }

      $self->{document}->append_child ($root);

      if (not $self->{self_closing}) {
        ## The element stays open; its content follows.
        push @{$self->{open_elements}}, [$root, $token->{tag_name}];
        $self->{insertion_mode} = IN_ELEMENT_IM;
      } else {
        !!!ack ('ack');
        $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      }

      #delete $self->{tainted};

      !!!next-token;
      return;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == PI_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_processing_instruction
               ($token->{target}, $token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is dropped only while untainted.
      $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//
          unless $self->{tainted};

      if (length $token->{data}) {
        ## XML5: Ignore the token.

        ## Report the stray text only the first time it is seen.
        if (not $self->{tainted}) {
          !!!parse-error (type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == END_OF_FILE_TOKEN) {
      !!!parse-error (type => 'no root element',
                      token => $token);

      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      ## The same token is reprocessed in the new mode.
      return;
    }

    if ($type == END_TAG_TOKEN) {
      !!!parse-error (type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## The token is ignored and the mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## The token is ignored and the mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == ABORT_TOKEN) {
      return;
    }

    die "$0: XML parser initial: Unknown token type $token->{type}";
  } # B
} # _tree_before_root_element
450
## Tree construction: "in element" insertion mode.
##
## Handles the current |$token| and the tokens following it while there
## are open elements:
##
##   - character data: appended as text to the current node (the last
##     entry of |$self->{open_elements}|);
##   - start tag: an element (with its attributes) is created and
##     appended to the current node; unless |$self->{self_closing}| is
##     set it is pushed onto the stack of open elements;
##   - end tag: an empty tag name (short end tag) or a name equal to
##     the current node's closes the current node; any other name is
##     reported and, if an open element has that name, that element
##     and everything above it are removed from the stack.  Once the
##     stack is empty the mode becomes |AFTER_ROOT_ELEMENT_IM|;
##   - comment / processing instruction: appended to the current node;
##   - end of file: reported; the mode becomes |AFTER_ROOT_ELEMENT_IM|;
##   - DOCTYPE: reported and ignored.
##
## Dies if the token type is not one of the known types.
sub _tree_in_element ($) {
  my $self = shift;

  B: while (1) {
    my $type = $token->{type};

    if ($type == CHARACTER_TOKEN) {
      $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == START_TAG_TOKEN) {
      ## A name without a colon has no prefix, only a local name.
      my ($pfx, $local) = split /:/, $token->{tag_name}, 2;
      unless (defined $local) {
        $local = $pfx;
        $pfx = undef;
      }

      my $child_ns; ## TODO:
      my $child = $self->{document}->create_element_ns
          ($child_ns, [$pfx, $local]);
      $child->set_user_data (manakai_source_line => $token->{line});
      $child->set_user_data (manakai_source_column => $token->{column});

      ## Copy every attribute of the token onto the new element.
      for my $qname (keys %{$token->{attributes}}) {
        my $attr_ns; ## TODO
        my ($attr_pfx, $attr_local) = split /:/, $qname, 2;
        unless (defined $attr_local) {
          $attr_local = $attr_pfx;
          $attr_pfx = undef;
        }

        my $src = $token->{attributes}->{$qname};
        my $node = $self->{document}->create_attribute_ns
            ($attr_ns, [$attr_pfx, $attr_local]);
        $node->value ($src->{value});
        $node->set_user_data (manakai_source_line => $src->{line});
        $node->set_user_data (manakai_source_column => $src->{column});
        $child->set_attribute_node_ns ($node);
      }

      $self->{open_elements}->[-1]->[0]->append_child ($child);

      if (not $self->{self_closing}) {
        ## The element stays open and becomes the current node.
        push @{$self->{open_elements}}, [$child, $token->{tag_name}];
      } else {
        !!!ack ('ack');
      }

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == END_TAG_TOKEN) {
      if ($token->{tag_name} eq '' or
          $self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
        ## Either a short end tag token or an end tag whose name equals
        ## the current node's: close the current node.
        pop @{$self->{open_elements}};
      } else {
        !!!parse-error (type => 'unmatched end tag',
                        text => $token->{tag_name},
                        token => $token);

        ## Has an element in scope: search from the innermost open
        ## element outwards and, on the first name match, drop that
        ## element together with everything opened after it.
        SCAN: for my $depth (reverse 0..$#{$self->{open_elements}}) {
          next SCAN
              unless $self->{open_elements}->[$depth]->[1]
                  eq $token->{tag_name};
          splice @{$self->{open_elements}}, $depth;
          last SCAN;
        } # SCAN
      }

      if (@{$self->{open_elements}}) {
        ## Still inside some element; the mode is unchanged.
        !!!next-token;
        next B;
      }

      ## No open element is left.
      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      !!!next-token;
      return;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{open_elements}->[-1]->[0]->append_child
          ($self->{document}->create_comment ($token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == PI_TOKEN) {
      $self->{open_elements}->[-1]->[0]->append_child
          ($self->{document}->create_processing_instruction
               ($token->{target}, $token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == END_OF_FILE_TOKEN) {
      !!!parse-error (type => 'in body:#eof',
                      token => $token);

      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      !!!next-token;
      return;
    }

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## The token is ignored and the mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == ABORT_TOKEN) {
      return;
    }

    die "$0: XML parser initial: Unknown token type $token->{type}";
  } # B
} # _tree_in_element
558
## Tree construction: "after root element" insertion mode.
##
## Handles the current |$token| and the tokens following it once the
## root element has been closed:
##
##   - start tag: reported as a second root element; the mode becomes
##     |BEFORE_ROOT_ELEMENT_IM| and the token is left to be reprocessed
##     by the caller;
##   - comment / processing instruction: appended to the document;
##   - character data: leading white space is stripped while the parser
##     is not tainted; any remaining text is reported (once) and
##     appended to the document;
##   - end of file: |$token| is replaced by an |ABORT_TOKEN| token;
##   - end tag / DOCTYPE: reported and ignored.
##
## Dies if the token type is not one of the known types.
sub _tree_after_root_element ($) {
  my $self = shift;

  B: while (1) {
    my $type = $token->{type};

    if ($type == START_TAG_TOKEN) {
      !!!parse-error (type => 'second root element',
                      token => $token);

      ## XML5: Ignore the token.

      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      ## The same token is reprocessed in the new mode.
      return;
    }

    if ($type == COMMENT_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_comment ($token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == PI_TOKEN) {
      $self->{document}->append_child
          ($self->{document}->create_processing_instruction
               ($token->{target}, $token->{data}));

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == CHARACTER_TOKEN) {
      ## Leading white space is dropped only while untainted.
      $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//
          unless $self->{tainted};

      if (length $token->{data}) {
        ## XML5: Ignore the token.

        ## Report the stray text only the first time it is seen.
        if (not $self->{tainted}) {
          !!!parse-error (type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## The mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == END_OF_FILE_TOKEN) {
      ## Stop parsing.

      ## TODO: implement "stop parsing".

      $token = {type => ABORT_TOKEN};
      return;
    }

    if ($type == END_TAG_TOKEN) {
      !!!parse-error (type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## The token is ignored and the mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == DOCTYPE_TOKEN) {
      !!!parse-error (type => 'in html:#doctype',
                      token => $token);
      ## The token is ignored and the mode is unchanged.
      !!!next-token;
      next B;
    }

    if ($type == ABORT_TOKEN) {
      return;
    }

    die "$0: XML parser initial: Unknown token type $token->{type}";
  } # B
} # _tree_after_root_element
639
640			}
641
642			1;