Whatpm/XML/Parser.pm

package Whatpm::XML::Parser;
use strict;

push our @ISA, 'Whatpm::HTML';
use Whatpm::HTML::Tokenizer qw/:token/;

sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = shift;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  require Whatpm::Charset::DecodeHandle;
  my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string

sub parse_char_stream ($$$;$$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  $self->{document} = $_[1];
  @{$self->{document}->child_nodes} = ();

  ## NOTE: |set_inner_html| copies most of this method's code

  $self->{confident} = 1 unless exists $self->{confident};
  $self->{document}->input_encoding ($self->{input_encoding})
      if defined $self->{input_encoding};
## TODO: |{input_encoding}| is needless?

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = -1;
  $self->{column} = 0;
  $self->{set_nc} = sub {
    my $self = shift;

    my $char = '';
    if (defined $self->{next_nc}) {
      $char = $self->{next_nc};
      delete $self->{next_nc};
      $self->{nc} = ord $char;
    } else {
      $self->{char_buffer} = '';
      $self->{char_buffer_pos} = 0;

      my $count = $input->manakai_read_until
         ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
      if ($count) {
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column};
        $self->{column}++;
        $self->{nc}
            = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
        return;
      }

      if ($input->read ($char, 1)) {
        $self->{nc} = ord $char;
      } else {
        $self->{nc} = -1;
        return;
      }
    }

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;
    
    if ($self->{nc} == 0x000A) { # LF
      
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{nc} == 0x000D) { # CR
      
## TODO: support for abort/streaming
      my $next = '';
      if ($input->read ($next, 1) and $next ne "\x0A") {
        $self->{next_nc} = $next;
      }
      $self->{nc} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{nc} == 0x0000) { # NULL
      
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
      $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
    }
  };

  $self->{read_until} = sub {
    #my ($scalar, $specials_range, $offset) = @_;
    return 0 if defined $self->{next_nc};

    my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
    my $offset = $_[2] || 0;

    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      pos ($self->{char_buffer}) = $self->{char_buffer_pos};
      if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
        substr ($_[0], $offset)
            = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
        my $count = $+[0] - $-[0];
        if ($count) {
          $self->{column} += $count;
          $self->{char_buffer_pos} += $count;
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column} - 1;
          $self->{nc} = -1;
        }
        return $count;
      } else {
        return 0;
      }
    } else {
      my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
      if ($count) {
        $self->{column} += $count;
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column} - 1;
        $self->{nc} = -1;
      }
      return $count;
    }
  }; # $self->{read_until}

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
                    line => $self->{line}, column => $self->{column} + 1,
                    %opt, type => $type);
  }; # $char_onerror

  if ($_[3]) {
    $input = $_[3]->($input);
    $input->onerror ($char_onerror);
  } else {
    $input->onerror ($char_onerror) unless defined $input->onerror;
  }

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  delete $self->{parse_error}; # remove loop

  return $self->{document};
} # parse_char_stream

sub new ($) {
  my $class = shift;
  my $self = bless {
    level => {must => 'm',
              should => 's',
              warn => 'w',
              info => 'i',
              uncertain => 'u'},
  }, $class;
  $self->{set_nc} = sub {
    $self->{nc} = -1;
  };
  $self->{parse_error} = sub {
    # 
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new

sub _initialize_tree_constructor ($) {
  my $self = shift;
  ## NOTE: $self->{document} MUST be specified before this method is called
  $self->{document}->strict_error_checking (0);
  ## TODO: Turn mutation events off # MUST
  $self->{document}->dom_config
      ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
      = 0;
  $self->{document}->manakai_is_html (0);
  $self->{document}->set_user_data (manakai_source_line => 1);
  $self->{document}->set_user_data (manakai_source_column => 1);
} # _initialize_tree_constructor

sub _terminate_tree_constructor ($) {
  my $self = shift;
  $self->{document}->strict_error_checking (1);
  $self->{document}->dom_config
      ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
      = 1;
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor

## Tree construction stage


## NOTE: Differences from the XML5 draft are marked as "XML5:".

## XML5: No namespace support.

## XML5: XML5 has "empty tag token".  In this implementation, it is
## represented as a start tag token with $token->{self_closing} flag
## set to true.

## XML5: XML5 has "short end tag token".  In this implementation, it
## is represented as an end tag token with $token->{tag_name} flag set
## to an empty string.

## XML5: Start, main, end phases.  In this implementation, they are
## represented by insertion modes.

## Insertion modes
sub INITIAL_IM () { 0 }
sub BEFORE_ROOT_ELEMENT_IM () { 1 }
sub IN_ELEMENT_IM () { 2 }
sub AFTER_ROOT_ELEMENT_IM () { 3 }

{
my $token; ## TODO: change to $self->{t}

sub _construct_tree ($) {
  my ($self) = @_;

  $token = $self->_get_next_token;

  delete $self->{tainted};
  $self->{open_elements} = [];
  $self->{insertion_mode} = INITIAL_IM;
  
  while (1) {
    if ($self->{insertion_mode} == IN_ELEMENT_IM) {
      $self->_tree_in_element;
    } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
      $self->_tree_after_root_element;
    } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
      $self->_tree_before_root_element;
    } elsif ($self->{insertion_mode} == INITIAL_IM) {
      $self->_tree_initial;
    } else {
      die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
    }

    last if $token->{type} == ABORT_TOKEN;
  }
} # _construct_tree

sub _tree_initial ($) {
  my $self = shift;

  B: while (1) {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## XML5: No "DOCTYPE" token.
      
      my $doctype = $self->{document}->create_document_type_definition
          (defined $token->{name} ? $token->{name} : '');
      
      ## NOTE: Default value for both |public_id| and |system_id| attributes
      ## are empty strings, so that we don't set any value in missing cases.
      $doctype->public_id ($token->{public_identifier})
          if defined $token->{public_identifier};
      $doctype->system_id ($token->{system_identifier})
          if defined $token->{system_identifier};
      
      ## TODO: internal_subset
      
      $self->{document}->append_child ($doctype);
      
      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      $token = $self->_get_next_token;
      return;
    } elsif ($token->{type} == START_TAG_TOKEN or
             $token->{type} == END_OF_FILE_TOKEN) {
      $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
      ## Reprocess.
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($pi);

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if (not $self->{tainted} and
          $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        #
      }
      
      if (length $token->{data}) {
        ## XML5: Ignore the token.

        unless ($self->{tainted}) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      die "$0: XML parser initial: Unknown token type $token->{type}";
    }
  } # B
} # _tree_initial

sub _tree_before_root_element ($) {
  my $self = shift;

  B: while (1) {
    if ($token->{type} == START_TAG_TOKEN) {
      my $ns; ## TODO:
      my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
      $self->{document}->append_child ($el);

      if ($token->{self_closing}) {
        delete $self->{self_closing};
        $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      } else {
        push @{$self->{open_elements}}, [$el, $token->{tag_name}];
        $self->{insertion_mode} = IN_ELEMENT_IM;
      }

      #delete $self->{tainted};

      $token = $self->_get_next_token;
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($pi);

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if (not $self->{tainted} and
          $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        #
      }
      
      if (length $token->{data}) {
        ## XML5: Ignore the token.

        unless ($self->{tainted}) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
                      token => $token);
      
      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      ## Reprocess.
      return;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      die "$0: XML parser initial: Unknown token type $token->{type}";
    }
  } # B
} # _tree_before_root_element

sub _tree_in_element ($) {
  my $self = shift;
  
  B: while (1) {
    if ($token->{type} == CHARACTER_TOKEN) {
      $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == START_TAG_TOKEN) {
      my $ns; ## TODO:
      my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
      $self->{open_elements}->[-1]->[0]->append_child ($el);

      if ($token->{self_closing}) {
        delete $self->{self_closing};
      } else {
        push @{$self->{open_elements}}, [$el, $token->{tag_name}];
      }
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      if ($self->{tag_name} eq '') {
        ## Short end tag token.
        pop @{$self->{open_elements}};
      } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
        pop @{$self->{open_elements}};
      } else {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
                        text => $token->{tag_name},
                        token => $token);
        
        ## Has an element in scope
        INSCOPE: for my $i (reverse $#{$self->{open_elements}}..0) {
          if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
            splice @{$self->{open_elements}}, $i;
            last INSCOPE;
          }
        } # INSCOPE
      }
      
      unless (@{$self->{open_elements}}) {
        $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
        $token = $self->_get_next_token;
        return;
      } else {
        ## Stay in the state.
        $token = $self->_get_next_token;
        redo B;
      }
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{open_elements}->[-1]->[0]->append_child ($comment);
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{open_elements}->[-1]->[0]->append_child ($pi);

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
                      token => $token);
      
      $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
      $token = $self->_get_next_token;
      return;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      die "$0: XML parser initial: Unknown token type $token->{type}";
    }
  } # B
} # _tree_in_element

sub _tree_after_root_element ($) {
  my $self = shift;

  B: while (1) {
    if ($token->{type} == START_TAG_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
                      token => $token);

      ## XML5: Ignore the token.
      
      my $ns; ## TODO:
      my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
      $self->{document}->append_child ($el);
      
      if ($token->{self_closing}) {
        delete $self->{self_closing};
        ## Stay in the mode.
      } else {
        push @{$self->{open_elements}}, [$el, $token->{tag_name}];
        $self->{insertion_mode} = IN_ELEMENT_IM;
      }
      
      #delete $self->{tainted};

      $token = $self->_get_next_token;
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->create_comment ($token->{data});
      $self->{document}->append_child ($comment);
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == PI_TOKEN) {
      my $pi = $self->{document}->create_processing_instruction
          ($token->{target}, $token->{data});
      $self->{document}->append_child ($pi);

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if (not $self->{tainted} and
          $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        #
      }
      
      if (length $token->{data}) {
        ## XML5: Ignore the token.

        unless ($self->{tainted}) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
                          token => $token);
          $self->{tainted} = 1;
        }

        $self->{document}->manakai_append_text ($token->{data});
      }

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == END_OF_FILE_TOKEN) {
      ## Stop parsing.

      ## TODO: implement "stop parsing".

      $token = {type => ABORT_TOKEN};
      return;
    } elsif ($token->{type} == END_TAG_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
                      text => $token->{tag_name},
                      token => $token);
      ## Ignore the token.

      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == DOCTYPE_TOKEN) {
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
                      token => $token);
      ## Ignore the token.
      
      ## Stay in the mode.
      $token = $self->_get_next_token;
      next B;
    } elsif ($token->{type} == ABORT_TOKEN) {
      return;
    } else {
      die "$0: XML parser initial: Unknown token type $token->{type}";
    }
  } # B
} # _tree_after_root_element

}

1;
1	package Whatpm::XML::Parser;
2	use strict;
3
4	push our @ISA, 'Whatpm::HTML';
5	use Whatpm::HTML::Tokenizer qw/:token/;
6
7	sub parse_char_string ($$$;$$) {
8	#my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9	my $self = shift;
10	my $s = ref $_[0] ? $_[0] : \($_[0]);
11	require Whatpm::Charset::DecodeHandle;
12	my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13	return $self->parse_char_stream ($input, @_[1..$#_]);
14	} # parse_char_string
15
16	sub parse_char_stream ($$$;$$) {
17	my $self = ref $_[0] ? shift : shift->new;
18	my $input = $_[0];
19	$self->{document} = $_[1];
20	@{$self->{document}->child_nodes} = ();
21
22	## NOTE: \|set_inner_html\| copies most of this method's code
23
24	$self->{confident} = 1 unless exists $self->{confident};
25	$self->{document}->input_encoding ($self->{input_encoding})
26	if defined $self->{input_encoding};
27	## TODO: \|{input_encoding}\| is needless?
28
29	$self->{line_prev} = $self->{line} = 1;
30	$self->{column_prev} = -1;
31	$self->{column} = 0;
32	$self->{set_nc} = sub {
33	my $self = shift;
34
35	my $char = '';
36	if (defined $self->{next_nc}) {
37	$char = $self->{next_nc};
38	delete $self->{next_nc};
39	$self->{nc} = ord $char;
40	} else {
41	$self->{char_buffer} = '';
42	$self->{char_buffer_pos} = 0;
43
44	my $count = $input->manakai_read_until
45	($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46	if ($count) {
47	$self->{line_prev} = $self->{line};
48	$self->{column_prev} = $self->{column};
49	$self->{column}++;
50	$self->{nc}
51	= ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52	return;
53	}
54
55	if ($input->read ($char, 1)) {
56	$self->{nc} = ord $char;
57	} else {
58	$self->{nc} = -1;
59	return;
60	}
61	}
62
63	($self->{line_prev}, $self->{column_prev})
64	= ($self->{line}, $self->{column});
65	$self->{column}++;
66
67	if ($self->{nc} == 0x000A) { # LF
68
69	$self->{line}++;
70	$self->{column} = 0;
71	} elsif ($self->{nc} == 0x000D) { # CR
72
73	## TODO: support for abort/streaming
74	my $next = '';
75	if ($input->read ($next, 1) and $next ne "\x0A") {
76	$self->{next_nc} = $next;
77	}
78	$self->{nc} = 0x000A; # LF # MUST
79	$self->{line}++;
80	$self->{column} = 0;
81	} elsif ($self->{nc} == 0x0000) { # NULL
82
83	$self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84	$self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85	}
86	};
87
88	$self->{read_until} = sub {
89	#my ($scalar, $specials_range, $offset) = @_;
90	return 0 if defined $self->{next_nc};
91
92	my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93	my $offset = $_[2] \|\| 0;
94
95	if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96	pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97	if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98	substr ($_[0], $offset)
99	= substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100	my $count = $+[0] - $-[0];
101	if ($count) {
102	$self->{column} += $count;
103	$self->{char_buffer_pos} += $count;
104	$self->{line_prev} = $self->{line};
105	$self->{column_prev} = $self->{column} - 1;
106	$self->{nc} = -1;
107	}
108	return $count;
109	} else {
110	return 0;
111	}
112	} else {
113	my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114	if ($count) {
115	$self->{column} += $count;
116	$self->{line_prev} = $self->{line};
117	$self->{column_prev} = $self->{column} - 1;
118	$self->{nc} = -1;
119	}
120	return $count;
121	}
122	}; # $self->{read_until}
123
124	my $onerror = $_[2] \|\| sub {
125	my (%opt) = @_;
126	my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127	my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128	warn "Parse error ($opt{type}) at line $line column $column\n";
129	};
130	$self->{parse_error} = sub {
131	$onerror->(line => $self->{line}, column => $self->{column}, @_);
132	};
133
134	my $char_onerror = sub {
135	my (undef, $type, %opt) = @_;
136	$self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137	line => $self->{line}, column => $self->{column} + 1,
138	%opt, type => $type);
139	}; # $char_onerror
140
141	if ($_[3]) {
142	$input = $_[3]->($input);
143	$input->onerror ($char_onerror);
144	} else {
145	$input->onerror ($char_onerror) unless defined $input->onerror;
146	}
147
148	$self->_initialize_tokenizer;
149	$self->_initialize_tree_constructor;
150	$self->_construct_tree;
151	$self->_terminate_tree_constructor;
152
153	delete $self->{parse_error}; # remove loop
154
155	return $self->{document};
156	} # parse_char_stream
157
158	sub new ($) {
159	my $class = shift;
160	my $self = bless {
161	level => {must => 'm',
162	should => 's',
163	warn => 'w',
164	info => 'i',
165	uncertain => 'u'},
166	}, $class;
167	$self->{set_nc} = sub {
168	$self->{nc} = -1;
169	};
170	$self->{parse_error} = sub {
171	#
172	};
173	$self->{change_encoding} = sub {
174	# if ($_[0] is a supported encoding) {
175	# run "change the encoding" algorithm;
176	# throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177	# }
178	};
179	$self->{application_cache_selection} = sub {
180	#
181	};
182	return $self;
183	} # new
184
185	sub _initialize_tree_constructor ($) {
186	my $self = shift;
187	## NOTE: $self->{document} MUST be specified before this method is called
188	$self->{document}->strict_error_checking (0);
189	## TODO: Turn mutation events off # MUST
190	$self->{document}->dom_config
191	->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
192	= 0;
193	$self->{document}->manakai_is_html (0);
194	$self->{document}->set_user_data (manakai_source_line => 1);
195	$self->{document}->set_user_data (manakai_source_column => 1);
196	} # _initialize_tree_constructor
197
198	sub _terminate_tree_constructor ($) {
199	my $self = shift;
200	$self->{document}->strict_error_checking (1);
201	$self->{document}->dom_config
202	->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
203	= 1;
204	## TODO: Turn mutation events on
205	} # _terminate_tree_constructor
206
207	## Tree construction stage
208
209
210	## NOTE: Differences from the XML5 draft are marked as "XML5:".
211
212	## XML5: No namespace support.
213
214	## XML5: XML5 has "empty tag token". In this implementation, it is
215	## represented as a start tag token with $token->{self_closing} flag
216	## set to true.
217
218	## XML5: XML5 has "short end tag token". In this implementation, it
219	## is represented as an end tag token with $token->{tag_name} flag set
220	## to an empty string.
221
222	## XML5: Start, main, end phases. In this implementation, they are
223	## represented by insertion modes.
224
225	## Insertion modes
226	sub INITIAL_IM () { 0 }
227	sub BEFORE_ROOT_ELEMENT_IM () { 1 }
228	sub IN_ELEMENT_IM () { 2 }
229	sub AFTER_ROOT_ELEMENT_IM () { 3 }
230
231	{
232	my $token; ## TODO: change to $self->{t}
233
234	sub _construct_tree ($) {
235	my ($self) = @_;
236
237	$token = $self->_get_next_token;
238
239	delete $self->{tainted};
240	$self->{open_elements} = [];
241	$self->{insertion_mode} = INITIAL_IM;
242
243	while (1) {
244	if ($self->{insertion_mode} == IN_ELEMENT_IM) {
245	$self->_tree_in_element;
246	} elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
247	$self->_tree_after_root_element;
248	} elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
249	$self->_tree_before_root_element;
250	} elsif ($self->{insertion_mode} == INITIAL_IM) {
251	$self->_tree_initial;
252	} else {
253	die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
254	}
255
256	last if $token->{type} == ABORT_TOKEN;
257	}
258	} # _construct_tree
259
260	sub _tree_initial ($) {
261	my $self = shift;
262
263	B: while (1) {
264	if ($token->{type} == DOCTYPE_TOKEN) {
265	## XML5: No "DOCTYPE" token.
266
267	my $doctype = $self->{document}->create_document_type_definition
268	(defined $token->{name} ? $token->{name} : '');
269
270	## NOTE: Default value for both \|public_id\| and \|system_id\| attributes
271	## are empty strings, so that we don't set any value in missing cases.
272	$doctype->public_id ($token->{public_identifier})
273	if defined $token->{public_identifier};
274	$doctype->system_id ($token->{system_identifier})
275	if defined $token->{system_identifier};
276
277	## TODO: internal_subset
278
279	$self->{document}->append_child ($doctype);
280
281	$self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
282	$token = $self->_get_next_token;
283	return;
284	} elsif ($token->{type} == START_TAG_TOKEN or
285	$token->{type} == END_OF_FILE_TOKEN) {
286	$self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
287	## Reprocess.
288	return;
289	} elsif ($token->{type} == COMMENT_TOKEN) {
290	my $comment = $self->{document}->create_comment ($token->{data});
291	$self->{document}->append_child ($comment);
292
293	## Stay in the mode.
294	$token = $self->_get_next_token;
295	next B;
296	} elsif ($token->{type} == PI_TOKEN) {
297	my $pi = $self->{document}->create_processing_instruction
298	($token->{target}, $token->{data});
299	$self->{document}->append_child ($pi);
300
301	## Stay in the mode.
302	$token = $self->_get_next_token;
303	next B;
304	} elsif ($token->{type} == CHARACTER_TOKEN) {
305	if (not $self->{tainted} and
306	$token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
307	#
308	}
309
310	if (length $token->{data}) {
311	## XML5: Ignore the token.
312
313	unless ($self->{tainted}) {
314	$self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
315	token => $token);
316	$self->{tainted} = 1;
317	}
318
319	$self->{document}->manakai_append_text ($token->{data});
320	}
321
322	## Stay in the mode.
323	$token = $self->_get_next_token;
324	next B;
325	} elsif ($token->{type} == END_TAG_TOKEN) {
326	$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
327	text => $token->{tag_name},
328	token => $token);
329	## Ignore the token.
330
331	## Stay in the mode.
332	$token = $self->_get_next_token;
333	next B;
334	} elsif ($token->{type} == ABORT_TOKEN) {
335	return;
336	} else {
337	die "$0: XML parser initial: Unknown token type $token->{type}";
338	}
339	} # B
340	} # _tree_initial
341
342	sub _tree_before_root_element ($) {
343	my $self = shift;
344
345	B: while (1) {
346	if ($token->{type} == START_TAG_TOKEN) {
347	my $ns; ## TODO:
348	my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
349	$self->{document}->append_child ($el);
350
351	if ($token->{self_closing}) {
352	delete $self->{self_closing};
353	$self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
354	} else {
355	push @{$self->{open_elements}}, [$el, $token->{tag_name}];
356	$self->{insertion_mode} = IN_ELEMENT_IM;
357	}
358
359	#delete $self->{tainted};
360
361	$token = $self->_get_next_token;
362	return;
363	} elsif ($token->{type} == COMMENT_TOKEN) {
364	my $comment = $self->{document}->create_comment ($token->{data});
365	$self->{document}->append_child ($comment);
366
367	## Stay in the mode.
368	$token = $self->_get_next_token;
369	next B;
370	} elsif ($token->{type} == PI_TOKEN) {
371	my $pi = $self->{document}->create_processing_instruction
372	($token->{target}, $token->{data});
373	$self->{document}->append_child ($pi);
374
375	## Stay in the mode.
376	$token = $self->_get_next_token;
377	next B;
378	} elsif ($token->{type} == CHARACTER_TOKEN) {
379	if (not $self->{tainted} and
380	$token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
381	#
382	}
383
384	if (length $token->{data}) {
385	## XML5: Ignore the token.
386
387	unless ($self->{tainted}) {
388	$self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
389	token => $token);
390	$self->{tainted} = 1;
391	}
392
393	$self->{document}->manakai_append_text ($token->{data});
394	}
395
396	## Stay in the mode.
397	$token = $self->_get_next_token;
398	next B;
399	} elsif ($token->{type} == END_OF_FILE_TOKEN) {
400	$self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
401	token => $token);
402
403	$self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
404	## Reprocess.
405	return;
406	} elsif ($token->{type} == END_TAG_TOKEN) {
407	$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
408	text => $token->{tag_name},
409	token => $token);
410	## Ignore the token.
411
412	## Stay in the mode.
413	$token = $self->_get_next_token;
414	next B;
415	} elsif ($token->{type} == DOCTYPE_TOKEN) {
416	$self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
417	token => $token);
418	## Ignore the token.
419
420	## Stay in the mode.
421	$token = $self->_get_next_token;
422	next B;
423	} elsif ($token->{type} == ABORT_TOKEN) {
424	return;
425	} else {
426	die "$0: XML parser initial: Unknown token type $token->{type}";
427	}
428	} # B
429	} # _tree_before_root_element
430
431	sub _tree_in_element ($) {
432	my $self = shift;
433
434	B: while (1) {
435	if ($token->{type} == CHARACTER_TOKEN) {
436	$self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
437
438	## Stay in the mode.
439	$token = $self->_get_next_token;
440	next B;
441	} elsif ($token->{type} == START_TAG_TOKEN) {
442	my $ns; ## TODO:
443	my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
444	$self->{open_elements}->[-1]->[0]->append_child ($el);
445
446	if ($token->{self_closing}) {
447	delete $self->{self_closing};
448	} else {
449	push @{$self->{open_elements}}, [$el, $token->{tag_name}];
450	}
451
452	## Stay in the mode.
453	$token = $self->_get_next_token;
454	next B;
455	} elsif ($token->{type} == END_TAG_TOKEN) {
456	if ($self->{tag_name} eq '') {
457	## Short end tag token.
458	pop @{$self->{open_elements}};
459	} elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
460	pop @{$self->{open_elements}};
461	} else {
462	$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
463	text => $token->{tag_name},
464	token => $token);
465
466	## Has an element in scope
467	INSCOPE: for my $i (reverse $#{$self->{open_elements}}..0) {
468	if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
469	splice @{$self->{open_elements}}, $i;
470	last INSCOPE;
471	}
472	} # INSCOPE
473	}
474
475	unless (@{$self->{open_elements}}) {
476	$self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
477	$token = $self->_get_next_token;
478	return;
479	} else {
480	## Stay in the state.
481	$token = $self->_get_next_token;
482	redo B;
483	}
484	} elsif ($token->{type} == COMMENT_TOKEN) {
485	my $comment = $self->{document}->create_comment ($token->{data});
486	$self->{open_elements}->[-1]->[0]->append_child ($comment);
487
488	## Stay in the mode.
489	$token = $self->_get_next_token;
490	next B;
491	} elsif ($token->{type} == PI_TOKEN) {
492	my $pi = $self->{document}->create_processing_instruction
493	($token->{target}, $token->{data});
494	$self->{open_elements}->[-1]->[0]->append_child ($pi);
495
496	## Stay in the mode.
497	$token = $self->_get_next_token;
498	next B;
499	} elsif ($token->{type} == END_OF_FILE_TOKEN) {
500	$self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
501	token => $token);
502
503	$self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
504	$token = $self->_get_next_token;
505	return;
506	} elsif ($token->{type} == DOCTYPE_TOKEN) {
507	$self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
508	token => $token);
509	## Ignore the token.
510
511	## Stay in the mode.
512	$token = $self->_get_next_token;
513	next B;
514	} elsif ($token->{type} == ABORT_TOKEN) {
515	return;
516	} else {
517	die "$0: XML parser initial: Unknown token type $token->{type}";
518	}
519	} # B
520	} # _tree_in_element
521
522	sub _tree_after_root_element ($) {
523	my $self = shift;
524
525	B: while (1) {
526	if ($token->{type} == START_TAG_TOKEN) {
527	$self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
528	token => $token);
529
530	## XML5: Ignore the token.
531
532	my $ns; ## TODO:
533	my $el = $self->{document}->create_element_ns ($ns, $token->{tag_name});
534	$self->{document}->append_child ($el);
535
536	if ($token->{self_closing}) {
537	delete $self->{self_closing};
538	## Stay in the mode.
539	} else {
540	push @{$self->{open_elements}}, [$el, $token->{tag_name}];
541	$self->{insertion_mode} = IN_ELEMENT_IM;
542	}
543
544	#delete $self->{tainted};
545
546	$token = $self->_get_next_token;
547	return;
548	} elsif ($token->{type} == COMMENT_TOKEN) {
549	my $comment = $self->{document}->create_comment ($token->{data});
550	$self->{document}->append_child ($comment);
551
552	## Stay in the mode.
553	$token = $self->_get_next_token;
554	next B;
555	} elsif ($token->{type} == PI_TOKEN) {
556	my $pi = $self->{document}->create_processing_instruction
557	($token->{target}, $token->{data});
558	$self->{document}->append_child ($pi);
559
560	## Stay in the mode.
561	$token = $self->_get_next_token;
562	next B;
563	} elsif ($token->{type} == CHARACTER_TOKEN) {
564	if (not $self->{tainted} and
565	$token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
566	#
567	}
568
569	if (length $token->{data}) {
570	## XML5: Ignore the token.
571
572	unless ($self->{tainted}) {
573	$self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
574	token => $token);
575	$self->{tainted} = 1;
576	}
577
578	$self->{document}->manakai_append_text ($token->{data});
579	}
580
581	## Stay in the mode.
582	$token = $self->_get_next_token;
583	next B;
584	} elsif ($token->{type} == END_OF_FILE_TOKEN) {
585	## Stop parsing.
586
587	## TODO: implement "stop parsing".
588
589	$token = {type => ABORT_TOKEN};
590	return;
591	} elsif ($token->{type} == END_TAG_TOKEN) {
592	$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
593	text => $token->{tag_name},
594	token => $token);
595	## Ignore the token.
596
597	## Stay in the mode.
598	$token = $self->_get_next_token;
599	next B;
600	} elsif ($token->{type} == DOCTYPE_TOKEN) {
601	$self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
602	token => $token);
603	## Ignore the token.
604
605	## Stay in the mode.
606	$token = $self->_get_next_token;
607	next B;
608	} elsif ($token->{type} == ABORT_TOKEN) {
609	return;
610	} else {
611	die "$0: XML parser initial: Unknown token type $token->{type}";
612	}
613	} # B
614	} # _tree_after_root_element
615
616	}
617
618	1;