whatpm/t/HTML-tree.t

#!/usr/bin/perl
use strict;

## Debug switch, taken from the |DEBUG| environment variable.  When it
## is true, the script records which of the checkpoint ids listed in
## |$Whatpm::HTML::Debug::cp| are passed and reports the ones that were
## never reached when the script ends.
my $DEBUG = $ENV{DEBUG};

use lib qw[/home/wakaba/work/manakai2/lib];

my $dir_name;
my $test_dir_name;
## Set the test data locations and make sure the tree-construction data
## files are available before any test is planned.  If they are missing
## and cannot be generated by |make|, a single skipped test is reported
## and the script exits.
BEGIN {
  $test_dir_name = 't/';
  $dir_name = 't/tree-construction/';
  my $skip = "You don't have make command";
  ## |make| is only run when the first data file is not there yet.
  ## |&&| keeps |make| from running in the wrong directory when the
  ## |cd| fails.
  if (-f $dir_name.'tests1.dat' or
      system ("cd $test_dir_name && make tree-construction-files") == 0) {
    $skip = '';
  }
  if ($skip) {
    print "1..1\n";
    ## The |skip| directive makes the harness count this test as
    ## skipped rather than as an ordinary pass.
    print "ok 1 # skip $skip\n";
    exit;
  }
}

use Test;
BEGIN { plan tests => 3105 }

use Data::Dumper;
$Data::Dumper::Useqq = 1;
## Replacement for |Data::Dumper::qquote|: returns the argument wrapped
## in |qq'...'|, with every character other than SPACE and the printable
## ASCII characters (excluding |'| and |\|) written as |\x{HH}|.
sub Data::Dumper::qquote {
  my ($text) = @_;
  $text =~ s{([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])}{
    sprintf '\x{%02X}', ord $1;
  }ge;
  return join '', q<qq'>, $text, q<'>;
} # Data::Dumper::qquote


## In debug mode, keep track of the checkpoints that have not been
## passed yet and list whatever is left when the script ends.
if ($DEBUG) {
  ## Start from a shallow copy of the known checkpoints (if any); each
  ## checkpoint that is reported as passed is removed from the copy.
  my $unreached = { %{ $Whatpm::HTML::Debug::cp || {} } };
  $Whatpm::HTML::Debug::cp_pass = sub {
    my ($passed_id) = @_;
    delete $unreached->{$passed_id};
  };

  ## The |END| block is compiled even when |$DEBUG| is false; in that
  ## case |$unreached| was never assigned and nothing is printed.
  END {
    for my $id (sort { $a <=> $b || $a cmp $b } keys %$unreached) {
      print "# checkpoint $id is not reached\n";
    }
  }
}

## Decode the |\uHHHH| and |\UHHHHHHHH| escapes of an "escaped" section
## into the characters they denote.  |\u| escapes are expanded before
## |\U| escapes.  An undefined argument is returned unchanged.
my $unescape = sub {
  my $s = shift;
  $s =~ s/\\u([0-9A-Fa-f]{4})/chr hex $1/ge;
  $s =~ s/\\U([0-9A-Fa-f]{8})/chr hex $1/ge;
  return $s;
}; # $unescape

## Read every test data file line by line, collect the sections of each
## test into a hash (|data|, |errors|, |shoulds|, |document|, |element|)
## and hand the hash to |test| once the test is complete.
for my $file_name (grep {$_} split /\s+/, qq[
                      ${test_dir_name}tokenizer-test-2.dat
                      ${test_dir_name}tokenizer-test-3.dat
                      ${dir_name}tests1.dat
                      ${dir_name}tests2.dat
                      ${dir_name}tests3.dat
                      ${dir_name}tests4.dat
                      ${dir_name}tests5.dat
                      ${dir_name}tests6.dat
                      ${dir_name}tests7.dat
                      ${dir_name}tests8.dat
                      ${dir_name}tests9.dat
                      ${dir_name}tests10.dat
                      ${dir_name}tests11.dat
                      ${dir_name}tests12.dat
                      ${test_dir_name}tree-test-1.dat
                      ${test_dir_name}tree-test-2.dat
                      ${test_dir_name}tree-test-3.dat
                      ${test_dir_name}tree-test-void.dat
                      ${test_dir_name}tree-test-flow.dat
                      ${test_dir_name}tree-test-phrasing.dat
                      ${test_dir_name}tree-test-form.dat
                      ${test_dir_name}tree-test-foreign.dat
                     ]) {
  open my $file, '<', $file_name
    or die "$0: $file_name: $!";
  print "# $file_name\n";

  my $test;            ## The test being collected
  my $mode = 'data';   ## Section that plain lines are appended to
  my $escaped;         ## True while the current section is "escaped"
  while (<$file>) {
    s/\x0D\x0A/\x0A/;
    if (/^#data$/) {
      ## A |#data| line always starts a new test.
      undef $test;
      $test->{data} = '';
      $mode = 'data';
      undef $escaped;
    } elsif (/^#data escaped$/) {
      undef $test;
      $test->{data} = '';
      $mode = 'data';
      $escaped = 1;
    } elsif (/^#errors$/) {
      $test->{errors} = [];
      $mode = 'errors';
      ## The input is complete now: drop the newline that only ends
      ## its last line and expand escapes if requested.
      $test->{data} =~ s/\x0D?\x0A\z//;
      $test->{data} = $unescape->($test->{data}) if $escaped;
      undef $escaped;
    } elsif (/^#shoulds$/) {
      $test->{shoulds} = [];
      $mode = 'shoulds';
    } elsif (/^#document$/) {
      $test->{document} = '';
      $mode = 'document';
      undef $escaped;
    } elsif (/^#document escaped$/) {
      $test->{document} = '';
      $mode = 'document';
      $escaped = 1;
    } elsif (/^#document-fragment$/) {
      ## The context element name follows on the next line(s).
      $test->{element} = '';
      $mode = 'element';
      undef $escaped;
    } elsif (/^#document-fragment (\S+)$/) {
      ## The context element name is given on the header line itself
      ## and the expected tree follows directly.
      $test->{document} = '';
      $mode = 'document';
      $test->{element} = $1;
      undef $escaped;
    } elsif (/^#document-fragment (\S+) escaped$/) {
      $test->{document} = '';
      $mode = 'document';
      $test->{element} = $1;
      $escaped = 1;
    } elsif (defined $test->{document} and /^$/) {
      ## An empty line after the expected tree ends the test.
      $test->{document} = $unescape->($test->{document}) if $escaped;
      test ($test);
      undef $test;
    } else {
      if ($mode eq 'data' or $mode eq 'document') {
        $test->{$mode} .= $_;
      } elsif ($mode eq 'element') {
        tr/\x0D\x0A//d;
        $test->{$mode} .= $_;
      } elsif ($mode eq 'errors') {
        tr/\x0D\x0A//d;
        push @{$test->{errors}}, $_;
      } elsif ($mode eq 'shoulds') {
        tr/\x0D\x0A//d;
        push @{$test->{shoulds}}, $_;
      }
    }
  }

  ## The last test of a file need not be followed by an empty line.
  ## Its expected tree gets the same escape decoding as a test that is
  ## ended by an empty line.
  if ($test->{errors}) {
    $test->{document} = $unescape->($test->{document})
        if $escaped and defined $test->{document};
    test ($test);
  }
}

use Whatpm::HTML;
use Whatpm::NanoDOM;
use Whatpm::Charset::UnicodeChecker;

## Run one test.  The argument is a hash reference holding the input
## (|data|), the expected error lists (|errors|, optional |shoulds|),
## the expected serialized tree (|document|) and, for fragment tests,
## the local name of the context element (|element|).  Three |ok|
## checks are made: the number of errors, the number of SHOULD-level
## errors, and the serialized tree.
sub test ($) {
  my $test = shift;

  my $doc = Whatpm::NanoDOM::Document->new;
  my (@got_errors, @got_shoulds);

  ## On interrupt, dump whatever has been built so far and stop.
  $SIG{INT} = sub {
    print scalar serialize ($doc);
    exit;
  };

  ## Errors whose level is |s| are collected as SHOULD-level errors;
  ## everything else is collected as an error.
  my $onerror = sub {
    my %err = @_;
    my $entry = join ':', $err{line}, $err{column}, $err{type};
    my $bucket = $err{level} eq 's' ? \@got_shoulds : \@got_errors;
    push @$bucket, $entry;
  };

  my $chk = sub {
    return Whatpm::Charset::UnicodeChecker->new_handle ($_[0], 'html5');
  }; # $chk

  my $result;
  if (defined $test->{element}) {
    ## Fragment test: parse the input as the content of an XHTML
    ## namespace element with the given local name.
    my $el = $doc->create_element_ns
      ('http://www.w3.org/1999/xhtml', [undef, $test->{element}]);
    Whatpm::HTML->set_inner_html ($el, $test->{data}, $onerror, $chk);
    $result = serialize ($el);
  } else {
    Whatpm::HTML->parse_char_string ($test->{data} => $doc, $onerror, $chk);
    $result = serialize ($doc);
  }

  my $input = Data::Dumper::qquote ($test->{data});

  my @want_errors = @{$test->{errors}};
  ok scalar @got_errors, scalar @want_errors,
    'Parse error: ' . $input . '; ' .
    join (', ', @got_errors) . ';' . join (', ', @want_errors);

  my @want_shoulds = @{$test->{shoulds} or []};
  ok scalar @got_shoulds, scalar @want_shoulds,
    'SHOULD-level error: ' . $input . '; ' .
    join (', ', @got_shoulds) . ';' . join (', ', @want_shoulds);

  ok $result, $test->{document},
      'Document tree: ' . $input;
} # test

## Serialize the descendants of a node (the node itself is not
## included) in document order, one line per node or attribute.  Each
## line starts with "| " followed by two spaces per nesting level.
## Attributes are listed below their element, sorted by name.
sub serialize ($) {
  my $root = shift;
  my @lines;

  ## Nodes still to be written, next one last; children are pushed in
  ## reverse so that they come off the stack in document order.
  my @stack = map { [$_, ''] } reverse @{$root->child_nodes};
  while (@stack) {
    my ($node, $indent) = @{pop @stack};
    my $lead = '| ' . $indent;
    my $type = $node->node_type;
    if ($type == $node->ELEMENT_NODE) {
      push @lines, $lead . '<' . $node->tag_name . ">\x0A"; ## ISSUE: case?

      my @pairs = map { [$_->name, $_->value] } @{$node->attributes};
      for my $pair (sort {$a->[0] cmp $b->[0]} @pairs) {
        ## ISSUE: case?
        push @lines,
          $lead . '  ' . $pair->[0] . '="' . $pair->[1] . '"' . "\x0A";
      }

      push @stack,
        map { [$_, $indent . '  '] } reverse @{$node->child_nodes};
    } elsif ($type == $node->TEXT_NODE) {
      push @lines, $lead . '"' . $node->data . '"' . "\x0A";
    } elsif ($type == $node->COMMENT_NODE) {
      push @lines, $lead . '<!-- ' . $node->data . " -->\x0A";
    } elsif ($type == $node->DOCUMENT_TYPE_NODE) {
      my $decl = $lead . '<!DOCTYPE ' . $node->name;
      my $pubid = $node->public_id;
      my $sysid = $node->system_id;
      ## |SYSTEM| is only written when there is a system identifier
      ## but no public identifier.
      if (length $pubid) {
        $decl .= ' PUBLIC "' . $pubid . '"';
      } elsif (length $sysid) {
        $decl .= ' SYSTEM';
      }
      $decl .= ' "' . $sysid . '"' if length $sysid;
      push @lines, $decl . ">\x0A";
    } else {
      ## Unexpected kind of node: show its node type number.
      push @lines, $lead . $node->node_type . "\x0A"; # error
    }
  }

  return join '', @lines;
} # serialize

## License: Public Domain.
## $Date: 2008/10/04 12:20:36 $
1	wakaba	1.1	#!/usr/bin/perl
2			use strict;
3
4	wakaba	1.23	my $DEBUG = $ENV{DEBUG};
5	wakaba	1.33
6	wakaba	1.32	use lib qw[/home/wakaba/work/manakai2/lib];
7	wakaba	1.33
8	wakaba	1.1	my $dir_name;
9	wakaba	1.2	my $test_dir_name;
10	wakaba	1.1	BEGIN {
11	wakaba	1.2	$test_dir_name = 't/';
12	wakaba	1.1	$dir_name = 't/tree-construction/';
13			my $skip = "You don't have make command";
14			eval q{
15			system ("cd $test_dir_name; make tree-construction-files") == 0 or die
16			unless -f $dir_name.'tests1.dat';
17			$skip = '';
18			};
19			if ($skip) {
20			print "1..1\n";
21			print "ok 1 # $skip\n";
22			exit;
23			}
24			}
25
26			use Test;
27	wakaba	1.30	BEGIN { plan tests => 3105 }
28	wakaba	1.1
29			use Data::Dumper;
30			$Data::Dumper::Useqq = 1;
31			sub Data::Dumper::qquote {
32			my $s = shift;
33			$s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
34			return q<qq'> . $s . q<'>;
35			} # Data::Dumper::qquote
36
37	wakaba	1.23
38			if ($DEBUG) {
39	wakaba	1.25	my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};
40	wakaba	1.23	$Whatpm::HTML::Debug::cp_pass = sub {
41			my $id = shift;
42			delete $not_found->{$id};
43			};
44
45			END {
46			for my $id (sort {$a <=> $b \|\| $a cmp $b} keys %$not_found) {
47			print "# checkpoint $id is not reached\n";
48			}
49			}
50			}
51
52	wakaba	1.2	for my $file_name (grep {$_} split /\s+/, qq[
53	wakaba	1.12	${test_dir_name}tokenizer-test-2.dat
54	wakaba	1.31	${test_dir_name}tokenizer-test-3.dat
55	wakaba	1.2	${dir_name}tests1.dat
56			${dir_name}tests2.dat
57			${dir_name}tests3.dat
58			${dir_name}tests4.dat
59	wakaba	1.11	${dir_name}tests5.dat
60			${dir_name}tests6.dat
61	wakaba	1.27	${dir_name}tests7.dat
62	wakaba	1.38	${dir_name}tests8.dat
63			${dir_name}tests9.dat
64			${dir_name}tests10.dat
65			${dir_name}tests11.dat
66			${dir_name}tests12.dat
67	wakaba	1.2	${test_dir_name}tree-test-1.dat
68	wakaba	1.14	${test_dir_name}tree-test-2.dat
69	wakaba	1.29	${test_dir_name}tree-test-3.dat
70	wakaba	1.34	${test_dir_name}tree-test-void.dat
71	wakaba	1.35	${test_dir_name}tree-test-flow.dat
72			${test_dir_name}tree-test-phrasing.dat
73	wakaba	1.37	${test_dir_name}tree-test-form.dat
74	wakaba	1.36	${test_dir_name}tree-test-foreign.dat
75	wakaba	1.1	]) {
76	wakaba	1.2	open my $file, '<', $file_name
77			or die "$0: $file_name: $!";
78	wakaba	1.13	print "# $file_name\n";
79	wakaba	1.1
80			my $test;
81			my $mode = 'data';
82	wakaba	1.12	my $escaped;
83	wakaba	1.1	while (<$file>) {
84			s/\x0D\x0A/\x0A/;
85			if (/^#data$/) {
86			undef $test;
87			$test->{data} = '';
88			$mode = 'data';
89	wakaba	1.12	undef $escaped;
90			} elsif (/^#data escaped$/) {
91			undef $test;
92			$test->{data} = '';
93			$mode = 'data';
94			$escaped = 1;
95	wakaba	1.1	} elsif (/^#errors$/) {
96			$test->{errors} = [];
97			$mode = 'errors';
98	wakaba	1.13	$test->{data} =~ s/\x0D?\x0A\z//;
99			$test->{data} =~ s/\\u([0-9A-Fa-f]{4})/chr hex $1/ge if $escaped;
100	wakaba	1.24	$test->{data} =~ s/\\U([0-9A-Fa-f]{8})/chr hex $1/ge if $escaped;
101	wakaba	1.12	undef $escaped;
102	wakaba	1.30	} elsif (/^#shoulds$/) {
103			$test->{shoulds} = [];
104			$mode = 'shoulds';
105	wakaba	1.1	} elsif (/^#document$/) {
106			$test->{document} = '';
107			$mode = 'document';
108	wakaba	1.12	undef $escaped;
109			} elsif (/^#document escaped$/) {
110			$test->{document} = '';
111			$mode = 'document';
112			$escaped = 1;
113	wakaba	1.19	} elsif (/^#document-fragment$/) {
114			$test->{element} = '';
115			$mode = 'element';
116			undef $escaped;
117	wakaba	1.5	} elsif (/^#document-fragment (\S+)$/) {
118			$test->{document} = '';
119			$mode = 'document';
120			$test->{element} = $1;
121	wakaba	1.12	undef $escaped;
122			} elsif (/^#document-fragment (\S+) escaped$/) {
123			$test->{document} = '';
124			$mode = 'document';
125			$test->{element} = $1;
126			$escaped = 1;
127	wakaba	1.2	} elsif (defined $test->{document} and /^$/) {
128	wakaba	1.13	$test->{document} =~ s/\\u([0-9A-Fa-f]{4})/chr hex $1/ge if $escaped;
129	wakaba	1.24	$test->{document} =~ s/\\U([0-9A-Fa-f]{8})/chr hex $1/ge if $escaped;
130	wakaba	1.2	test ($test);
131	wakaba	1.1	undef $test;
132			} else {
133			if ($mode eq 'data' or $mode eq 'document') {
134	wakaba	1.13	$test->{$mode} .= $_;
135	wakaba	1.19	} elsif ($mode eq 'element') {
136			tr/\x0D\x0A//d;
137			$test->{$mode} .= $_;
138	wakaba	1.1	} elsif ($mode eq 'errors') {
139			tr/\x0D\x0A//d;
140			push @{$test->{errors}}, $_;
141	wakaba	1.30	} elsif ($mode eq 'shoulds') {
142			tr/\x0D\x0A//d;
143			push @{$test->{shoulds}}, $_;
144	wakaba	1.1	}
145			}
146			}
147			test ($test) if $test->{errors};
148			}
149
150	wakaba	1.4	use Whatpm::HTML;
151			use Whatpm::NanoDOM;
152	wakaba	1.33	use Whatpm::Charset::UnicodeChecker;
153	wakaba	1.1
154			sub test ($) {
155			my $test = shift;
156
157	wakaba	1.4	my $doc = Whatpm::NanoDOM::Document->new;
158	wakaba	1.1	my @errors;
159	wakaba	1.30	my @shoulds;
160	wakaba	1.1
161			$SIG{INT} = sub {
162	wakaba	1.3	print scalar serialize ($doc);
163	wakaba	1.1	exit;
164			};
165	wakaba	1.3
166	wakaba	1.5	my $onerror = sub {
167			my %opt = @_;
168	wakaba	1.30	if ($opt{level} eq 's') {
169			push @shoulds, join ':', $opt{line}, $opt{column}, $opt{type};
170			} else {
171			push @errors, join ':', $opt{line}, $opt{column}, $opt{type};
172			}
173	wakaba	1.5	};
174	wakaba	1.33
175			my $chk = sub {
176			return Whatpm::Charset::UnicodeChecker->new_handle ($_[0], 'html5');
177			}; # $chk
178
179	wakaba	1.5	my $result;
180			unless (defined $test->{element}) {
181	wakaba	1.33	Whatpm::HTML->parse_char_string ($test->{data} => $doc, $onerror, $chk);
182	wakaba	1.5	$result = serialize ($doc);
183			} else {
184			my $el = $doc->create_element_ns
185			('http://www.w3.org/1999/xhtml', [undef, $test->{element}]);
186	wakaba	1.33	Whatpm::HTML->set_inner_html ($el, $test->{data}, $onerror, $chk);
187	wakaba	1.5	$result = serialize ($el);
188			}
189
190	wakaba	1.1	ok scalar @errors, scalar @{$test->{errors}},
191	wakaba	1.21	'Parse error: ' . Data::Dumper::qquote ($test->{data}) . '; ' .
192	wakaba	1.1	join (', ', @errors) . ';' . join (', ', @{$test->{errors}});
193	wakaba	1.30	ok scalar @shoulds, scalar @{$test->{shoulds} or []},
194			'SHOULD-level error: ' . Data::Dumper::qquote ($test->{data}) . '; ' .
195			join (', ', @shoulds) . ';' . join (', ', @{$test->{shoulds} or []});
196	wakaba	1.1
197	wakaba	1.21	ok $result, $test->{document},
198			'Document tree: ' . Data::Dumper::qquote ($test->{data});
199	wakaba	1.1	} # test
200
201			sub serialize ($) {
202			my $node = shift;
203			my $r = '';
204
205			my @node = map { [$_, ''] } @{$node->child_nodes};
206			while (@node) {
207			my $child = shift @node;
208			my $nt = $child->[0]->node_type;
209			if ($nt == $child->[0]->ELEMENT_NODE) {
210			$r .= '\| ' . $child->[1] . '<' . $child->[0]->tag_name . ">\x0A"; ## ISSUE: case?
211
212	wakaba	1.2	for my $attr (sort {$a->[0] cmp $b->[0]} map { [$_->name, $_->value] }
213	wakaba	1.1	@{$child->[0]->attributes}) {
214			$r .= '\| ' . $child->[1] . ' ' . $attr->[0] . '="'; ## ISSUE: case?
215			$r .= $attr->[1] . '"' . "\x0A";
216			}
217
218			unshift @node,
219			map { [$_, $child->[1] . ' '] } @{$child->[0]->child_nodes};
220			} elsif ($nt == $child->[0]->TEXT_NODE) {
221			$r .= '\| ' . $child->[1] . '"' . $child->[0]->data . '"' . "\x0A";
222			} elsif ($nt == $child->[0]->COMMENT_NODE) {
223			$r .= '\| ' . $child->[1] . '<!-- ' . $child->[0]->data . " -->\x0A";
224			} elsif ($nt == $child->[0]->DOCUMENT_TYPE_NODE) {
225	wakaba	1.24	$r .= '\| ' . $child->[1] . '<!DOCTYPE ' . $child->[0]->name;
226			my $pubid = $child->[0]->public_id;
227			$r .= ' PUBLIC "' . $pubid . '"' if length $pubid;
228			my $sysid = $child->[0]->system_id;
229			$r .= ' SYSTEM' if not length $pubid and length $sysid;
230			$r .= ' "' . $sysid . '"' if length $sysid;
231			$r .= ">\x0A";
232	wakaba	1.1	} else {
233			$r .= '\| ' . $child->[1] . $child->[0]->node_type . "\x0A"; # error
234			}
235			}
236
237			return $r;
238			} # serialize
239
240			## License: Public Domain.
241	wakaba	1.38	## $Date: 2008/10/04 12:20:36 $