whatpm/t/HTML-tokenizer.t

#!/usr/bin/perl
use strict;

my $dir_name;
my $test_dir_name;
## Compile-time setup: record where the test data lives, make sure the
## JSON module and the generated tokenizer test files are available, and
## otherwise emit a one-test plan whose only test is marked as skipped.
BEGIN {
  $test_dir_name = 't/';
  $dir_name = 't/tokenizer/';
  ## $skip always holds the reason that applies if the NEXT step of the
  ## eval below fails; it is emptied only after every step has succeeded.
  my $skip = "You don't have JSON module";
  ## A string eval is used so that a failing |use JSON| aborts only the
  ## eval instead of the compilation of the whole script.
  eval q{
         use JSON 1.00;
         $skip = "You don't have make command";
         system ("cd $test_dir_name; make tokenizer-files") == 0 or die
           unless -f $dir_name.'test1.test';
         $skip = '';
        };
  if ($skip) {
    print "1..1\n";
    ## "# skip" is the TAP directive; without it the reason would be an
    ## ordinary comment and the harness would count a plain pass.
    print "ok 1 # skip $skip\n";
    exit;
  }
  $JSON::UnMapping = 1;
  $JSON::UTF8 = 1;
}

use Test;
BEGIN { plan tests => 94 }

use Data::Dumper;
$Data::Dumper::Useqq = 1;
## Overrides Data::Dumper's qquote: returns the argument wrapped in
## qq'...' with every character other than SPACE and the printable
## ASCII characters (excluding "'" and "\") written as \x{HEX}.
sub Data::Dumper::qquote {
  my ($text) = @_;
  my $quoted = q<qq'>;
  for my $ch (split //, $text) {
    ## Characters in the safe set are copied verbatim; anything else
    ## (including "'", "\", controls and non-ASCII) is hex-escaped.
    $quoted .= $ch =~ /[\x20\x21-\x26\x28-\x5B\x5D-\x7E]/
        ? $ch
        : sprintf ('\x{%02X}', ord $ch);
  }
  $quoted .= q<'>;
  return $quoted;
} # Data::Dumper::qquote

use Whatpm::HTML;

## Main driver: for every listed JSON test file, feed each test input to
## the Whatpm::HTML tokenizer once per content model flag and compare the
## Data::Dumper rendering of the tokens obtained with that of the
## expected output.
for my $file_name (grep {$_} split /\s+/, qq[
                      ${dir_name}test1.test
                      ${dir_name}test2.test
                      ${dir_name}contentModelFlags.test
                      ${dir_name}escapeFlag.test
                      ${test_dir_name}tokenizer-test-1.test
                     ]) {
  open my $file, '<', $file_name
    or die "$0: $file_name: $!";
  ## Slurp mode; $/ stays undefined until the end of this iteration.
  local $/ = undef;
  my $js = <$file>;
  close $file;

  print "# $file_name\n";
  my $tests = jsonToObj ($js)->{tests};
  TEST: for my $test (@$tests) {
    my $s = $test->{input};
    
    ## Coalesce every run of adjacent Character tokens in the expected
    ## output into a single token, mirroring what is done below for the
    ## tokens returned by the tokenizer.
    my $j = 1;
    while ($j < @{$test->{output}}) {
      if (ref $test->{output}->[$j - 1] and
          $test->{output}->[$j - 1]->[0] eq 'Character' and
          ref $test->{output}->[$j] and 
          $test->{output}->[$j]->[0] eq 'Character') {
        $test->{output}->[$j - 1]->[1]
          .= $test->{output}->[$j]->[1];
        splice @{$test->{output}}, $j, 1;
        ## Do not advance here: the element that has just moved into
        ## position $j must still be compared with the merged token at
        ## $j - 1, otherwise runs of three or more are left split.
      } else {
        $j++;
      }
    }

    ## A test without contentModelFlags is run once, as PCDATA.
    my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
    my $last_start_tag = $test->{lastStartTag};
    for my $cm (@cm) {
      my $p = Whatpm::HTML->new;
      my $i = 0; # index of the next unread character of $s
      my @token; # tokens and 'ParseError' markers, in order of occurrence

      ## Input callback: stores the code point of the next character of
      ## $s in $self->{next_input_character}, or -1 once $s is exhausted.
      $p->{set_next_input_character} = sub {
        my $self = shift;
        if ($i >= length $s) {
          $self->{next_input_character} = -1;
          return;
        }
        $self->{next_input_character} = ord substr $s, $i++, 1;

        if ($self->{next_input_character} == 0x000D) { # CR
          if ($i >= length $s) {
            #
          } else {
            ## Consume the character after CR.  An LF is dropped (CR LF
            ## becomes a single LF); anything else is queued in
            ## $self->{char} -- presumably read back by the tokenizer
            ## before this callback is invoked again (TODO: confirm
            ## against Whatpm::HTML).
            my $next_char = ord substr $s, $i++, 1;
            if ($next_char == 0x000A) { # LF
              #
            } else {
              push @{$self->{char}}, $next_char;
            }
          }
          $self->{next_input_character} = 0x000A; # LF # MUST
        } elsif ($self->{next_input_character} > 0x10FFFF) {
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{next_input_character} == 0x0000) { # NULL
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        }
      };
      
      ## Parse errors are recorded inline, between the tokens.
      $p->{parse_error} = sub {
        push @token, 'ParseError';
      };
      
      $p->_initialize_tokenizer;
      $p->{content_model_flag} = $cm;
      $p->{last_emitted_start_tag_name} = $last_start_tag;

      while (1) {
        my $token = $p->_get_next_token;
        last if $token->{type} eq 'end-of-file';
        
        ## Convert the tokenizer's token into the array form used by the
        ## test data: [type name, name or data, extra].
        my $test_token = [
         {
          DOCTYPE => 'DOCTYPE',
          'start tag' => 'StartTag',
          'end tag' => 'EndTag',
          comment => 'Comment',
          character => 'Character',
         }->{$token->{type}} || $token->{type},
        ];
        ## Later assignments win when more than one field is defined.
        $test_token->[1] = $token->{name} if defined $token->{name};
        $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
        $test_token->[1] = $token->{data} if defined $token->{data};
        $test_token->[2] = $token->{error} ? 1 : 0 if $token->{type} eq 'DOCTYPE';
        $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}}
          if $token->{type} eq 'start tag';
        
        ## Append character data to a directly preceding Character token
        ## instead of adding a separate one.
        if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
            $test_token->[0] eq 'Character') {
          $token[-1]->[1] .= $test_token->[1];
        } else {
          push @token, $test_token;
        }
      }
      
      my $expected_dump = Dumper ($test->{output});
      my $parser_dump = Dumper (\@token);
      ok $parser_dump, $expected_dump,
        $test->{description} . ': ' . $test->{input};
    }
  }
}

## $Date: 2007/06/23 02:26:51 $
1	wakaba	1.1	#!/usr/bin/perl
2			use strict;
3
4	wakaba	1.2	my $dir_name;
5	wakaba	1.4	my $test_dir_name;
6	wakaba	1.1	BEGIN {
7	wakaba	1.4	$test_dir_name = 't/';
8	wakaba	1.2	$dir_name = 't/tokenizer/';
9	wakaba	1.1	my $skip = "You don't have JSON module";
10			eval q{
11			use JSON 1.00;
12			$skip = "You don't have make command";
13	wakaba	1.3	system ("cd $test_dir_name; make tokenizer-files") == 0 or die
14	wakaba	1.2	unless -f $dir_name.'test1.test';
15	wakaba	1.1	$skip = '';
16			};
17			if ($skip) {
18			print "1..1\n";
19			print "ok 1 # $skip\n";
20			exit;
21			}
22			$JSON::UnMapping = 1;
23	wakaba	1.2	$JSON::UTF8 = 1;
24	wakaba	1.1	}
25
26			use Test;
27	wakaba	1.7	BEGIN { plan tests => 94 }
28	wakaba	1.2
29	wakaba	1.1	use Data::Dumper;
30	wakaba	1.2	$Data::Dumper::Useqq = 1;
31			sub Data::Dumper::qquote {
32			my $s = shift;
33			$s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
34			return q<qq'> . $s . q<'>;
35			} # Data::Dumper::qquote
36	wakaba	1.1
37	wakaba	1.5	use Whatpm::HTML;
38	wakaba	1.1
39	wakaba	1.4	for my $file_name (grep {$_} split /\s+/, qq[
40			${dir_name}test1.test
41			${dir_name}test2.test
42			${dir_name}contentModelFlags.test
43	wakaba	1.8	${dir_name}escapeFlag.test
44	wakaba	1.4	${test_dir_name}tokenizer-test-1.test
45	wakaba	1.1	]) {
46	wakaba	1.4	open my $file, '<', $file_name
47			or die "$0: $file_name: $!";
48	wakaba	1.1	local $/ = undef;
49			my $js = <$file>;
50			close $file;
51	wakaba	1.9
52			print "# $file_name\n";
53	wakaba	1.1	my $tests = jsonToObj ($js)->{tests};
54			TEST: for my $test (@$tests) {
55			my $s = $test->{input};
56
57			my $j = 1;
58			while ($j < @{$test->{output}}) {
59			if (ref $test->{output}->[$j - 1] and
60			$test->{output}->[$j - 1]->[0] eq 'Character' and
61			ref $test->{output}->[$j] and
62			$test->{output}->[$j]->[0] eq 'Character') {
63			$test->{output}->[$j - 1]->[1]
64			.= $test->{output}->[$j]->[1];
65			splice @{$test->{output}}, $j, 1;
66			}
67			$j++;
68			}
69
70	wakaba	1.2	my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
71			my $last_start_tag = $test->{lastStartTag};
72	wakaba	1.1	for my $cm (@cm) {
73	wakaba	1.5	my $p = Whatpm::HTML->new;
74	wakaba	1.1	my $i = 0;
75	wakaba	1.9	my @token;
76	wakaba	1.1	$p->{set_next_input_character} = sub {
77			my $self = shift;
78			$self->{next_input_character} = -1 and return if $i >= length $s;
79			$self->{next_input_character} = ord substr $s, $i++, 1;
80	wakaba	1.2
81			if ($self->{next_input_character} == 0x000D) { # CR
82			if ($i >= length $s) {
83			#
84			} else {
85			my $next_char = ord substr $s, $i++, 1;
86			if ($next_char == 0x000A) { # LF
87			#
88			} else {
89			push @{$self->{char}}, $next_char;
90			}
91			}
92			$self->{next_input_character} = 0x000A; # LF # MUST
93			} elsif ($self->{next_input_character} > 0x10FFFF) {
94			$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
95	wakaba	1.9	push @token, 'ParseError';
96	wakaba	1.2	} elsif ($self->{next_input_character} == 0x0000) { # NULL
97			$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
98	wakaba	1.9	push @token, 'ParseError';
99	wakaba	1.2	}
100	wakaba	1.1	};
101
102			$p->{parse_error} = sub {
103			push @token, 'ParseError';
104			};
105
106			$p->_initialize_tokenizer;
107			$p->{content_model_flag} = $cm;
108	wakaba	1.2	$p->{last_emitted_start_tag_name} = $last_start_tag;
109	wakaba	1.1
110			while (1) {
111			my $token = $p->_get_next_token;
112			last if $token->{type} eq 'end-of-file';
113
114			my $test_token = [
115			{
116			DOCTYPE => 'DOCTYPE',
117			'start tag' => 'StartTag',
118			'end tag' => 'EndTag',
119			comment => 'Comment',
120			character => 'Character',
121			}->{$token->{type}} || $token->{type},
122			];
123			$test_token->[1] = $token->{name} if defined $token->{name};
124			$test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
125			$test_token->[1] = $token->{data} if defined $token->{data};
126			$test_token->[2] = $token->{error} ? 1 : 0 if $token->{type} eq 'DOCTYPE';
127			$test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}}
128			if $token->{type} eq 'start tag';
129
130			if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
131			$test_token->[0] eq 'Character') {
132			$token[-1]->[1] .= $test_token->[1];
133			} else {
134			push @token, $test_token;
135			}
136			}
137
138			my $expected_dump = Dumper ($test->{output});
139			my $parser_dump = Dumper (\@token);
140	wakaba	1.2	ok $parser_dump, $expected_dump,
141			$test->{description} . ': ' . $test->{input};
142	wakaba	1.1	}
143			}
144			}
145
146	wakaba	1.9	## $Date: 2007/06/23 02:26:51 $