whatpm/t/HTML-tokenizer.t

#!/usr/bin/perl
use strict;

my $DEBUG = $ENV{DEBUG};

my $dir_name;
my $test_dir_name;
BEGIN {
  $test_dir_name = 't/';
  $dir_name = 't/tokenizer/';
  my $skip = "You don't have JSON module";
  eval q{
         use JSON 1.07;
         $skip = "You don't have make command";
         system ("cd $test_dir_name; make tokenizer-files") == 0 or die
           unless -f $dir_name.'test1.test';
         $skip = '';
        };
  if ($skip) {
    print "1..1\n";
    print "ok 1 # $skip\n";
    exit;
  }
  $JSON::UnMapping = 1;
  $JSON::UTF8 = 1;
}

use Test;
BEGIN { plan tests => 1073 }

use Data::Dumper;
$Data::Dumper::Useqq = 1;
$Data::Dumper::Sortkeys = 1;
sub Data::Dumper::qquote {
  my $s = shift;
  $s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
  return q<qq'> . $s . q<'>;
} # Data::Dumper::qquote

if ($DEBUG) {
  my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};

  $Whatpm::HTML::Debug::cp_pass = sub {
    my $id = shift;
    delete $not_found->{$id};
  };

  END {
    for my $id (sort {$a <=> $b || $a cmp $b} grep {!/^[ti]/}
                keys %$not_found) {
      print "# checkpoint $id is not reached\n";
    }
  }
}

use Whatpm::HTML;

for my $file_name (grep {$_} split /\s+/, qq[
                      ${dir_name}test1.test
                      ${dir_name}test2.test
                      ${dir_name}test3.test
                      ${dir_name}test4.test
                      ${dir_name}contentModelFlags.test
                      ${dir_name}escapeFlag.test
                      ${dir_name}entities.test
                      ${dir_name}xmlViolation.test
                      ${test_dir_name}tokenizer-test-1.test
                     ]) {
  open my $file, '<', $file_name
    or die "$0: $file_name: $!";
  local $/ = undef;
  my $js = <$file>;
  close $file;

  print "# $file_name\n";
  $js =~ s{\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])
      \\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])}{
    ## NOTE: JSON::Parser does not decode surrogate pair escapes
    ## NOTE: In older version of JSON::Parser, utf8 string will be broken
    ## by parsing.  Use latest version!
    ## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
    my $c = 0x10000;
    $c += ((((hex $1) & 0b1111111111) << 10) | ((hex $2) & 0b1111111111));
    chr $c;
  }gex;
  my $json = jsonToObj ($js);
  my $tests = $json->{tests} || $json->{xmlViolationTests};
  TEST: for my $test (@$tests) {
    my $s = $test->{input};
    
    my $j = 1;
    while ($j < @{$test->{output}}) {
      if (ref $test->{output}->[$j - 1] and
          $test->{output}->[$j - 1]->[0] eq 'Character' and
          ref $test->{output}->[$j] and 
          $test->{output}->[$j]->[0] eq 'Character') {
        $test->{output}->[$j - 1]->[1]
          .= $test->{output}->[$j]->[1];
        splice @{$test->{output}}, $j, 1;
      }
      $j++;
    }

    my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
    my $last_start_tag = $test->{lastStartTag};
    for my $cm (@cm) {
      my $p = Whatpm::HTML->new;
      my $i = 0;
      my @token;
      $p->{set_next_char} = sub {
        my $self = shift;

#        pop @{$self->{prev_char}};
#        unshift @{$self->{prev_char}}, $self->{next_char};

        $self->{next_char} = -1 and return if $i >= length $s;
        $self->{next_char} = ord substr $s, $i++, 1;

        if ($self->{next_char} == 0x000D) { # CR
          $i++ if substr ($s, $i, 1) eq "\x0A";
          $self->{next_char} = 0x000A; # LF # MUST
        } elsif ($self->{next_char} > 0x10FFFF) {
          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{next_char} == 0x0000) { # NULL
          $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{next_char} <= 0x0008 or
                 (0x000E <= $self->{next_char} and
                  $self->{next_char} <= 0x001F) or
                 (0x007F <= $self->{next_char} and
                  $self->{next_char} <= 0x009F) or
                 (0xD800 <= $self->{next_char} and
                  $self->{next_char} <= 0xDFFF) or
                 (0xFDD0 <= $self->{next_char} and
                  $self->{next_char} <= 0xFDDF) or
                 {
                   0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
                   0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
                   0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
                   0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
                   0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
                   0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
                   0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
                   0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
                   0x10FFFE => 1, 0x10FFFF => 1,
                  }->{$self->{next_char}}) {
          push @token, 'ParseError';
        }
      };


      $p->{read_until} = sub { return 0 };
      
      $p->{parse_error} = sub {
        push @token, 'ParseError';
      };
      
      $p->_initialize_tokenizer;
      $p->{content_model} = {
        CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
        RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
        PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
        PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
      }->{$cm};
      $p->{last_emitted_start_tag_name} = $last_start_tag;

      while (1) {
        my $token = $p->_get_next_token;
        last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();
        
        my $test_token = [
         {
          Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
          Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
          Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
          Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
          Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
         }->{$token->{type}} || $token->{type},
        ];
        $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
        $test_token->[1] = $token->{data} if defined $token->{data};
        if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
          $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
          $test_token->[3] = 1 if $p->{self_closing};
          delete $p->{self_closing};
        } elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
          $test_token->[1] = $token->{name};
          $test_token->[2] = $token->{public_identifier};
          $test_token->[3] = $token->{system_identifier};
          $test_token->[4] = $token->{quirks} ? 0 : 1;
        }

        if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
            $test_token->[0] eq 'Character') {
          $token[-1]->[1] .= $test_token->[1];
        } else {
          push @token, $test_token;
        }
      }
      
      my $expected_dump = Dumper ($test->{output});
      my $parser_dump = Dumper (\@token);
      ok $parser_dump, $expected_dump,
        $test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
    }
  }
}

## License: Public Domain.
## $Date: 2008/09/14 03:07:58 $
1	wakaba	1.1	#!/usr/bin/perl
2			use strict;
3
4	wakaba	1.27	my $DEBUG = $ENV{DEBUG};
5
6	wakaba	1.2	my $dir_name;
7	wakaba	1.4	my $test_dir_name;
8	wakaba	1.1	BEGIN {
9	wakaba	1.4	$test_dir_name = 't/';
10	wakaba	1.2	$dir_name = 't/tokenizer/';
11	wakaba	1.1	my $skip = "You don't have JSON module";
12			eval q{
13	wakaba	1.15	use JSON 1.07;
14	wakaba	1.1	$skip = "You don't have make command";
15	wakaba	1.3	system ("cd $test_dir_name; make tokenizer-files") == 0 or die
16	wakaba	1.2	unless -f $dir_name.'test1.test';
17	wakaba	1.1	$skip = '';
18			};
19			if ($skip) {
20			print "1..1\n";
21			print "ok 1 # $skip\n";
22			exit;
23			}
24			$JSON::UnMapping = 1;
25	wakaba	1.2	$JSON::UTF8 = 1;
26	wakaba	1.1	}
27
28			use Test;
29	wakaba	1.36	BEGIN { plan tests => 1073 }
30	wakaba	1.2
31	wakaba	1.1	use Data::Dumper;
32	wakaba	1.2	$Data::Dumper::Useqq = 1;
33	wakaba	1.38	$Data::Dumper::Sortkeys = 1;
34	wakaba	1.2	sub Data::Dumper::qquote {
35			my $s = shift;
36			$s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
37			return q<qq'> . $s . q<'>;
38			} # Data::Dumper::qquote
39	wakaba	1.1
40	wakaba	1.27	if ($DEBUG) {
41	wakaba	1.31	my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};
42	wakaba	1.27
43			$Whatpm::HTML::Debug::cp_pass = sub {
44			my $id = shift;
45			delete $not_found->{$id};
46			};
47
48			END {
49	wakaba	1.29	for my $id (sort {$a <=> $b \|\| $a cmp $b} grep {!/^[ti]/}
50			keys %$not_found) {
51	wakaba	1.28	print "# checkpoint $id is not reached\n";
52	wakaba	1.27	}
53			}
54			}
55
56	wakaba	1.5	use Whatpm::HTML;
57	wakaba	1.1
58	wakaba	1.4	for my $file_name (grep {$_} split /\s+/, qq[
59			${dir_name}test1.test
60			${dir_name}test2.test
61	wakaba	1.15	${dir_name}test3.test
62			${dir_name}test4.test
63	wakaba	1.4	${dir_name}contentModelFlags.test
64	wakaba	1.8	${dir_name}escapeFlag.test
65	wakaba	1.34	${dir_name}entities.test
66			${dir_name}xmlViolation.test
67	wakaba	1.4	${test_dir_name}tokenizer-test-1.test
68	wakaba	1.1	]) {
69	wakaba	1.4	open my $file, '<', $file_name
70			or die "$0: $file_name: $!";
71	wakaba	1.1	local $/ = undef;
72			my $js = <$file>;
73			close $file;
74	wakaba	1.9
75			print "# $file_name\n";
76	wakaba	1.15	$js =~ s{\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])
77			\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])}{
78			## NOTE: JSON::Parser does not decode surrogate pair escapes
79			## NOTE: In older version of JSON::Parser, utf8 string will be broken
80			## by parsing. Use latest version!
81			## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
82			my $c = 0x10000;
83			$c += ((((hex $1) & 0b1111111111) << 10) \| ((hex $2) & 0b1111111111));
84			chr $c;
85			}gex;
86	wakaba	1.34	my $json = jsonToObj ($js);
87			my $tests = $json->{tests} \|\| $json->{xmlViolationTests};
88	wakaba	1.1	TEST: for my $test (@$tests) {
89			my $s = $test->{input};
90
91			my $j = 1;
92			while ($j < @{$test->{output}}) {
93			if (ref $test->{output}->[$j - 1] and
94			$test->{output}->[$j - 1]->[0] eq 'Character' and
95			ref $test->{output}->[$j] and
96			$test->{output}->[$j]->[0] eq 'Character') {
97			$test->{output}->[$j - 1]->[1]
98			.= $test->{output}->[$j]->[1];
99			splice @{$test->{output}}, $j, 1;
100			}
101			$j++;
102			}
103
104	wakaba	1.2	my @cm = @{$test->{contentModelFlags} \|\| ['PCDATA']};
105			my $last_start_tag = $test->{lastStartTag};
106	wakaba	1.1	for my $cm (@cm) {
107	wakaba	1.5	my $p = Whatpm::HTML->new;
108	wakaba	1.1	my $i = 0;
109	wakaba	1.9	my @token;
110	wakaba	1.26	$p->{set_next_char} = sub {
111	wakaba	1.1	my $self = shift;
112	wakaba	1.10
113	wakaba	1.41	# pop @{$self->{prev_char}};
114			# unshift @{$self->{prev_char}}, $self->{next_char};
115	wakaba	1.10
116	wakaba	1.26	$self->{next_char} = -1 and return if $i >= length $s;
117			$self->{next_char} = ord substr $s, $i++, 1;
118	wakaba	1.2
119	wakaba	1.26	if ($self->{next_char} == 0x000D) { # CR
120	wakaba	1.15	$i++ if substr ($s, $i, 1) eq "\x0A";
121	wakaba	1.26	$self->{next_char} = 0x000A; # LF # MUST
122			} elsif ($self->{next_char} > 0x10FFFF) {
123			$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
124	wakaba	1.9	push @token, 'ParseError';
125	wakaba	1.26	} elsif ($self->{next_char} == 0x0000) { # NULL
126			$self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
127	wakaba	1.9	push @token, 'ParseError';
128	wakaba	1.35	} elsif ($self->{next_char} <= 0x0008 or
129			(0x000E <= $self->{next_char} and
130			$self->{next_char} <= 0x001F) or
131			(0x007F <= $self->{next_char} and
132			$self->{next_char} <= 0x009F) or
133			(0xD800 <= $self->{next_char} and
134			$self->{next_char} <= 0xDFFF) or
135			(0xFDD0 <= $self->{next_char} and
136			$self->{next_char} <= 0xFDDF) or
137			{
138			0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
139			0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
140			0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
141			0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
142			0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
143			0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
144			0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
145			0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
146			0x10FFFE => 1, 0x10FFFF => 1,
147			}->{$self->{next_char}}) {
148			push @token, 'ParseError';
149	wakaba	1.2	}
150	wakaba	1.1	};
151	wakaba	1.41
152
153	wakaba	1.39
154	wakaba	1.40	$p->{read_until} = sub { return 0 };
155	wakaba	1.1
156			$p->{parse_error} = sub {
157			push @token, 'ParseError';
158			};
159
160			$p->_initialize_tokenizer;
161	wakaba	1.18	$p->{content_model} = {
162			CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
163			RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
164			PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
165			PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
166			}->{$cm};
167	wakaba	1.2	$p->{last_emitted_start_tag_name} = $last_start_tag;
168	wakaba	1.1
169			while (1) {
170			my $token = $p->_get_next_token;
171	wakaba	1.19	last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();
172	wakaba	1.1
173			my $test_token = [
174			{
175	wakaba	1.19	Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
176			Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
177			Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
178			Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
179			Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
180	wakaba	1.1	}->{$token->{type}} \|\| $token->{type},
181			];
182			$test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
183			$test_token->[1] = $token->{data} if defined $token->{data};
184	wakaba	1.19	if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
185	wakaba	1.11	$test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
186	wakaba	1.37	$test_token->[3] = 1 if $p->{self_closing};
187			delete $p->{self_closing};
188	wakaba	1.19	} elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
189	wakaba	1.11	$test_token->[1] = $token->{name};
190			$test_token->[2] = $token->{public_identifier};
191			$test_token->[3] = $token->{system_identifier};
192	wakaba	1.25	$test_token->[4] = $token->{quirks} ? 0 : 1;
193	wakaba	1.11	}
194
195	wakaba	1.1	if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
196			$test_token->[0] eq 'Character') {
197			$token[-1]->[1] .= $test_token->[1];
198			} else {
199			push @token, $test_token;
200			}
201			}
202
203			my $expected_dump = Dumper ($test->{output});
204			my $parser_dump = Dumper (\@token);
205	wakaba	1.2	ok $parser_dump, $expected_dump,
206	wakaba	1.17	$test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
207	wakaba	1.1	}
208			}
209			}
210
211	wakaba	1.19	## License: Public Domain.
212	wakaba	1.41	## $Date: 2008/09/14 03:07:58 $