/[suikacvs]/markup/html/whatpm/t/HTML-tokenizer.t
Suika

Contents of /markup/html/whatpm/t/HTML-tokenizer.t

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.33 - (hide annotations) (download) (as text)
Sat Apr 12 10:41:31 2008 UTC (17 years, 3 months ago) by wakaba
Branch: MAIN
Changes since 1.32: +7 -1 lines
File MIME type: application/x-troff
++ whatpm/t/ChangeLog	12 Apr 2008 10:41:08 -0000
	* HTML-tokenizer.t: Remove "self-closing flag" if the start
	tag token is that of a slash permitted element (This is necessary
	to maintain compatibility with current test data, since in the
	new algorithm whether slash is permitted or not is decided in
	tree construction stage).

2008-04-12  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	12 Apr 2008 10:38:11 -0000
2008-04-12  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src, mkhtmlparser.pl: The way permitted slash errors
	are raised is changed (HTML5 revision 1404).

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4 wakaba 1.27 my $DEBUG = $ENV{DEBUG};
5    
6 wakaba 1.2 my $dir_name;
7 wakaba 1.4 my $test_dir_name;
8 wakaba 1.1 BEGIN {
9 wakaba 1.4 $test_dir_name = 't/';
10 wakaba 1.2 $dir_name = 't/tokenizer/';
11 wakaba 1.1 my $skip = "You don't have JSON module";
12     eval q{
13 wakaba 1.15 use JSON 1.07;
14 wakaba 1.1 $skip = "You don't have make command";
15 wakaba 1.3 system ("cd $test_dir_name; make tokenizer-files") == 0 or die
16 wakaba 1.2 unless -f $dir_name.'test1.test';
17 wakaba 1.1 $skip = '';
18     };
19     if ($skip) {
20     print "1..1\n";
21     print "ok 1 # $skip\n";
22     exit;
23     }
24     $JSON::UnMapping = 1;
25 wakaba 1.2 $JSON::UTF8 = 1;
26 wakaba 1.1 }
27    
28     use Test;
29 wakaba 1.32 BEGIN { plan tests => 477 }
30 wakaba 1.2
31 wakaba 1.1 use Data::Dumper;
32 wakaba 1.2 $Data::Dumper::Useqq = 1;
33     sub Data::Dumper::qquote {
34     my $s = shift;
35     $s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
36     return q<qq'> . $s . q<'>;
37     } # Data::Dumper::qquote
38 wakaba 1.1
39 wakaba 1.27 if ($DEBUG) {
40 wakaba 1.31 my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};
41 wakaba 1.27
42     $Whatpm::HTML::Debug::cp_pass = sub {
43     my $id = shift;
44     delete $not_found->{$id};
45     };
46    
47     END {
48 wakaba 1.29 for my $id (sort {$a <=> $b || $a cmp $b} grep {!/^[ti]/}
49     keys %$not_found) {
50 wakaba 1.28 print "# checkpoint $id is not reached\n";
51 wakaba 1.27 }
52     }
53     }
54    
55 wakaba 1.5 use Whatpm::HTML;
56 wakaba 1.1
57 wakaba 1.4 for my $file_name (grep {$_} split /\s+/, qq[
58     ${dir_name}test1.test
59     ${dir_name}test2.test
60 wakaba 1.15 ${dir_name}test3.test
61     ${dir_name}test4.test
62 wakaba 1.4 ${dir_name}contentModelFlags.test
63 wakaba 1.8 ${dir_name}escapeFlag.test
64 wakaba 1.4 ${test_dir_name}tokenizer-test-1.test
65 wakaba 1.1 ]) {
66 wakaba 1.4 open my $file, '<', $file_name
67     or die "$0: $file_name: $!";
68 wakaba 1.1 local $/ = undef;
69     my $js = <$file>;
70     close $file;
71 wakaba 1.9
72     print "# $file_name\n";
73 wakaba 1.15 $js =~ s{\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])
74     \\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])}{
75     ## NOTE: JSON::Parser does not decode surrogate pair escapes
76     ## NOTE: In older version of JSON::Parser, utf8 string will be broken
77     ## by parsing. Use latest version!
78     ## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
79     my $c = 0x10000;
80     $c += ((((hex $1) & 0b1111111111) << 10) | ((hex $2) & 0b1111111111));
81     chr $c;
82     }gex;
83 wakaba 1.1 my $tests = jsonToObj ($js)->{tests};
84     TEST: for my $test (@$tests) {
85     my $s = $test->{input};
86    
87     my $j = 1;
88     while ($j < @{$test->{output}}) {
89     if (ref $test->{output}->[$j - 1] and
90     $test->{output}->[$j - 1]->[0] eq 'Character' and
91     ref $test->{output}->[$j] and
92     $test->{output}->[$j]->[0] eq 'Character') {
93     $test->{output}->[$j - 1]->[1]
94     .= $test->{output}->[$j]->[1];
95     splice @{$test->{output}}, $j, 1;
96     }
97     $j++;
98     }
99    
100 wakaba 1.2 my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
101     my $last_start_tag = $test->{lastStartTag};
102 wakaba 1.1 for my $cm (@cm) {
103 wakaba 1.5 my $p = Whatpm::HTML->new;
104 wakaba 1.1 my $i = 0;
105 wakaba 1.9 my @token;
106 wakaba 1.26 $p->{set_next_char} = sub {
107 wakaba 1.1 my $self = shift;
108 wakaba 1.10
109 wakaba 1.26 pop @{$self->{prev_char}};
110     unshift @{$self->{prev_char}}, $self->{next_char};
111 wakaba 1.10
112 wakaba 1.26 $self->{next_char} = -1 and return if $i >= length $s;
113     $self->{next_char} = ord substr $s, $i++, 1;
114 wakaba 1.2
115 wakaba 1.26 if ($self->{next_char} == 0x000D) { # CR
116 wakaba 1.15 $i++ if substr ($s, $i, 1) eq "\x0A";
117 wakaba 1.26 $self->{next_char} = 0x000A; # LF # MUST
118     } elsif ($self->{next_char} > 0x10FFFF) {
119     $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
120 wakaba 1.9 push @token, 'ParseError';
121 wakaba 1.26 } elsif ($self->{next_char} == 0x0000) { # NULL
122     $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
123 wakaba 1.9 push @token, 'ParseError';
124 wakaba 1.2 }
125 wakaba 1.1 };
126 wakaba 1.26 $p->{prev_char} = [-1, -1, -1];
127     $p->{next_char} = -1;
128 wakaba 1.1
129     $p->{parse_error} = sub {
130     push @token, 'ParseError';
131     };
132    
133     $p->_initialize_tokenizer;
134 wakaba 1.18 $p->{content_model} = {
135     CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
136     RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
137     PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
138     PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
139     }->{$cm};
140 wakaba 1.2 $p->{last_emitted_start_tag_name} = $last_start_tag;
141 wakaba 1.1
142     while (1) {
143     my $token = $p->_get_next_token;
144 wakaba 1.19 last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();
145 wakaba 1.1
146     my $test_token = [
147     {
148 wakaba 1.19 Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
149     Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
150     Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
151     Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
152     Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
153 wakaba 1.1 }->{$token->{type}} || $token->{type},
154     ];
155     $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
156     $test_token->[1] = $token->{data} if defined $token->{data};
157 wakaba 1.19 if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
158 wakaba 1.11 $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
159 wakaba 1.33 if ({
160     ## NOTE: Permitted slash
161     br => 1, link => 1,
162     }->{$token->{tag_name}}) {
163     delete $p->{self_closing};
164     }
165 wakaba 1.19 } elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
166 wakaba 1.11 $test_token->[1] = $token->{name};
167     $test_token->[2] = $token->{public_identifier};
168     $test_token->[3] = $token->{system_identifier};
169 wakaba 1.25 $test_token->[4] = $token->{quirks} ? 0 : 1;
170 wakaba 1.11 }
171    
172 wakaba 1.1 if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
173     $test_token->[0] eq 'Character') {
174     $token[-1]->[1] .= $test_token->[1];
175     } else {
176     push @token, $test_token;
177     }
178     }
179    
180     my $expected_dump = Dumper ($test->{output});
181     my $parser_dump = Dumper (\@token);
182 wakaba 1.2 ok $parser_dump, $expected_dump,
183 wakaba 1.17 $test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
184 wakaba 1.1 }
185     }
186     }
187    
188 wakaba 1.19 ## License: Public Domain.
189 wakaba 1.33 ## $Date: 2008/03/28 14:23:09 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24