/[suikacvs]/markup/html/whatpm/t/HTML-tokenizer.t
Suika

Contents of /markup/html/whatpm/t/HTML-tokenizer.t

Parent Directory | Revision Log


Revision 1.45 - (show annotations) (download) (as text)
Sat Oct 4 17:16:02 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.44: +2 -2 lines
File MIME type: application/x-troff
Error occurred while calculating annotation data.
++ whatpm/t/ChangeLog	4 Oct 2008 17:15:55 -0000
2008-10-05  Wakaba  <wakaba@suika.fam.cx>

	* HTML-tree.t: New test files added.

	* Makefile: New test files added.

++ whatpm/Whatpm/ChangeLog	4 Oct 2008 17:15:20 -0000
2008-10-05  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm.src: An AAA bug fixed.

1 #!/usr/bin/perl
2 use strict;
3
## When the DEBUG environment variable is true, unreached tokenizer
## checkpoints are reported at exit.
my $DEBUG = $ENV{DEBUG};

## Directories holding the test data.  They are declared here and
## assigned inside BEGIN so that the values are available at compile
## time and remain visible to the rest of the script.
my $dir_name;
my $test_dir_name;

## Compile-time prerequisite check.  If the JSON module (1.07 or
## later) cannot be loaded, or the test data files cannot be
## generated with |make|, a single passing test annotated with the
## reason is printed and the script exits.
BEGIN {
  $test_dir_name = 't/';
  $dir_name = 't/tokenizer/';

  ## |$skip| always names the step that is about to be attempted; it
  ## is cleared only when every step has succeeded.  A string |eval|
  ## is required here so that a missing JSON module raises a
  ## catchable run-time error rather than a compile-time one.
  my $skip = "You don't have JSON module";
  eval q{
    use JSON 1.07;
    $skip = "You don't have make command";
    system ("cd $test_dir_name; make tokenizer-files") == 0 or die
        unless -f $dir_name.'test1.test';
    $skip = '';
  };

  if ($skip) {
    print "1..1\n", "ok 1 # $skip\n";
    exit;
  }

  $JSON::UnMapping = 1;
  $JSON::UTF8 = 1;
}
27
28 use Test;
29 BEGIN { plan tests => 1129 }
30
31 use Data::Dumper;
32 $Data::Dumper::Useqq = 1;
33 $Data::Dumper::Sortkeys = 1;
## Replacement for |Data::Dumper::qquote|.  Returns the argument
## wrapped as |qq'...'|, in which every character other than
## printable ASCII (U+0020..U+007E) excluding |'| and |\| is written
## as an uppercase hexadecimal |\x{...}| escape (at least two digits).
sub Data::Dumper::qquote {
  my ($text) = @_;
  (my $escaped = $text)
      =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
  return join '', q<qq'>, $escaped, q<'>;
} # Data::Dumper::qquote
39
## Checkpoint coverage report (DEBUG mode only).
##
## A private copy of the keys of |$Whatpm::HTML::Debug::cp| is taken,
## and |$Whatpm::HTML::Debug::cp_pass| is set to a callback that
## removes the ID it is given from that copy.  At exit, every ID still
## present (other than those beginning with |t| or |i|) is reported
## as not reached.
## NOTE(review): presumably Whatpm::HTML invokes |cp_pass| each time
## a checkpoint is passed -- confirm against the module.
if ($DEBUG) {
  my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};

  $Whatpm::HTML::Debug::cp_pass = sub {
    my ($id) = @_;
    delete $not_found->{$id};
  };

  END {
    ## Numeric IDs are ordered numerically; ties fall back to string
    ## order.
    my @unreached = sort {$a <=> $b || $a cmp $b}
        grep {!/^[ti]/} keys %$not_found;
    print "# checkpoint $_ is not reached\n" for @unreached;
  }
}
55
56 use Whatpm::HTML;
57
## Main driver: runs every tokenizer test in every test data file.
##
## For each file, the JSON text is read and decoded, and each test in
## it is run once per content model listed in the test's
## |contentModelFlags| (default: |PCDATA|).  The tokens returned by
## |Whatpm::HTML|'s tokenizer are converted into the array form used
## by the test data and compared, as |Data::Dumper| output, with the
## expected |output| of the test.

## Code points U+nFFFE and U+nFFFF of planes 0..16, which are reported
## as parse errors by the input callback below.
my %plane_end_nonchar = map {
  my $plane_base = $_ << 16;
  ($plane_base | 0xFFFE, 1, $plane_base | 0xFFFF, 1);
} 0 .. 0x10;

## Content model name used in the test data -> Whatpm::HTML constant.
my %content_model_of = (
  CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
  RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
  PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
  PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
);

## Whatpm::HTML token type constant -> token name used in the test data.
my %token_name_of = (
  Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
  Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
  Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
  Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
  Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
);

my @test_file_names = (
  (map { $dir_name . $_ } qw(
    test1.test test2.test test3.test test4.test
    contentModelFlags.test escapeFlag.test entities.test
    xmlViolation.test
  )),
  $test_dir_name . 'tokenizer-test-1.test',
);

for my $file_name (@test_file_names) {
  open my $file, '<', $file_name
      or die "$0: $file_name: $!";
  ## Slurp mode.  As in the original, it stays in effect until the
  ## end of this iteration.
  local $/ = undef;
  my $js = <$file>;
  close $file;

  print "# $file_name\n";

  ## Replace each pair of consecutive |\uDxxx| escapes by the
  ## character the pair encodes before handing the text to the JSON
  ## parser.
  ## NOTE(review): either half of the pattern accepts any surrogate
  ## (D800..DFFF), not specifically a high one followed by a low one.
  $js =~ s{\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])
      \\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])}{
    ## NOTE: JSON::Parser does not decode surrogate pair escapes
    ## NOTE: In older version of JSON::Parser, utf8 string will be broken
    ## by parsing.  Use latest version!
    ## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
    my $c = 0x10000;
    $c += ((((hex $1) & 0b1111111111) << 10) | ((hex $2) & 0b1111111111));
    chr $c;
  }gex;

  my $json = jsonToObj ($js);
  my $tests = $json->{tests} || $json->{xmlViolationTests};
  TEST: for my $test (@$tests) {
    my $input = $test->{input};

    ## Coalesce runs of adjacent |Character| tokens in the expected
    ## output, since the parser-side token list built below is
    ## coalesced in the same way.  After a merge the index is
    ## deliberately NOT advanced: the element that has moved into
    ## position |$j| must be compared with the merged token as well,
    ## otherwise a run of three or more tokens is merged only partly.
    my $expected = $test->{output};
    my $j = 1;
    while ($j < @$expected) {
      if (ref $expected->[$j - 1] and
          $expected->[$j - 1]->[0] eq 'Character' and
          ref $expected->[$j] and
          $expected->[$j]->[0] eq 'Character') {
        $expected->[$j - 1]->[1] .= $expected->[$j]->[1];
        splice @$expected, $j, 1;
      } else {
        $j++;
      }
    }

    my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
    my $last_start_tag = $test->{lastStartTag};
    for my $cm (@cm) {
      my $p = Whatpm::HTML->new;
      my $pos = 0;  # index of the next input character to be read
      my @token;    # token arrays and 'ParseError' strings, in order

      ## Input callback: stores the next input character, as a code
      ## point, in |$self->{nc}| (-1 once the input is exhausted) and
      ## records a parse error for characters that are not allowed.
      $p->{set_nc} = sub {
        my $self = shift;

        if ($pos >= length $input) {
          $self->{nc} = -1;
          return;
        }
        $self->{nc} = ord substr $input, $pos++, 1;

        if ($self->{nc} == 0x000D) { # CR
          ## CR and CR LF are both passed on as a single LF.
          $pos++ if substr ($input, $pos, 1) eq "\x0A";
          $self->{nc} = 0x000A; # LF # MUST
        } elsif ($self->{nc} > 0x10FFFF) {
          $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{nc} == 0x0000) { # NULL
          $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{nc} <= 0x0008 or
                 (0x000E <= $self->{nc} and $self->{nc} <= 0x001F) or
                 (0x007F <= $self->{nc} and $self->{nc} <= 0x009F) or
                 (0xD800 <= $self->{nc} and $self->{nc} <= 0xDFFF) or
                 (0xFDD0 <= $self->{nc} and $self->{nc} <= 0xFDDF) or
                 $plane_end_nonchar{$self->{nc}}) {
          ## The character is passed on unchanged; only the error is
          ## recorded.
          push @token, 'ParseError';
        }
      };

      $p->{read_until} = sub { return 0 };

      $p->{parse_error} = sub {
        push @token, 'ParseError';
      };

      $p->_initialize_tokenizer;
      $p->{content_model} = $content_model_of{$cm};
      $p->{last_stag_name} = $last_start_tag;

      while (1) {
        my $token = $p->_get_next_token;
        last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();

        ## Convert the token into the array form of the test data.
        my $test_token = [
          $token_name_of{$token->{type}} || $token->{type},
        ];
        $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
        $test_token->[1] = $token->{data} if defined $token->{data};
        if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
          $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
          $test_token->[3] = 1 if $p->{self_closing};
          delete $p->{self_closing};
        } elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
          $test_token->[1] = $token->{name};
          $test_token->[2] = $token->{pubid};
          $test_token->[3] = $token->{sysid};
          ## The fifth member is the inverse of the |quirks| flag.
          $test_token->[4] = $token->{quirks} ? 0 : 1;
        }

        ## Append character data to an immediately preceding
        ## |Character| token rather than adding a new token.
        if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
            $test_token->[0] eq 'Character') {
          $token[-1]->[1] .= $test_token->[1];
        } else {
          push @token, $test_token;
        }
      }

      my $expected_dump = Dumper ($test->{output});
      my $parser_dump = Dumper (\@token);
      ok $parser_dump, $expected_dump,
          $test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
    }
  }
}
210
211 ## License: Public Domain.
212 ## $Date: 2008/09/22 06:04:29 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24