/[suikacvs]/markup/html/whatpm/t/HTML-tokenizer.t
Suika

Contents of /markup/html/whatpm/t/HTML-tokenizer.t

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.38 - (hide annotations) (download) (as text)
Sat Aug 30 13:43:50 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.37: +2 -1 lines
File MIME type: application/x-troff
++ whatpm/t/ChangeLog	30 Aug 2008 13:30:24 -0000
	* tokenizer-test-1.dat: '"' and "'" at the end of attribute
	name (after another attribute) now raise parse error (HTML5
	revision 2123).  Empty unquoted attribute is no
	longer allowed (HTML5 revision 2122).

	* HTML-tokenizer.t: Hash keys were not sorted when dumped.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 13:43:44 -0000
	* HTML.pm.src: '"' and "'" at the end of attribute
	name (after another attribute) now raise parse error (HTML5
	revision 2123).  Empty unquoted attribute values are no
	longer allowed (HTML5 revision 2122).

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4 wakaba 1.27 my $DEBUG = $ENV{DEBUG};
5    
6 wakaba 1.2 my $dir_name;
7 wakaba 1.4 my $test_dir_name;
8 wakaba 1.1 BEGIN {
9 wakaba 1.4 $test_dir_name = 't/';
10 wakaba 1.2 $dir_name = 't/tokenizer/';
11 wakaba 1.1 my $skip = "You don't have JSON module";
12     eval q{
13 wakaba 1.15 use JSON 1.07;
14 wakaba 1.1 $skip = "You don't have make command";
15 wakaba 1.3 system ("cd $test_dir_name; make tokenizer-files") == 0 or die
16 wakaba 1.2 unless -f $dir_name.'test1.test';
17 wakaba 1.1 $skip = '';
18     };
19     if ($skip) {
20     print "1..1\n";
21     print "ok 1 # $skip\n";
22     exit;
23     }
24     $JSON::UnMapping = 1;
25 wakaba 1.2 $JSON::UTF8 = 1;
26 wakaba 1.1 }
27    
28     use Test;
29 wakaba 1.36 BEGIN { plan tests => 1073 }
30 wakaba 1.2
31 wakaba 1.1 use Data::Dumper;
32 wakaba 1.2 $Data::Dumper::Useqq = 1;
33 wakaba 1.38 $Data::Dumper::Sortkeys = 1;
34 wakaba 1.2 sub Data::Dumper::qquote {
35     my $s = shift;
36     $s =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/sprintf '\x{%02X}', ord $1/ge;
37     return q<qq'> . $s . q<'>;
38     } # Data::Dumper::qquote
39 wakaba 1.1
40 wakaba 1.27 if ($DEBUG) {
41 wakaba 1.31 my $not_found = {%{$Whatpm::HTML::Debug::cp or {}}};
42 wakaba 1.27
43     $Whatpm::HTML::Debug::cp_pass = sub {
44     my $id = shift;
45     delete $not_found->{$id};
46     };
47    
48     END {
49 wakaba 1.29 for my $id (sort {$a <=> $b || $a cmp $b} grep {!/^[ti]/}
50     keys %$not_found) {
51 wakaba 1.28 print "# checkpoint $id is not reached\n";
52 wakaba 1.27 }
53     }
54     }
55    
56 wakaba 1.5 use Whatpm::HTML;
57 wakaba 1.1
58 wakaba 1.4 for my $file_name (grep {$_} split /\s+/, qq[
59     ${dir_name}test1.test
60     ${dir_name}test2.test
61 wakaba 1.15 ${dir_name}test3.test
62     ${dir_name}test4.test
63 wakaba 1.4 ${dir_name}contentModelFlags.test
64 wakaba 1.8 ${dir_name}escapeFlag.test
65 wakaba 1.34 ${dir_name}entities.test
66     ${dir_name}xmlViolation.test
67 wakaba 1.4 ${test_dir_name}tokenizer-test-1.test
68 wakaba 1.1 ]) {
69 wakaba 1.4 open my $file, '<', $file_name
70     or die "$0: $file_name: $!";
71 wakaba 1.1 local $/ = undef;
72     my $js = <$file>;
73     close $file;
74 wakaba 1.9
75     print "# $file_name\n";
76 wakaba 1.15 $js =~ s{\\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])
77     \\u[Dd]([89A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])}{
78     ## NOTE: JSON::Parser does not decode surrogate pair escapes
79     ## NOTE: In older version of JSON::Parser, utf8 string will be broken
80     ## by parsing. Use latest version!
81     ## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
82     my $c = 0x10000;
83     $c += ((((hex $1) & 0b1111111111) << 10) | ((hex $2) & 0b1111111111));
84     chr $c;
85     }gex;
86 wakaba 1.34 my $json = jsonToObj ($js);
87     my $tests = $json->{tests} || $json->{xmlViolationTests};
88 wakaba 1.1 TEST: for my $test (@$tests) {
89     my $s = $test->{input};
90    
91     my $j = 1;
92     while ($j < @{$test->{output}}) {
93     if (ref $test->{output}->[$j - 1] and
94     $test->{output}->[$j - 1]->[0] eq 'Character' and
95     ref $test->{output}->[$j] and
96     $test->{output}->[$j]->[0] eq 'Character') {
97     $test->{output}->[$j - 1]->[1]
98     .= $test->{output}->[$j]->[1];
99     splice @{$test->{output}}, $j, 1;
100     }
101     $j++;
102     }
103    
104 wakaba 1.2 my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
105     my $last_start_tag = $test->{lastStartTag};
106 wakaba 1.1 for my $cm (@cm) {
107 wakaba 1.5 my $p = Whatpm::HTML->new;
108 wakaba 1.1 my $i = 0;
109 wakaba 1.9 my @token;
110 wakaba 1.26 $p->{set_next_char} = sub {
111 wakaba 1.1 my $self = shift;
112 wakaba 1.10
113 wakaba 1.26 pop @{$self->{prev_char}};
114     unshift @{$self->{prev_char}}, $self->{next_char};
115 wakaba 1.10
116 wakaba 1.26 $self->{next_char} = -1 and return if $i >= length $s;
117     $self->{next_char} = ord substr $s, $i++, 1;
118 wakaba 1.2
119 wakaba 1.26 if ($self->{next_char} == 0x000D) { # CR
120 wakaba 1.15 $i++ if substr ($s, $i, 1) eq "\x0A";
121 wakaba 1.26 $self->{next_char} = 0x000A; # LF # MUST
122     } elsif ($self->{next_char} > 0x10FFFF) {
123     $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
124 wakaba 1.9 push @token, 'ParseError';
125 wakaba 1.26 } elsif ($self->{next_char} == 0x0000) { # NULL
126     $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
127 wakaba 1.9 push @token, 'ParseError';
128 wakaba 1.35 } elsif ($self->{next_char} <= 0x0008 or
129     (0x000E <= $self->{next_char} and
130     $self->{next_char} <= 0x001F) or
131     (0x007F <= $self->{next_char} and
132     $self->{next_char} <= 0x009F) or
133     (0xD800 <= $self->{next_char} and
134     $self->{next_char} <= 0xDFFF) or
135     (0xFDD0 <= $self->{next_char} and
136     $self->{next_char} <= 0xFDDF) or
137     {
138     0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
139     0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
140     0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
141     0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
142     0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
143     0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
144     0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
145     0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
146     0x10FFFE => 1, 0x10FFFF => 1,
147     }->{$self->{next_char}}) {
148     push @token, 'ParseError';
149 wakaba 1.2 }
150 wakaba 1.1 };
151 wakaba 1.26 $p->{prev_char} = [-1, -1, -1];
152     $p->{next_char} = -1;
153 wakaba 1.1
154     $p->{parse_error} = sub {
155     push @token, 'ParseError';
156     };
157    
158     $p->_initialize_tokenizer;
159 wakaba 1.18 $p->{content_model} = {
160     CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
161     RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
162     PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
163     PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
164     }->{$cm};
165 wakaba 1.2 $p->{last_emitted_start_tag_name} = $last_start_tag;
166 wakaba 1.1
167     while (1) {
168     my $token = $p->_get_next_token;
169 wakaba 1.19 last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();
170 wakaba 1.1
171     my $test_token = [
172     {
173 wakaba 1.19 Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
174     Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
175     Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
176     Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
177     Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
178 wakaba 1.1 }->{$token->{type}} || $token->{type},
179     ];
180     $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
181     $test_token->[1] = $token->{data} if defined $token->{data};
182 wakaba 1.19 if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
183 wakaba 1.11 $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
184 wakaba 1.37 $test_token->[3] = 1 if $p->{self_closing};
185     delete $p->{self_closing};
186 wakaba 1.19 } elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
187 wakaba 1.11 $test_token->[1] = $token->{name};
188     $test_token->[2] = $token->{public_identifier};
189     $test_token->[3] = $token->{system_identifier};
190 wakaba 1.25 $test_token->[4] = $token->{quirks} ? 0 : 1;
191 wakaba 1.11 }
192    
193 wakaba 1.1 if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
194     $test_token->[0] eq 'Character') {
195     $token[-1]->[1] .= $test_token->[1];
196     } else {
197     push @token, $test_token;
198     }
199     }
200    
201     my $expected_dump = Dumper ($test->{output});
202     my $parser_dump = Dumper (\@token);
203 wakaba 1.2 ok $parser_dump, $expected_dump,
204 wakaba 1.17 $test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
205 wakaba 1.1 }
206     }
207     }
208    
209 wakaba 1.19 ## License: Public Domain.
210 wakaba 1.38 ## $Date: 2008/06/01 06:47:12 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24