/[suikacvs]/markup/html/whatpm/t/HTML-tokenizer.t
Suika

Contents of /markup/html/whatpm/t/HTML-tokenizer.t

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (hide annotations) (download) (as text)
Sun Mar 2 14:32:27 2008 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.21: +2 -2 lines
File MIME type: application/x-troff
++ whatpm/t/ChangeLog	2 Mar 2008 14:06:22 -0000
	* tokenizer-test-1.test: Tests for |<span ===>| are added (HTML5
	revision 1292).  Tests for & at the end of attribute value
	are added (HTML5 revision 1296).  Tests for bogus comments
	are added (HTML5 revision 1297).  Tests for |=| in
	unquoted attribute values are added (HTML5 revision 1299).
	Tests for single or double quotes in unquoted attribute
	values or attribute names and tests for missing spaces
	between attributes are added (HTML5 revision 1303).

2008-03-02  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	2 Mar 2008 14:05:38 -0000
	* HTML.pm.src: Raise a parse error for |<span ===>| (HTML5 revision
	1292).  Entities are not parsed in comment-like part in RCDATA
	elements (HTML5 revision 1294).  Allow bare & at the end
	of attribute value literals (HTML5 revision 1296).  More
	quirks mode doctypes (HTML5 revision 1302).  Require spaces
	between attributes and ban attribute names or unquoted
	attribute values containing single or double quotes (HTML5
	revision 1303).

2008-03-02  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
## Locations used by this test script, relative to the current working
## directory: |$test_dir_name| is where |make tokenizer-files| is run and
## where |tokenizer-test-1.test| lives; |$dir_name| is where the other
## |*.test| files are read from.
my $dir_name;
my $test_dir_name;
BEGIN {
  ($test_dir_name, $dir_name) = ('t/', 't/tokenizer/');

  ## |$skip| always names the prerequisite currently being checked, so
  ## that whichever step fails inside the |eval| leaves the matching
  ## reason behind.  It is emptied only when every step has succeeded.
  my $skip = "You don't have JSON module";
  eval q{
    use JSON 1.07;
    $skip = "You don't have make command";
    unless (-f $dir_name . 'test1.test') {
      ## The test files are not there yet; ask |make| for them.
      system ("cd $test_dir_name; make tokenizer-files") == 0 or die;
    }
    $skip = '';
  };
  if ($skip) {
    ## Report a single passing test annotated with the reason and stop.
    print "1..1\nok 1 # $skip\n";
    exit;
  }

  ## Options of the JSON module used by |jsonToObj| later in this file.
  ($JSON::UnMapping, $JSON::UTF8) = (1, 1);
}

use Test;
BEGIN { plan tests => 396 }
28 wakaba 1.2
29 wakaba 1.1 use Data::Dumper;
## Have Data::Dumper emit strings through |qquote| (defined below).
$Data::Dumper::Useqq = 1;

## Replacement for |Data::Dumper::qquote|.
##
## Takes a string and returns it wrapped as |qq'...'|.  Every character
## other than U+0020..U+007E is written as |\x{HH}| (upper-case
## hexadecimal, at least two digits); so are |'| (U+0027) and |\|
## (U+005C), which would otherwise be ambiguous inside the literal.
sub Data::Dumper::qquote {
  my ($text) = @_;
  my $hex_escape = sub { sprintf '\x{%02X}', ord $_[0] };
  (my $escaped = $text) =~ s/([^\x20\x21-\x26\x28-\x5B\x5D-\x7E])/$hex_escape->($1)/ge;
  return join '', q<qq'>, $escaped, q<'>;
} # Data::Dumper::qquote
36 wakaba 1.1
37 wakaba 1.5 use Whatpm::HTML;
38 wakaba 1.1
## Content model names used in the test files => tokenizer constants.
my %content_model_of = (
  CDATA => Whatpm::HTML::CDATA_CONTENT_MODEL (),
  RCDATA => Whatpm::HTML::RCDATA_CONTENT_MODEL (),
  PCDATA => Whatpm::HTML::PCDATA_CONTENT_MODEL (),
  PLAINTEXT => Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (),
);

## Tokenizer token type constants => token names used in the test files.
my %test_token_name_of = (
  Whatpm::HTML::DOCTYPE_TOKEN () => 'DOCTYPE',
  Whatpm::HTML::START_TAG_TOKEN () => 'StartTag',
  Whatpm::HTML::END_TAG_TOKEN () => 'EndTag',
  Whatpm::HTML::COMMENT_TOKEN () => 'Comment',
  Whatpm::HTML::CHARACTER_TOKEN () => 'Character',
);

## Run every test of every test file.  For each test, and for each
## content model it lists (|PCDATA| when it lists none), the input is fed
## to the |Whatpm::HTML| tokenizer one character at a time, the emitted
## tokens and parse errors are collected in the test file's notation, and
## the Data::Dumper output of that list is compared with the
## Data::Dumper output of the test's expected |output| list.
for my $file_name (grep {$_} split /\s+/, qq[
  ${dir_name}test1.test
  ${dir_name}test2.test
  ${dir_name}test3.test
  ${dir_name}test4.test
  ${dir_name}contentModelFlags.test
  ${dir_name}escapeFlag.test
  ${test_dir_name}tokenizer-test-1.test
]) {
  open my $file, '<', $file_name
      or die "$0: $file_name: $!";
  local $/ = undef;
  my $js = <$file>;
  close $file;

  print "# $file_name\n";

  ## Replace an escaped surrogate pair by the character it encodes.
  ## Only a high surrogate (U+D800..U+DBFF) immediately followed by a
  ## low surrogate (U+DC00..U+DFFF) forms a pair; any other combination
  ## is left untouched.
  $js =~ s{\\u[Dd]([89ABab][0-9A-Fa-f][0-9A-Fa-f])
           \\u[Dd]([C-Fc-f][0-9A-Fa-f][0-9A-Fa-f])}{
    ## NOTE: JSON::Parser does not decode surrogate pair escapes.
    ## NOTE: Older versions of JSON::Parser break utf8 strings while
    ## parsing.  Use the latest version!
    ## NOTE: Encode.pm is broken; it converts e.g. U+10FFFF to U+FFFD.
    my $c = 0x10000;
    $c += ((((hex $1) & 0b1111111111) << 10) | ((hex $2) & 0b1111111111));
    chr $c;
  }gex;

  my $tests = jsonToObj ($js)->{tests};
  TEST: for my $test (@$tests) {
    my $s = $test->{input};

    ## Coalesce adjacent |Character| tokens of the expected output, as
    ## is done below for the tokens actually emitted.  After a merge
    ## the index is kept in place, since the element that moved into
    ## position |$j| might be yet another |Character| token.
    my $j = 1;
    while ($j < @{$test->{output}}) {
      my $prev = $test->{output}->[$j - 1];
      my $this = $test->{output}->[$j];
      if (ref $prev and $prev->[0] eq 'Character' and
          ref $this and $this->[0] eq 'Character') {
        $prev->[1] .= $this->[1];
        splice @{$test->{output}}, $j, 1;
      } else {
        $j++;
      }
    }

    my @cm = @{$test->{contentModelFlags} || ['PCDATA']};
    my $last_start_tag = $test->{lastStartTag};
    for my $cm (@cm) {
      my $p = Whatpm::HTML->new;
      my $i = 0;       ## Offset of the next character of |$s| to read.
      my @token;       ## Tokens and |ParseError|s, in order of emission.

      ## Input stream callback: remembers the last three characters and
      ## stores the code point of the next one (-1 at the end of the
      ## input) in |next_input_character|.
      $p->{set_next_input_character} = sub {
        my $self = shift;

        pop @{$self->{prev_input_character}};
        unshift @{$self->{prev_input_character}}, $self->{next_input_character};

        if ($i >= length $s) {
          $self->{next_input_character} = -1;
          return;
        }
        $self->{next_input_character} = ord substr $s, $i++, 1;

        if ($self->{next_input_character} == 0x000D) { # CR
          ## CR and CR LF are both given to the tokenizer as one LF.
          $i++ if substr ($s, $i, 1) eq "\x0A";
          $self->{next_input_character} = 0x000A; # LF # MUST
        } elsif ($self->{next_input_character} > 0x10FFFF) {
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        } elsif ($self->{next_input_character} == 0x0000) { # NULL
          $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
          push @token, 'ParseError';
        }
      };
      $p->{prev_input_character} = [-1, -1, -1];
      $p->{next_input_character} = -1;

      $p->{parse_error} = sub {
        push @token, 'ParseError';
      };

      $p->_initialize_tokenizer;
      $p->{content_model} = $content_model_of{$cm};
      $p->{last_emitted_start_tag_name} = $last_start_tag;

      while (1) {
        my $token = $p->_get_next_token;
        last if $token->{type} == Whatpm::HTML::END_OF_FILE_TOKEN ();

        ## Convert the token into the array form used by the test
        ## files.  An unknown token type is reported by its raw value.
        my $test_token = [
          $test_token_name_of{$token->{type}} || $token->{type},
        ];
        $test_token->[1] = $token->{tag_name} if defined $token->{tag_name};
        $test_token->[1] = $token->{data} if defined $token->{data};
        if ($token->{type} == Whatpm::HTML::START_TAG_TOKEN ()) {
          $test_token->[2] = {map {$_->{name} => $_->{value}} values %{$token->{attributes}}};
        } elsif ($token->{type} == Whatpm::HTML::DOCTYPE_TOKEN ()) {
          $test_token->[1] = $token->{name};
          $test_token->[2] = $token->{public_identifier};
          $test_token->[3] = $token->{system_identifier};
          $test_token->[4] = $token->{correct} ? 1 : 0;
        }

        ## Coalesce adjacent |Character| tokens.
        if (@token and ref $token[-1] and $token[-1]->[0] eq 'Character' and
            $test_token->[0] eq 'Character') {
          $token[-1]->[1] .= $test_token->[1];
        } else {
          push @token, $test_token;
        }
      }

      my $expected_dump = Dumper ($test->{output});
      my $parser_dump = Dumper (\@token);
      ok $parser_dump, $expected_dump,
          $test->{description} . ': ' . Data::Dumper::qquote ($test->{input});
    }
  }
}
163    
164 wakaba 1.19 ## License: Public Domain.
165 wakaba 1.22 ## $Date: 2007/09/29 04:45:10 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24