/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (hide annotations) (download)
Sat Jun 30 13:12:33 2007 UTC (17 years, 4 months ago) by wakaba
Branch: MAIN
Changes since 1.4: +2 -3 lines
++ whatpm/t/ChangeLog	30 Jun 2007 12:28:52 -0000
2007-06-30  Wakaba  <wakaba@suika.fam.cx>

	* URIChecker.t: Error level names in test results has
	been changed.

	* tokenizer-test-1.test: A test for bogus SYSTEM identifier
	is added.

	* content-model-1.dat, content-model-2.dat, content-model-3.dat,
	content-model-4.dat: Error messages has been changed.

	* ContentChecker.t: Appends error level to the error
	message if any.

++ whatpm/Whatpm/ChangeLog	30 Jun 2007 13:03:50 -0000
2007-06-30  Wakaba  <wakaba@suika.fam.cx>

	* IMTChecker.pm: Report warning for unregistered
	and private types/subtypes.

	* ContentChecker.pm, HTML.pm.src, IMTChecker.pm,
	URIChecker.pm, HTMLTable.pm: Error messages are now
	consistent; they are all listed in
	<http://suika.fam.cx/gate/2005/sw/Whatpm%20Error%20Types>.

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.1 sub check_iri_reference ($$$) {
11     my $onerror = $_[2];
12    
13     require Message::URI::URIReference; ## From manakai
14     my $dom = 'Message::DOM::DOMImplementation'; ## ISSUE: This is not a formal way to instantiate it.
15    
16     my $uri_o = $dom->create_uri_reference ($_[1]);
17     my $uri_s = $uri_o->uri_reference;
18    
19     local $Error::Depth = $Error::Depth + 1;
20    
21 wakaba 1.2 ## RFC 3987 4.1.
22 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
23     $onerror->(type => 'syntax error', level => 'm');
24 wakaba 1.2 ## MUST
25 wakaba 1.1 }
26    
27 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
28     pos ($uri_s) = 0;
29 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
30     $onerror->(type => 'lowercase hexadecimal digit',
31     position => $-[0] + 1, level => 's');
32     ## shoult not
33     }
34    
35     ## RFC 3986 2.2.
36     ## URI producing applications should percent-encode ... reserved ...
37     ## unless ... allowed by the URI scheme .... --- This is not testable.
38    
39 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
40     pos ($uri_s) = 0;
41 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
42     $onerror->(type => 'percent-encoded unreserved',
43     position => $-[0] + 1, level => 's');
44 wakaba 1.2 ## should
45     ## should
46 wakaba 1.1 }
47    
48     ## RFC 3986 2.4.
49     ## ... "%" ... must be percent-encoded as "%25" ...
50     ## --- Either syntax error or undetectable if followed by two hexadecimals
51    
52 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
53 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
54 wakaba 1.2 my $scheme_canon;
55 wakaba 1.1 if (defined $scheme) {
56 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
57 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
58     if ($scheme_canon =~ tr/A-Z/a-z/) {
59     $onerror->(type => 'uppercase scheme name', level => 's');
60     ## should
61 wakaba 1.1 }
62     }
63    
64     ## Note that nothing prevent a conforming URI (if there is one)
65     ## using an unregistered URI scheme...
66    
67 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
68 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
69     if (defined $ui and $ui =~ /:/) {
70     $onerror->(type => 'password', level => 's');
71     ## deprecated
72     }
73    
74 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
75 wakaba 1.1 my $host = $uri_o->uri_host;
76     if (defined $host) {
77     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
78 wakaba 1.5 $onerror->(type => 'address format:'.$1, level => 'unsupported');
79 wakaba 1.2 }
80     my $hostnp = $host;
81     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
82     if ($hostnp =~ /[A-Z]/) {
83     $onerror->(type => 'uppercase host',
84     level => 's');
85     ## should
86     }
87    
88     if ($host =~ /^\[/) {
89     #
90 wakaba 1.1 } else {
91 wakaba 1.2 $host = Encode::encode ('utf8', $host);
92     $host =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
93    
94     if ($host eq '') {
95     ## NOTE: Although not explicitly mentioned, an empty host
96     ## should be considered as an exception for the recommendation
97     ## that a host "should" be a DNS name.
98     } elsif ($host !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
99     $onerror->(type => 'non-DNS host', level => 's');
100     ## should
101     ## should be IDNA encoding if wish to maximize interoperability
102     } elsif (length $host > 255) {
103     ## NOTE: This length might be incorrect if there were percent-encoded
104     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
105     $onerror->(type => 'long host', level => 's');
106     ## should
107 wakaba 1.1 }
108    
109 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
110 wakaba 1.1
111 wakaba 1.2 ## must be UTF-8
112     unless ($host =~ /\A(?>
113     [\x00-\x7F] |
114     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
115     [\xE0][\xA0-\xBF][\x80-\xBF] |
116     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
117     [\xED][\x80-\x9F][\x80-\xBF] |
118     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
119     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
120     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
121     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
122     )*\z/x) {
123     $onerror->(type => 'non UTF-8 host', level => 'm');
124     # must
125     }
126     }
127     }
128 wakaba 1.1
129 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
130     my $port = $uri_o->uri_port;
131     if (defined $port) {
132     if ($port =~ /\A([0-9]+)\z/) {
133     if ($DefaultPort->{$scheme_canon} == $1) {
134     $onerror->(type => 'default port', level => 's');
135     ## should
136     }
137     } elsif ($port eq '') {
138     $onerror->(type => 'empty port', level => 's');
139     ## should
140     }
141     }
142 wakaba 1.1
143 wakaba 1.2 ## RFC 3986 3.4.
144     ## ... says that "/" or "?" in query might be problematic for
145     ## old implementations, but also suggest that for readability percent-encoding
146     ## might not be good idea. It provides no recommendation on this issue.
147     ## Therefore, we do no check for this matter.
148    
149     ## RFC 3986 3.5.
150     ## ... says again that "/" or "?" in fragment might be problematic,
151     ## without any recommendation.
152     ## We again left this unchecked.
153    
154     ## RFC 3986 4.4.
155     ## Authors should not assume ... different, though equivalent,
156     ## URI will (or will not) be interpreted as a same-document reference ...
157     ## This is not testable.
158    
159     ## RFC 3986 5.4.2.
160     ## "scheme:relative" should be avoided
161     ## This is not testable without scheme specific information.
162 wakaba 1.1
163 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
164     my $path = $uri_o->uri_path;
165     if (defined $scheme) {
166     if (
167     $path =~ m!/\.\.! or
168     $path =~ m!/\./! or
169     $path =~ m!/\.\.\z! or
170     $path =~ m!/\.\z! or
171     $path =~ m!\A\.\./! or
172     $path =~ m!\A\./! or
173     $path eq '.,' or
174     $path eq '.'
175     ) {
176     $onerror->(type => 'dot-segment', level => 's');
177     ## should
178 wakaba 1.1 }
179     }
180    
181 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
182     my $authority = $uri_o->uri_authority;
183     if (defined $authority) {
184     if ($path eq '') {
185     $onerror->(type => 'empty path', level => 's');
186     ## should
187     }
188     }
189 wakaba 1.1
190 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
191     ## Scheme dependent default authority should be omitted
192 wakaba 1.1
193 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
194     if (defined $host and $host eq '' and
195     (defined $ui or defined $port)) {
196     $onerror->(type => 'empty host', level => 's');
197     ## should # when empty authority is allowed
198     }
199    
200     ## RFC 3986 7.5.
201     ## should not ... username or password that is intended to be secret
202     ## This is not testable.
203    
204     ## RFC 3987 4.1.
205     ## MUST be in full logical order
206     ## This is not testable.
207    
208     ## RFC 3987 4.1., 6.4.
209     ## URI scheme dependent syntax
210     ## MUST
211     ## TODO
212    
213     ## RFC 3987 4.2.
214     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
215     ## SHOULD NOT use both rtl and ltr characters
216     ## SHOULD start with rtl if using rtl characters
217     ## TODO
218    
219     ## RFC 3987 5.3.2.2.
220     ## SHOULD be NFC
221     ## NFKC may avoid even more problems
222     ## TODO
223    
224     ## RFC 3987 5.3.3.
225     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
226     ## SHOULD be normalized by Nameprep
227     ## TODO
228 wakaba 1.3
229     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
230 wakaba 1.1 } # check_iri_reference
231    
232     1;
233 wakaba 1.5 ## $Date: 2007/06/24 14:24:21 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24