/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (show annotations) (download)
Thu Dec 11 03:18:17 2008 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.13: +4 -4 lines
++ whatpm/Whatpm/ChangeLog	11 Dec 2008 03:17:21 -0000
2008-12-11  Wakaba  <wakaba@suika.fam.cx>

	* URIChecker.pm: Some of |pos_end| values were wrong.

++ whatpm/Whatpm/ContentChecker/ChangeLog	11 Dec 2008 03:18:04 -0000
2008-12-11  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm: Added support for |pattern| attributes.

1 package Whatpm::URIChecker;
2 use strict;
3
4 require Encode;
5
6 our $DefaultPort = {
7 http => 80,
8 };
9
10 my $default_error_levels = {
11 uri_fact => 'm',
12 uri_lc_must => 'm', ## Non-RFC 2119 "must" (or fact)
13 uri_lc_should => 'w', ## Non-RFC 2119 "should"
14 uri_syntax => 'm',
15
16 rdf_fact => 'm',
17
18 warn => 'w',
19 uncertain => 'u',
20 };
21
22 sub check_iri ($$$;$) {
23 require Message::URI::URIReference;
24 my $dom = 'Message::DOM::DOMImplementation';
25 my $uri_o = $dom->create_uri_reference ($_[1]);
26 my $uri_s = $uri_o->uri_reference;
27
28 local $Error::Depth = $Error::Depth + 1;
29
30 unless ($uri_o->is_iri_3987) {
31 $_[2]->(type => 'syntax error:iri3987',
32 level => ($_[3] or $default_error_levels)->{uri_syntax});
33 }
34
35 Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
36 } # check_iri
37
38 sub check_iri_reference ($$$;$) {
39 my $onerror = $_[2];
40 my $levels = $_[3] || $default_error_levels;
41
42 require Message::DOM::DOMImplementation;
43 my $dom = 'Message::DOM::DOMImplementation';
44 my $uri_o = $dom->create_uri_reference ($_[1]);
45 my $uri_s = $uri_o->uri_reference;
46
47 ## RFC 3987 4.1.
48 unless ($uri_o->is_iri_reference_3987) {
49 $onerror->(type => 'syntax error:iriref3987',
50 level => $levels->{uri_syntax});
51 ## MUST (NOTE: A requirement for bidi IRIs.)
52 }
53
54 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
55 pos ($uri_s) = 0;
56 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
57 $onerror->(type => 'URL:lowercase hexadecimal digit',
58 level => $levels->{uri_lc_should},
59 value => $uri_s,
60 pos_start => $-[0], pos_end => $+[0]);
61 ## shoult not
62 }
63
64 ## RFC 3986 2.2.
65 ## URI producing applications should percent-encode ... reserved ...
66 ## unless ... allowed by the URI scheme .... --- This is not testable.
67
68 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
69 pos ($uri_s) = 0;
70 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
71 $onerror->(type => 'URL:percent-encoded unreserved',
72 level => $levels->{uri_lc_should},
73 value => $uri_s,
74 pos_start => $-[0], pos_end => $+[0]);
75 ## should
76 ## should
77 }
78
79 ## RFC 3986 2.4.
80 ## ... "%" ... must be percent-encoded as "%25" ...
81 ## --- Either syntax error or undetectable if followed by two hexadecimals
82
83 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
84 my $scheme = $uri_o->uri_scheme;
85 my $scheme_canon;
86 if (defined $scheme) {
87 $scheme_canon = Encode::encode ('utf8', $scheme);
88 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
89 if ($scheme_canon =~ tr/A-Z/a-z/) {
90 $onerror->(type => 'URL:uppercase scheme name',
91 level => $levels->{uri_lc_should},
92 value => $scheme, value_mark => qr/[A-Z]+/);
93 ## should
94 }
95 }
96
97 ## Note that nothing prevent a conforming URI (if there is one)
98 ## using an unregistered URI scheme...
99
100 ## RFC 3986 3.2.1., 7.5.
101 my $ui = $uri_o->uri_userinfo;
102 if (defined $ui and $ui =~ /:/) {
103 $onerror->(type => 'URL:password', level => $levels->{uri_lc_should});
104 # deprecated, should be considered an error
105 ## NOTE: We intentionally don't set |value| parameter.
106 }
107
108 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
109 my $host = $uri_o->uri_host;
110 if (defined $host) {
111 if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
112 $onerror->(type => 'URL:address format',
113 level => $levels->{warn},
114 text => $1,
115 value => $host, pos_start => $-[1], pos_end => $+[1]);
116 ## NOTE: No conformance creteria is defined for new address format,
117 ## nor is any standardization process.
118 }
119 my $hostnp = $host;
120 $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
121 if ($hostnp =~ /[A-Z]/) {
122 $onerror->(type => 'URL:uppercase host',
123 level => $levels->{uri_lc_should},
124 value => $hostnp, value_mark => qr/[A-Z]+/);
125 ## should
126 }
127
128 if ($host =~ /^\[/) {
129 #
130 } else {
131 my $host_np = Encode::encode ('utf8', $host);
132 $host_np =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
133
134 if ($host_np eq '') {
135 ## NOTE: Although not explicitly mentioned, an empty host
136 ## should be considered as an exception for the recommendation
137 ## that a host "should" be a DNS name.
138 } elsif ($host_np !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
139 $onerror->(type => 'URL:non-DNS host',
140 level => $levels->{uri_lc_should},
141 value => $host_np);
142 ## should
143 ## should be IDNA encoding if wish to maximize interoperability
144 } elsif (length $host > 255) {
145 ## NOTE: This length might be incorrect if there were percent-encoded
146 ## UTF-8 bytes; however, the above condition catches all non-ASCII.
147 $onerror->(type => 'URL:long host',
148 level => $levels->{uri_lc_should},
149 value => $host_np,
150 pos_start => 256, pos_end => length $host);
151 ## should
152 }
153
154 ## FQDN should be followed by "." if necessary --- untestable
155
156 ## must be UTF-8
157 unless ($host_np =~ /\A(?>
158 [\x00-\x7F] |
159 [\xC2-\xDF][\x80-\xBF] | # UTF8-2
160 [\xE0][\xA0-\xBF][\x80-\xBF] |
161 [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
162 [\xED][\x80-\x9F][\x80-\xBF] |
163 [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
164 [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
165 [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
166 [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
167 )*\z/x) {
168 $onerror->(type => 'URL:non UTF-8 host',
169 level => $levels->{uri_lc_must},
170 value => $host); # not $host_np
171 # must
172 }
173 }
174 }
175
176 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
177 my $port = $uri_o->uri_port;
178 if (defined $port) {
179 if ($port =~ /\A([0-9]+)\z/) {
180 if ($DefaultPort->{$scheme_canon} == $1) {
181 $onerror->(type => 'URL:default port',
182 level => $levels->{uri_lc_should},
183 value => $port);
184 ## should
185 }
186 } elsif ($port eq '') {
187 $onerror->(type => 'URL:empty port',
188 level => $levels->{uri_lc_should},
189 value => $uri_o->uri_authority,
190 value_mark_end => 1);
191 ## should
192 }
193 }
194
195 ## RFC 3986 3.4.
196 ## ... says that "/" or "?" in query might be problematic for
197 ## old implementations, but also suggest that for readability percent-encoding
198 ## might not be good idea. It provides no recommendation on this issue.
199 ## Therefore, we do no check for this matter.
200
201 ## RFC 3986 3.5.
202 ## ... says again that "/" or "?" in fragment might be problematic,
203 ## without any recommendation.
204 ## We again left this unchecked.
205
206 ## RFC 3986 4.4.
207 ## Authors should not assume ... different, though equivalent,
208 ## URI will (or will not) be interpreted as a same-document reference ...
209 ## This is not testable.
210
211 ## RFC 3986 5.4.2.
212 ## "scheme:relative" should be avoided
213 ## This is not testable without scheme specific information.
214
215 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
216 my $path = $uri_o->uri_path;
217 if (defined $scheme) {
218 if (
219 $path =~ m!/\.\./! or
220 $path =~ m!/\./! or
221 $path =~ m!/\.\.\z! or
222 $path =~ m!/\.\z! or
223 $path =~ m!\A\.\./! or
224 $path =~ m!\A\./! or
225 $path eq '.,' or
226 $path eq '.'
227 ) {
228 $onerror->(type => 'URL:dot-segment',
229 level => $levels->{uri_lc_should},
230 value => $path,
231 value_mark => qr[(?<=/)\.\.?(?=/|\z)|\A\.\.?(?=/|\z)]);
232 ## should
233 }
234 }
235
236 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
237 my $authority = $uri_o->uri_authority;
238 if (defined $authority) {
239 if ($path eq '') {
240 $onerror->(type => 'URL:empty path',
241 level => $levels->{uri_lc_should},
242 value => $uri_s, value_mark_end => 1);
243 ## should
244 }
245 }
246
247 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
248 ## Scheme dependent default authority should be omitted
249
250 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
251 if (defined $host and $host eq '' and
252 (defined $ui or defined $port)) {
253 $onerror->(type => 'URL:empty host',
254 level => $levels->{uri_lc_should},
255 value => $authority,
256 pos_start => defined $ui ? 1 + length $ui : 0,
257 pos_end => defined $ui ? 1 + length $ui : 0);
258 ## should # when empty authority is allowed
259 }
260
261 ## RFC 3986 7.5.
262 ## should not ... username or password that is intended to be secret
263 ## This is not testable.
264
265 ## RFC 3987 4.1.
266 ## MUST be in full logical order
267 ## This is not testable.
268
269 ## RFC 3987 4.1., 6.4.
270 ## URI scheme dependent syntax
271 ## MUST
272 ## TODO
273
274 ## RFC 3987 4.2.
275 ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
276 ## SHOULD NOT use both rtl and ltr characters
277 ## SHOULD start with rtl if using rtl characters
278 ## TODO
279
280 ## RFC 3987 5.3.2.2.
281 ## SHOULD be NFC
282 ## NFKC may avoid even more problems
283 ## TODO
284
285 ## RFC 3987 5.3.3.
286 ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
287 ## SHOULD be normalized by Nameprep
288 ## TODO
289
290 ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
291 } # check_iri_reference
292
293 sub check_rdf_uri_reference ($$$;$) {
294 require Message::URI::URIReference;
295 my $dom = 'Message::DOM::DOMImplementation';
296 my $uri_o = $dom->create_uri_reference ($_[1]);
297 my $uri_s = $uri_o->uri_reference;
298
299 my $levels = $_[3] || $default_error_levels;
300
301 if ($uri_s =~ /[\x00-\x1F\x7F-\x9F]/) {
302 $_[2]->(type => 'syntax error:rdfuriref',
303 level => $levels->{rdf_fact},
304 position => $-[0]);
305 }
306
307 my $ascii_uri_o = $uri_o->get_uri_reference_3986; # same as RDF spec's one
308
309 unless ($ascii_uri_o->is_uri) { ## TODO: is_uri_2396 should be used.
310 $_[2]->(#type => 'syntax error:uri2396',
311 type => 'syntax error:uri3986',
312 level => $levels->{uri_fact},
313 value => $ascii_uri_o->uri_reference);
314 }
315
316 ## TODO: Check against RFC 2396.
317 #Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
318 } # check_rdf_uri_reference
319
320 1;
321 ## $Date: 2008/12/06 10:05:23 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24