/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (hide annotations) (download)
Thu Dec 11 03:18:17 2008 UTC (15 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.13: +4 -4 lines
++ whatpm/Whatpm/ChangeLog	11 Dec 2008 03:17:21 -0000
2008-12-11  Wakaba  <wakaba@suika.fam.cx>

	* URIChecker.pm: Some of |pos_end| values were wrong.

++ whatpm/Whatpm/ContentChecker/ChangeLog	11 Dec 2008 03:18:04 -0000
2008-12-11  Wakaba  <wakaba@suika.fam.cx>

	* HTML.pm: Added support for |pattern| attributes.

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.9 my $default_error_levels = {
11     uri_fact => 'm',
12     uri_lc_must => 'm', ## Non-RFC 2119 "must" (or fact)
13     uri_lc_should => 'w', ## Non-RFC 2119 "should"
14     uri_syntax => 'm',
15 wakaba 1.7
16 wakaba 1.9 rdf_fact => 'm',
17    
18 wakaba 1.11 warn => 'w',
19 wakaba 1.9 uncertain => 'u',
20     };
21    
22     sub check_iri ($$$;$) {
23 wakaba 1.6 require Message::URI::URIReference;
24 wakaba 1.13 my $dom = 'Message::DOM::DOMImplementation';
25 wakaba 1.6 my $uri_o = $dom->create_uri_reference ($_[1]);
26     my $uri_s = $uri_o->uri_reference;
27    
28     local $Error::Depth = $Error::Depth + 1;
29    
30     unless ($uri_o->is_iri_3987) {
31 wakaba 1.9 $_[2]->(type => 'syntax error:iri3987',
32     level => ($_[3] or $default_error_levels)->{uri_syntax});
33 wakaba 1.6 }
34    
35 wakaba 1.9 Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
36 wakaba 1.6 } # check_iri
37    
38 wakaba 1.9 sub check_iri_reference ($$$;$) {
39 wakaba 1.1 my $onerror = $_[2];
40 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
41 wakaba 1.1
42 wakaba 1.10 require Message::DOM::DOMImplementation;
43 wakaba 1.13 my $dom = 'Message::DOM::DOMImplementation';
44 wakaba 1.1 my $uri_o = $dom->create_uri_reference ($_[1]);
45     my $uri_s = $uri_o->uri_reference;
46    
47 wakaba 1.2 ## RFC 3987 4.1.
48 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
49 wakaba 1.9 $onerror->(type => 'syntax error:iriref3987',
50     level => $levels->{uri_syntax});
51     ## MUST (NOTE: A requirement for bidi IRIs.)
52 wakaba 1.1 }
53    
54 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
55     pos ($uri_s) = 0;
56 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
57 wakaba 1.9 $onerror->(type => 'URL:lowercase hexadecimal digit',
58 wakaba 1.11 level => $levels->{uri_lc_should},
59     value => $uri_s,
60 wakaba 1.14 pos_start => $-[0], pos_end => $+[0]);
61 wakaba 1.1 ## shoult not
62     }
63    
64     ## RFC 3986 2.2.
65     ## URI producing applications should percent-encode ... reserved ...
66     ## unless ... allowed by the URI scheme .... --- This is not testable.
67    
68 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
69     pos ($uri_s) = 0;
70 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
71 wakaba 1.9 $onerror->(type => 'URL:percent-encoded unreserved',
72 wakaba 1.11 level => $levels->{uri_lc_should},
73     value => $uri_s,
74 wakaba 1.14 pos_start => $-[0], pos_end => $+[0]);
75 wakaba 1.2 ## should
76     ## should
77 wakaba 1.1 }
78    
79     ## RFC 3986 2.4.
80     ## ... "%" ... must be percent-encoded as "%25" ...
81     ## --- Either syntax error or undetectable if followed by two hexadecimals
82    
83 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
84 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
85 wakaba 1.2 my $scheme_canon;
86 wakaba 1.1 if (defined $scheme) {
87 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
88 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
89     if ($scheme_canon =~ tr/A-Z/a-z/) {
90 wakaba 1.9 $onerror->(type => 'URL:uppercase scheme name',
91 wakaba 1.11 level => $levels->{uri_lc_should},
92     value => $scheme, value_mark => qr/[A-Z]+/);
93 wakaba 1.2 ## should
94 wakaba 1.1 }
95     }
96    
97     ## Note that nothing prevent a conforming URI (if there is one)
98     ## using an unregistered URI scheme...
99    
100 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
101 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
102     if (defined $ui and $ui =~ /:/) {
103 wakaba 1.9 $onerror->(type => 'URL:password', level => $levels->{uri_lc_should});
104 wakaba 1.11 # deprecated, should be considered an error
105     ## NOTE: We intentionally don't set |value| parameter.
106 wakaba 1.1 }
107    
108 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
109 wakaba 1.1 my $host = $uri_o->uri_host;
110     if (defined $host) {
111     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
112 wakaba 1.9 $onerror->(type => 'URL:address format',
113 wakaba 1.11 level => $levels->{warn},
114     text => $1,
115 wakaba 1.14 value => $host, pos_start => $-[1], pos_end => $+[1]);
116 wakaba 1.11 ## NOTE: No conformance creteria is defined for new address format,
117     ## nor is any standardization process.
118     }
119 wakaba 1.2 my $hostnp = $host;
120     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
121     if ($hostnp =~ /[A-Z]/) {
122 wakaba 1.9 $onerror->(type => 'URL:uppercase host',
123     level => $levels->{uri_lc_should},
124 wakaba 1.11 value => $hostnp, value_mark => qr/[A-Z]+/);
125 wakaba 1.2 ## should
126     }
127    
128     if ($host =~ /^\[/) {
129     #
130 wakaba 1.1 } else {
131 wakaba 1.11 my $host_np = Encode::encode ('utf8', $host);
132     $host_np =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
133 wakaba 1.2
134 wakaba 1.11 if ($host_np eq '') {
135 wakaba 1.2 ## NOTE: Although not explicitly mentioned, an empty host
136     ## should be considered as an exception for the recommendation
137     ## that a host "should" be a DNS name.
138 wakaba 1.11 } elsif ($host_np !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
139 wakaba 1.9 $onerror->(type => 'URL:non-DNS host',
140 wakaba 1.11 level => $levels->{uri_lc_should},
141     value => $host_np);
142 wakaba 1.2 ## should
143     ## should be IDNA encoding if wish to maximize interoperability
144     } elsif (length $host > 255) {
145     ## NOTE: This length might be incorrect if there were percent-encoded
146     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
147 wakaba 1.9 $onerror->(type => 'URL:long host',
148 wakaba 1.11 level => $levels->{uri_lc_should},
149     value => $host_np,
150     pos_start => 256, pos_end => length $host);
151 wakaba 1.2 ## should
152 wakaba 1.1 }
153    
154 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
155 wakaba 1.1
156 wakaba 1.2 ## must be UTF-8
157 wakaba 1.11 unless ($host_np =~ /\A(?>
158 wakaba 1.2 [\x00-\x7F] |
159     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
160     [\xE0][\xA0-\xBF][\x80-\xBF] |
161     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
162     [\xED][\x80-\x9F][\x80-\xBF] |
163     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
164     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
165     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
166     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
167     )*\z/x) {
168 wakaba 1.9 $onerror->(type => 'URL:non UTF-8 host',
169 wakaba 1.11 level => $levels->{uri_lc_must},
170     value => $host); # not $host_np
171 wakaba 1.2 # must
172     }
173     }
174     }
175 wakaba 1.1
176 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
177     my $port = $uri_o->uri_port;
178     if (defined $port) {
179     if ($port =~ /\A([0-9]+)\z/) {
180     if ($DefaultPort->{$scheme_canon} == $1) {
181 wakaba 1.9 $onerror->(type => 'URL:default port',
182 wakaba 1.11 level => $levels->{uri_lc_should},
183     value => $port);
184 wakaba 1.2 ## should
185     }
186     } elsif ($port eq '') {
187 wakaba 1.9 $onerror->(type => 'URL:empty port',
188 wakaba 1.11 level => $levels->{uri_lc_should},
189     value => $uri_o->uri_authority,
190     value_mark_end => 1);
191 wakaba 1.2 ## should
192     }
193     }
194 wakaba 1.1
195 wakaba 1.2 ## RFC 3986 3.4.
196     ## ... says that "/" or "?" in query might be problematic for
197     ## old implementations, but also suggest that for readability percent-encoding
198     ## might not be good idea. It provides no recommendation on this issue.
199     ## Therefore, we do no check for this matter.
200    
201     ## RFC 3986 3.5.
202     ## ... says again that "/" or "?" in fragment might be problematic,
203     ## without any recommendation.
204     ## We again left this unchecked.
205    
206     ## RFC 3986 4.4.
207     ## Authors should not assume ... different, though equivalent,
208     ## URI will (or will not) be interpreted as a same-document reference ...
209     ## This is not testable.
210    
211     ## RFC 3986 5.4.2.
212     ## "scheme:relative" should be avoided
213     ## This is not testable without scheme specific information.
214 wakaba 1.1
215 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
216     my $path = $uri_o->uri_path;
217     if (defined $scheme) {
218     if (
219 wakaba 1.11 $path =~ m!/\.\./! or
220 wakaba 1.2 $path =~ m!/\./! or
221     $path =~ m!/\.\.\z! or
222     $path =~ m!/\.\z! or
223     $path =~ m!\A\.\./! or
224     $path =~ m!\A\./! or
225     $path eq '.,' or
226     $path eq '.'
227     ) {
228 wakaba 1.9 $onerror->(type => 'URL:dot-segment',
229 wakaba 1.11 level => $levels->{uri_lc_should},
230     value => $path,
231     value_mark => qr[(?<=/)\.\.?(?=/|\z)|\A\.\.?(?=/|\z)]);
232 wakaba 1.2 ## should
233 wakaba 1.1 }
234     }
235    
236 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
237     my $authority = $uri_o->uri_authority;
238     if (defined $authority) {
239     if ($path eq '') {
240 wakaba 1.9 $onerror->(type => 'URL:empty path',
241 wakaba 1.11 level => $levels->{uri_lc_should},
242 wakaba 1.12 value => $uri_s, value_mark_end => 1);
243 wakaba 1.2 ## should
244     }
245     }
246 wakaba 1.1
247 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
248     ## Scheme dependent default authority should be omitted
249 wakaba 1.1
250 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
251     if (defined $host and $host eq '' and
252     (defined $ui or defined $port)) {
253 wakaba 1.9 $onerror->(type => 'URL:empty host',
254 wakaba 1.11 level => $levels->{uri_lc_should},
255 wakaba 1.12 value => $authority,
256     pos_start => defined $ui ? 1 + length $ui : 0,
257     pos_end => defined $ui ? 1 + length $ui : 0);
258 wakaba 1.2 ## should # when empty authority is allowed
259     }
260    
261     ## RFC 3986 7.5.
262     ## should not ... username or password that is intended to be secret
263     ## This is not testable.
264    
265     ## RFC 3987 4.1.
266     ## MUST be in full logical order
267     ## This is not testable.
268    
269     ## RFC 3987 4.1., 6.4.
270     ## URI scheme dependent syntax
271     ## MUST
272     ## TODO
273    
274     ## RFC 3987 4.2.
275     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
276     ## SHOULD NOT use both rtl and ltr characters
277     ## SHOULD start with rtl if using rtl characters
278     ## TODO
279    
280     ## RFC 3987 5.3.2.2.
281     ## SHOULD be NFC
282     ## NFKC may avoid even more problems
283     ## TODO
284    
285     ## RFC 3987 5.3.3.
286     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
287     ## SHOULD be normalized by Nameprep
288     ## TODO
289 wakaba 1.3
290     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
291 wakaba 1.1 } # check_iri_reference
292    
293 wakaba 1.9 sub check_rdf_uri_reference ($$$;$) {
294 wakaba 1.8 require Message::URI::URIReference;
295 wakaba 1.13 my $dom = 'Message::DOM::DOMImplementation';
296 wakaba 1.8 my $uri_o = $dom->create_uri_reference ($_[1]);
297     my $uri_s = $uri_o->uri_reference;
298    
299 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
300 wakaba 1.8
301     if ($uri_s =~ /[\x00-\x1F\x7F-\x9F]/) {
302 wakaba 1.9 $_[2]->(type => 'syntax error:rdfuriref',
303     level => $levels->{rdf_fact},
304 wakaba 1.8 position => $-[0]);
305     }
306    
307     my $ascii_uri_o = $uri_o->get_uri_reference_3986; # same as RDF spec's one
308    
309 wakaba 1.9 unless ($ascii_uri_o->is_uri) { ## TODO: is_uri_2396 should be used.
310     $_[2]->(#type => 'syntax error:uri2396',
311     type => 'syntax error:uri3986',
312     level => $levels->{uri_fact},
313 wakaba 1.8 value => $ascii_uri_o->uri_reference);
314     }
315    
316     ## TODO: Check against RFC 2396.
317 wakaba 1.9 #Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
318 wakaba 1.8 } # check_rdf_uri_reference
319    
320 wakaba 1.1 1;
321 wakaba 1.14 ## $Date: 2008/12/06 10:05:23 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24