/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (hide annotations) (download)
Sat Aug 30 05:31:38 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.10: +44 -22 lines
++ whatpm/t/ChangeLog	30 Aug 2008 05:31:31 -0000
	* URIChecker.t: Error results updated.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 05:31:16 -0000
	* URIChecker.pm: Set parameters representing where in the
	value the error occurs for errors.  Report unknown
	address format error in warning level, since address
	formats are rarely added.  Path segments starting with "/.."
	were misinterpreted as a dot-segment.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.9 my $default_error_levels = {
11     uri_fact => 'm',
12     uri_lc_must => 'm', ## Non-RFC 2119 "must" (or fact)
13     uri_lc_should => 'w', ## Non-RFC 2119 "should"
14     uri_syntax => 'm',
15 wakaba 1.7
16 wakaba 1.9 rdf_fact => 'm',
17    
18 wakaba 1.11 warn => 'w',
19 wakaba 1.9 uncertain => 'u',
20     };
21    
22     sub check_iri ($$$;$) {
23 wakaba 1.6 require Message::URI::URIReference;
24     my $dom = Message::DOM::DOMImplementation->new;
25     my $uri_o = $dom->create_uri_reference ($_[1]);
26     my $uri_s = $uri_o->uri_reference;
27    
28     local $Error::Depth = $Error::Depth + 1;
29    
30     unless ($uri_o->is_iri_3987) {
31 wakaba 1.9 $_[2]->(type => 'syntax error:iri3987',
32     level => ($_[3] or $default_error_levels)->{uri_syntax});
33 wakaba 1.6 }
34    
35 wakaba 1.9 Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
36 wakaba 1.6 } # check_iri
37    
38 wakaba 1.9 sub check_iri_reference ($$$;$) {
39 wakaba 1.1 my $onerror = $_[2];
40 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
41 wakaba 1.1
42 wakaba 1.10 require Message::DOM::DOMImplementation;
43 wakaba 1.6 my $dom = Message::DOM::DOMImplementation->new;
44 wakaba 1.1 my $uri_o = $dom->create_uri_reference ($_[1]);
45     my $uri_s = $uri_o->uri_reference;
46    
47 wakaba 1.2 ## RFC 3987 4.1.
48 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
49 wakaba 1.9 $onerror->(type => 'syntax error:iriref3987',
50     level => $levels->{uri_syntax});
51     ## MUST (NOTE: A requirement for bidi IRIs.)
52 wakaba 1.1 }
53    
54 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
55     pos ($uri_s) = 0;
56 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
57 wakaba 1.9 $onerror->(type => 'URL:lowercase hexadecimal digit',
58 wakaba 1.11 level => $levels->{uri_lc_should},
59     value => $uri_s,
60     pos_start => $-[0], pos_end => $+[0] - 1);
61 wakaba 1.1 ## shoult not
62     }
63    
64     ## RFC 3986 2.2.
65     ## URI producing applications should percent-encode ... reserved ...
66     ## unless ... allowed by the URI scheme .... --- This is not testable.
67    
68 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
69     pos ($uri_s) = 0;
70 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
71 wakaba 1.9 $onerror->(type => 'URL:percent-encoded unreserved',
72 wakaba 1.11 level => $levels->{uri_lc_should},
73     value => $uri_s,
74     pos_start => $-[0], pos_end => $+[0] - 1);
75 wakaba 1.2 ## should
76     ## should
77 wakaba 1.1 }
78    
79     ## RFC 3986 2.4.
80     ## ... "%" ... must be percent-encoded as "%25" ...
81     ## --- Either syntax error or undetectable if followed by two hexadecimals
82    
83 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
84 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
85 wakaba 1.2 my $scheme_canon;
86 wakaba 1.1 if (defined $scheme) {
87 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
88 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
89     if ($scheme_canon =~ tr/A-Z/a-z/) {
90 wakaba 1.9 $onerror->(type => 'URL:uppercase scheme name',
91 wakaba 1.11 level => $levels->{uri_lc_should},
92     value => $scheme, value_mark => qr/[A-Z]+/);
93 wakaba 1.2 ## should
94 wakaba 1.1 }
95     }
96    
97     ## Note that nothing prevent a conforming URI (if there is one)
98     ## using an unregistered URI scheme...
99    
100 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
101 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
102     if (defined $ui and $ui =~ /:/) {
103 wakaba 1.9 $onerror->(type => 'URL:password', level => $levels->{uri_lc_should});
104 wakaba 1.11 # deprecated, should be considered an error
105     ## NOTE: We intentionally don't set |value| parameter.
106 wakaba 1.1 }
107    
108 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
109 wakaba 1.1 my $host = $uri_o->uri_host;
110     if (defined $host) {
111     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
112 wakaba 1.9 $onerror->(type => 'URL:address format',
113 wakaba 1.11 level => $levels->{warn},
114     text => $1,
115     value => $host, pos_start => $-[1], pos_end => $+[1] - 1);
116     ## NOTE: No conformance creteria is defined for new address format,
117     ## nor is any standardization process.
118     }
119 wakaba 1.2 my $hostnp = $host;
120     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
121     if ($hostnp =~ /[A-Z]/) {
122 wakaba 1.9 $onerror->(type => 'URL:uppercase host',
123     level => $levels->{uri_lc_should},
124 wakaba 1.11 value => $hostnp, value_mark => qr/[A-Z]+/);
125 wakaba 1.2 ## should
126     }
127    
128     if ($host =~ /^\[/) {
129     #
130 wakaba 1.1 } else {
131 wakaba 1.11 my $host_np = Encode::encode ('utf8', $host);
132     $host_np =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
133 wakaba 1.2
134 wakaba 1.11 if ($host_np eq '') {
135 wakaba 1.2 ## NOTE: Although not explicitly mentioned, an empty host
136     ## should be considered as an exception for the recommendation
137     ## that a host "should" be a DNS name.
138 wakaba 1.11 } elsif ($host_np !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
139 wakaba 1.9 $onerror->(type => 'URL:non-DNS host',
140 wakaba 1.11 level => $levels->{uri_lc_should},
141     value => $host_np);
142 wakaba 1.2 ## should
143     ## should be IDNA encoding if wish to maximize interoperability
144     } elsif (length $host > 255) {
145     ## NOTE: This length might be incorrect if there were percent-encoded
146     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
147 wakaba 1.9 $onerror->(type => 'URL:long host',
148 wakaba 1.11 level => $levels->{uri_lc_should},
149     value => $host_np,
150     pos_start => 256, pos_end => length $host);
151 wakaba 1.2 ## should
152 wakaba 1.1 }
153    
154 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
155 wakaba 1.1
156 wakaba 1.2 ## must be UTF-8
157 wakaba 1.11 unless ($host_np =~ /\A(?>
158 wakaba 1.2 [\x00-\x7F] |
159     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
160     [\xE0][\xA0-\xBF][\x80-\xBF] |
161     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
162     [\xED][\x80-\x9F][\x80-\xBF] |
163     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
164     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
165     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
166     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
167     )*\z/x) {
168 wakaba 1.9 $onerror->(type => 'URL:non UTF-8 host',
169 wakaba 1.11 level => $levels->{uri_lc_must},
170     value => $host); # not $host_np
171 wakaba 1.2 # must
172     }
173     }
174     }
175 wakaba 1.1
176 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
177     my $port = $uri_o->uri_port;
178     if (defined $port) {
179     if ($port =~ /\A([0-9]+)\z/) {
180     if ($DefaultPort->{$scheme_canon} == $1) {
181 wakaba 1.9 $onerror->(type => 'URL:default port',
182 wakaba 1.11 level => $levels->{uri_lc_should},
183     value => $port);
184 wakaba 1.2 ## should
185     }
186     } elsif ($port eq '') {
187 wakaba 1.9 $onerror->(type => 'URL:empty port',
188 wakaba 1.11 level => $levels->{uri_lc_should},
189     value => $uri_o->uri_authority,
190     value_mark_end => 1);
191 wakaba 1.2 ## should
192     }
193     }
194 wakaba 1.1
195 wakaba 1.2 ## RFC 3986 3.4.
196     ## ... says that "/" or "?" in query might be problematic for
197     ## old implementations, but also suggest that for readability percent-encoding
198     ## might not be good idea. It provides no recommendation on this issue.
199     ## Therefore, we do no check for this matter.
200    
201     ## RFC 3986 3.5.
202     ## ... says again that "/" or "?" in fragment might be problematic,
203     ## without any recommendation.
204     ## We again left this unchecked.
205    
206     ## RFC 3986 4.4.
207     ## Authors should not assume ... different, though equivalent,
208     ## URI will (or will not) be interpreted as a same-document reference ...
209     ## This is not testable.
210    
211     ## RFC 3986 5.4.2.
212     ## "scheme:relative" should be avoided
213     ## This is not testable without scheme specific information.
214 wakaba 1.1
215 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
216     my $path = $uri_o->uri_path;
217     if (defined $scheme) {
218     if (
219 wakaba 1.11 $path =~ m!/\.\./! or
220 wakaba 1.2 $path =~ m!/\./! or
221     $path =~ m!/\.\.\z! or
222     $path =~ m!/\.\z! or
223     $path =~ m!\A\.\./! or
224     $path =~ m!\A\./! or
225     $path eq '.,' or
226     $path eq '.'
227     ) {
228 wakaba 1.9 $onerror->(type => 'URL:dot-segment',
229 wakaba 1.11 level => $levels->{uri_lc_should},
230     value => $path,
231     value_mark => qr[(?<=/)\.\.?(?=/|\z)|\A\.\.?(?=/|\z)]);
232 wakaba 1.2 ## should
233 wakaba 1.1 }
234     }
235    
236 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
237     my $authority = $uri_o->uri_authority;
238     if (defined $authority) {
239     if ($path eq '') {
240 wakaba 1.9 $onerror->(type => 'URL:empty path',
241 wakaba 1.11 level => $levels->{uri_lc_should},
242     value => $uri_s);
243 wakaba 1.2 ## should
244     }
245     }
246 wakaba 1.1
247 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
248     ## Scheme dependent default authority should be omitted
249 wakaba 1.1
250 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
251     if (defined $host and $host eq '' and
252     (defined $ui or defined $port)) {
253 wakaba 1.9 $onerror->(type => 'URL:empty host',
254 wakaba 1.11 level => $levels->{uri_lc_should},
255     value => $authority);
256 wakaba 1.2 ## should # when empty authority is allowed
257     }
258    
259     ## RFC 3986 7.5.
260     ## should not ... username or password that is intended to be secret
261     ## This is not testable.
262    
263     ## RFC 3987 4.1.
264     ## MUST be in full logical order
265     ## This is not testable.
266    
267     ## RFC 3987 4.1., 6.4.
268     ## URI scheme dependent syntax
269     ## MUST
270     ## TODO
271    
272     ## RFC 3987 4.2.
273     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
274     ## SHOULD NOT use both rtl and ltr characters
275     ## SHOULD start with rtl if using rtl characters
276     ## TODO
277    
278     ## RFC 3987 5.3.2.2.
279     ## SHOULD be NFC
280     ## NFKC may avoid even more problems
281     ## TODO
282    
283     ## RFC 3987 5.3.3.
284     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
285     ## SHOULD be normalized by Nameprep
286     ## TODO
287 wakaba 1.3
288     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
289 wakaba 1.1 } # check_iri_reference
290    
291 wakaba 1.9 sub check_rdf_uri_reference ($$$;$) {
292 wakaba 1.8 require Message::URI::URIReference;
293     my $dom = Message::DOM::DOMImplementation->new;
294     my $uri_o = $dom->create_uri_reference ($_[1]);
295     my $uri_s = $uri_o->uri_reference;
296    
297 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
298 wakaba 1.8
299     if ($uri_s =~ /[\x00-\x1F\x7F-\x9F]/) {
300 wakaba 1.9 $_[2]->(type => 'syntax error:rdfuriref',
301     level => $levels->{rdf_fact},
302 wakaba 1.8 position => $-[0]);
303     }
304    
305     my $ascii_uri_o = $uri_o->get_uri_reference_3986; # same as RDF spec's one
306    
307 wakaba 1.9 unless ($ascii_uri_o->is_uri) { ## TODO: is_uri_2396 should be used.
308     $_[2]->(#type => 'syntax error:uri2396',
309     type => 'syntax error:uri3986',
310     level => $levels->{uri_fact},
311 wakaba 1.8 value => $ascii_uri_o->uri_reference);
312     }
313    
314     ## TODO: Check against RFC 2396.
315 wakaba 1.9 #Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
316 wakaba 1.8 } # check_rdf_uri_reference
317    
318 wakaba 1.1 1;
319 wakaba 1.11 ## $Date: 2008/08/30 04:31:57 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24