/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (hide annotations) (download)
Sat Aug 30 10:26:39 2008 UTC (16 years, 2 months ago) by wakaba
Branch: MAIN
Changes since 1.11: +5 -3 lines
++ whatpm/t/ChangeLog	30 Aug 2008 10:22:30 -0000
	* ContentChecker.t: Updated for latest version of the
	Whatpm::ContentChecker module.

	* content-model-1.dat, content-model-2.dat, content-model-3.dat,
	content-model-4.dat, content-model-6.dat, content-model-atom-1.dat,
	content-model-atom-2.dat, content-model-atom-threading-1.dat,
	table-1.dat: Results updated.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	30 Aug 2008 10:24:24 -0000
	* ContentChecker.pm: Error level definition for |xml_id_error|
	was missing.

	* URIChecker.pm: The end of the URL should be marked as the
	error location for an empty path error.  The position
	between the userinfo and the port components should be
	marked as the error location for an empty host error.

2008-08-30  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ContentChecker/ChangeLog	30 Aug 2008 10:26:28 -0000
2008-08-30  Wakaba  <wakaba@suika.fam.cx>

	* Atom.pm: s/element missing/child element missing/ for
	consistency.

	* HTML.pm: Typos fixed.
	(pre): "No significant content" error was unintentionally
	disabled.  s/element missing/child element missing/ for
	consistency.

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.9 my $default_error_levels = {
11     uri_fact => 'm',
12     uri_lc_must => 'm', ## Non-RFC 2119 "must" (or fact)
13     uri_lc_should => 'w', ## Non-RFC 2119 "should"
14     uri_syntax => 'm',
15 wakaba 1.7
16 wakaba 1.9 rdf_fact => 'm',
17    
18 wakaba 1.11 warn => 'w',
19 wakaba 1.9 uncertain => 'u',
20     };
21    
22     sub check_iri ($$$;$) {
23 wakaba 1.6 require Message::URI::URIReference;
24     my $dom = Message::DOM::DOMImplementation->new;
25     my $uri_o = $dom->create_uri_reference ($_[1]);
26     my $uri_s = $uri_o->uri_reference;
27    
28     local $Error::Depth = $Error::Depth + 1;
29    
30     unless ($uri_o->is_iri_3987) {
31 wakaba 1.9 $_[2]->(type => 'syntax error:iri3987',
32     level => ($_[3] or $default_error_levels)->{uri_syntax});
33 wakaba 1.6 }
34    
35 wakaba 1.9 Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
36 wakaba 1.6 } # check_iri
37    
38 wakaba 1.9 sub check_iri_reference ($$$;$) {
39 wakaba 1.1 my $onerror = $_[2];
40 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
41 wakaba 1.1
42 wakaba 1.10 require Message::DOM::DOMImplementation;
43 wakaba 1.6 my $dom = Message::DOM::DOMImplementation->new;
44 wakaba 1.1 my $uri_o = $dom->create_uri_reference ($_[1]);
45     my $uri_s = $uri_o->uri_reference;
46    
47 wakaba 1.2 ## RFC 3987 4.1.
48 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
49 wakaba 1.9 $onerror->(type => 'syntax error:iriref3987',
50     level => $levels->{uri_syntax});
51     ## MUST (NOTE: A requirement for bidi IRIs.)
52 wakaba 1.1 }
53    
54 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
55     pos ($uri_s) = 0;
56 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
57 wakaba 1.9 $onerror->(type => 'URL:lowercase hexadecimal digit',
58 wakaba 1.11 level => $levels->{uri_lc_should},
59     value => $uri_s,
60     pos_start => $-[0], pos_end => $+[0] - 1);
61 wakaba 1.1 ## shoult not
62     }
63    
64     ## RFC 3986 2.2.
65     ## URI producing applications should percent-encode ... reserved ...
66     ## unless ... allowed by the URI scheme .... --- This is not testable.
67    
68 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
69     pos ($uri_s) = 0;
70 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
71 wakaba 1.9 $onerror->(type => 'URL:percent-encoded unreserved',
72 wakaba 1.11 level => $levels->{uri_lc_should},
73     value => $uri_s,
74     pos_start => $-[0], pos_end => $+[0] - 1);
75 wakaba 1.2 ## should
76     ## should
77 wakaba 1.1 }
78    
79     ## RFC 3986 2.4.
80     ## ... "%" ... must be percent-encoded as "%25" ...
81     ## --- Either syntax error or undetectable if followed by two hexadecimals
82    
83 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
84 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
85 wakaba 1.2 my $scheme_canon;
86 wakaba 1.1 if (defined $scheme) {
87 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
88 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
89     if ($scheme_canon =~ tr/A-Z/a-z/) {
90 wakaba 1.9 $onerror->(type => 'URL:uppercase scheme name',
91 wakaba 1.11 level => $levels->{uri_lc_should},
92     value => $scheme, value_mark => qr/[A-Z]+/);
93 wakaba 1.2 ## should
94 wakaba 1.1 }
95     }
96    
97     ## Note that nothing prevent a conforming URI (if there is one)
98     ## using an unregistered URI scheme...
99    
100 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
101 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
102     if (defined $ui and $ui =~ /:/) {
103 wakaba 1.9 $onerror->(type => 'URL:password', level => $levels->{uri_lc_should});
104 wakaba 1.11 # deprecated, should be considered an error
105     ## NOTE: We intentionally don't set |value| parameter.
106 wakaba 1.1 }
107    
108 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
109 wakaba 1.1 my $host = $uri_o->uri_host;
110     if (defined $host) {
111     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
112 wakaba 1.9 $onerror->(type => 'URL:address format',
113 wakaba 1.11 level => $levels->{warn},
114     text => $1,
115     value => $host, pos_start => $-[1], pos_end => $+[1] - 1);
116     ## NOTE: No conformance creteria is defined for new address format,
117     ## nor is any standardization process.
118     }
119 wakaba 1.2 my $hostnp = $host;
120     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
121     if ($hostnp =~ /[A-Z]/) {
122 wakaba 1.9 $onerror->(type => 'URL:uppercase host',
123     level => $levels->{uri_lc_should},
124 wakaba 1.11 value => $hostnp, value_mark => qr/[A-Z]+/);
125 wakaba 1.2 ## should
126     }
127    
128     if ($host =~ /^\[/) {
129     #
130 wakaba 1.1 } else {
131 wakaba 1.11 my $host_np = Encode::encode ('utf8', $host);
132     $host_np =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
133 wakaba 1.2
134 wakaba 1.11 if ($host_np eq '') {
135 wakaba 1.2 ## NOTE: Although not explicitly mentioned, an empty host
136     ## should be considered as an exception for the recommendation
137     ## that a host "should" be a DNS name.
138 wakaba 1.11 } elsif ($host_np !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
139 wakaba 1.9 $onerror->(type => 'URL:non-DNS host',
140 wakaba 1.11 level => $levels->{uri_lc_should},
141     value => $host_np);
142 wakaba 1.2 ## should
143     ## should be IDNA encoding if wish to maximize interoperability
144     } elsif (length $host > 255) {
145     ## NOTE: This length might be incorrect if there were percent-encoded
146     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
147 wakaba 1.9 $onerror->(type => 'URL:long host',
148 wakaba 1.11 level => $levels->{uri_lc_should},
149     value => $host_np,
150     pos_start => 256, pos_end => length $host);
151 wakaba 1.2 ## should
152 wakaba 1.1 }
153    
154 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
155 wakaba 1.1
156 wakaba 1.2 ## must be UTF-8
157 wakaba 1.11 unless ($host_np =~ /\A(?>
158 wakaba 1.2 [\x00-\x7F] |
159     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
160     [\xE0][\xA0-\xBF][\x80-\xBF] |
161     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
162     [\xED][\x80-\x9F][\x80-\xBF] |
163     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
164     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
165     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
166     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
167     )*\z/x) {
168 wakaba 1.9 $onerror->(type => 'URL:non UTF-8 host',
169 wakaba 1.11 level => $levels->{uri_lc_must},
170     value => $host); # not $host_np
171 wakaba 1.2 # must
172     }
173     }
174     }
175 wakaba 1.1
176 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
177     my $port = $uri_o->uri_port;
178     if (defined $port) {
179     if ($port =~ /\A([0-9]+)\z/) {
180     if ($DefaultPort->{$scheme_canon} == $1) {
181 wakaba 1.9 $onerror->(type => 'URL:default port',
182 wakaba 1.11 level => $levels->{uri_lc_should},
183     value => $port);
184 wakaba 1.2 ## should
185     }
186     } elsif ($port eq '') {
187 wakaba 1.9 $onerror->(type => 'URL:empty port',
188 wakaba 1.11 level => $levels->{uri_lc_should},
189     value => $uri_o->uri_authority,
190     value_mark_end => 1);
191 wakaba 1.2 ## should
192     }
193     }
194 wakaba 1.1
195 wakaba 1.2 ## RFC 3986 3.4.
196     ## ... says that "/" or "?" in query might be problematic for
197     ## old implementations, but also suggest that for readability percent-encoding
198     ## might not be good idea. It provides no recommendation on this issue.
199     ## Therefore, we do no check for this matter.
200    
201     ## RFC 3986 3.5.
202     ## ... says again that "/" or "?" in fragment might be problematic,
203     ## without any recommendation.
204     ## We again left this unchecked.
205    
206     ## RFC 3986 4.4.
207     ## Authors should not assume ... different, though equivalent,
208     ## URI will (or will not) be interpreted as a same-document reference ...
209     ## This is not testable.
210    
211     ## RFC 3986 5.4.2.
212     ## "scheme:relative" should be avoided
213     ## This is not testable without scheme specific information.
214 wakaba 1.1
215 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
216     my $path = $uri_o->uri_path;
217     if (defined $scheme) {
218     if (
219 wakaba 1.11 $path =~ m!/\.\./! or
220 wakaba 1.2 $path =~ m!/\./! or
221     $path =~ m!/\.\.\z! or
222     $path =~ m!/\.\z! or
223     $path =~ m!\A\.\./! or
224     $path =~ m!\A\./! or
225     $path eq '.,' or
226     $path eq '.'
227     ) {
228 wakaba 1.9 $onerror->(type => 'URL:dot-segment',
229 wakaba 1.11 level => $levels->{uri_lc_should},
230     value => $path,
231     value_mark => qr[(?<=/)\.\.?(?=/|\z)|\A\.\.?(?=/|\z)]);
232 wakaba 1.2 ## should
233 wakaba 1.1 }
234     }
235    
236 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
237     my $authority = $uri_o->uri_authority;
238     if (defined $authority) {
239     if ($path eq '') {
240 wakaba 1.9 $onerror->(type => 'URL:empty path',
241 wakaba 1.11 level => $levels->{uri_lc_should},
242 wakaba 1.12 value => $uri_s, value_mark_end => 1);
243 wakaba 1.2 ## should
244     }
245     }
246 wakaba 1.1
247 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
248     ## Scheme dependent default authority should be omitted
249 wakaba 1.1
250 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
251     if (defined $host and $host eq '' and
252     (defined $ui or defined $port)) {
253 wakaba 1.9 $onerror->(type => 'URL:empty host',
254 wakaba 1.11 level => $levels->{uri_lc_should},
255 wakaba 1.12 value => $authority,
256     pos_start => defined $ui ? 1 + length $ui : 0,
257     pos_end => defined $ui ? 1 + length $ui : 0);
258 wakaba 1.2 ## should # when empty authority is allowed
259     }
260    
261     ## RFC 3986 7.5.
262     ## should not ... username or password that is intended to be secret
263     ## This is not testable.
264    
265     ## RFC 3987 4.1.
266     ## MUST be in full logical order
267     ## This is not testable.
268    
269     ## RFC 3987 4.1., 6.4.
270     ## URI scheme dependent syntax
271     ## MUST
272     ## TODO
273    
274     ## RFC 3987 4.2.
275     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
276     ## SHOULD NOT use both rtl and ltr characters
277     ## SHOULD start with rtl if using rtl characters
278     ## TODO
279    
280     ## RFC 3987 5.3.2.2.
281     ## SHOULD be NFC
282     ## NFKC may avoid even more problems
283     ## TODO
284    
285     ## RFC 3987 5.3.3.
286     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
287     ## SHOULD be normalized by Nameprep
288     ## TODO
289 wakaba 1.3
290     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
291 wakaba 1.1 } # check_iri_reference
292    
293 wakaba 1.9 sub check_rdf_uri_reference ($$$;$) {
294 wakaba 1.8 require Message::URI::URIReference;
295     my $dom = Message::DOM::DOMImplementation->new;
296     my $uri_o = $dom->create_uri_reference ($_[1]);
297     my $uri_s = $uri_o->uri_reference;
298    
299 wakaba 1.9 my $levels = $_[3] || $default_error_levels;
300 wakaba 1.8
301     if ($uri_s =~ /[\x00-\x1F\x7F-\x9F]/) {
302 wakaba 1.9 $_[2]->(type => 'syntax error:rdfuriref',
303     level => $levels->{rdf_fact},
304 wakaba 1.8 position => $-[0]);
305     }
306    
307     my $ascii_uri_o = $uri_o->get_uri_reference_3986; # same as RDF spec's one
308    
309 wakaba 1.9 unless ($ascii_uri_o->is_uri) { ## TODO: is_uri_2396 should be used.
310     $_[2]->(#type => 'syntax error:uri2396',
311     type => 'syntax error:uri3986',
312     level => $levels->{uri_fact},
313 wakaba 1.8 value => $ascii_uri_o->uri_reference);
314     }
315    
316     ## TODO: Check against RFC 2396.
317 wakaba 1.9 #Whatpm::URIChecker->check_iri_reference ($_[1], $_[2], $_[3]);
318 wakaba 1.8 } # check_rdf_uri_reference
319    
320 wakaba 1.1 1;
321 wakaba 1.12 ## $Date: 2008/08/30 05:31:38 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24