/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (hide annotations) (download)
Fri Mar 21 10:58:30 2008 UTC (16 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.7: +28 -1 lines
++ whatpm/Whatpm/ChangeLog	21 Mar 2008 10:58:23 -0000
	* RDFXML.pm: TODO items noted.  Validation of ID
	and URI attributes is implemented.  Warn if unknown
	value is used in rdf:parseType="" attribute.

	* URIChecker.pm (check_rdf_uri_reference): New function.

2008-03-21  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.8 our $FactLevel = 'm';
11 wakaba 1.7 our $MustLevel = 'm'; ## Non-RFC 2119 "must" (or fact)
12     our $ShouldLevel = 'w'; ## Non-RFC 2119 "should"
13    
14 wakaba 1.6 sub check_iri ($$$) {
15     require Message::URI::URIReference;
16     my $dom = Message::DOM::DOMImplementation->new;
17     my $uri_o = $dom->create_uri_reference ($_[1]);
18     my $uri_s = $uri_o->uri_reference;
19    
20     local $Error::Depth = $Error::Depth + 1;
21    
22     unless ($uri_o->is_iri_3987) {
23 wakaba 1.7 $_[2]->(type => 'syntax error:iri3987', level => $MustLevel);
24 wakaba 1.6 ## MUST
25     }
26    
27     Whatpm::URIChecker->check_iri_reference ($_[1], $_[2]);
28     } # check_iri
29    
30 wakaba 1.1 sub check_iri_reference ($$$) {
31     my $onerror = $_[2];
32    
33 wakaba 1.6 require Message::URI::URIReference;
34     my $dom = Message::DOM::DOMImplementation->new;
35 wakaba 1.1 my $uri_o = $dom->create_uri_reference ($_[1]);
36     my $uri_s = $uri_o->uri_reference;
37    
38     local $Error::Depth = $Error::Depth + 1;
39    
40 wakaba 1.2 ## RFC 3987 4.1.
41 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
42 wakaba 1.7 $onerror->(type => 'syntax error:iriref3987', level => $MustLevel);
43 wakaba 1.2 ## MUST
44 wakaba 1.1 }
45    
46 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
47     pos ($uri_s) = 0;
48 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
49     $onerror->(type => 'lowercase hexadecimal digit',
50 wakaba 1.7 position => $-[0] + 1, level => $ShouldLevel);
51 wakaba 1.1 ## shoult not
52     }
53    
54     ## RFC 3986 2.2.
55     ## URI producing applications should percent-encode ... reserved ...
56     ## unless ... allowed by the URI scheme .... --- This is not testable.
57    
58 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
59     pos ($uri_s) = 0;
60 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
61     $onerror->(type => 'percent-encoded unreserved',
62 wakaba 1.7 position => $-[0] + 1, level => $ShouldLevel);
63 wakaba 1.2 ## should
64     ## should
65 wakaba 1.1 }
66    
67     ## RFC 3986 2.4.
68     ## ... "%" ... must be percent-encoded as "%25" ...
69     ## --- Either syntax error or undetectable if followed by two hexadecimals
70    
71 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
72 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
73 wakaba 1.2 my $scheme_canon;
74 wakaba 1.1 if (defined $scheme) {
75 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
76 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
77     if ($scheme_canon =~ tr/A-Z/a-z/) {
78 wakaba 1.7 $onerror->(type => 'uppercase scheme name', level => $ShouldLevel);
79 wakaba 1.2 ## should
80 wakaba 1.1 }
81     }
82    
83     ## Note that nothing prevent a conforming URI (if there is one)
84     ## using an unregistered URI scheme...
85    
86 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
87 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
88     if (defined $ui and $ui =~ /:/) {
89 wakaba 1.7 $onerror->(type => 'password', level => $ShouldLevel);
90 wakaba 1.1 ## deprecated
91     }
92    
93 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
94 wakaba 1.1 my $host = $uri_o->uri_host;
95     if (defined $host) {
96     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
97 wakaba 1.5 $onerror->(type => 'address format:'.$1, level => 'unsupported');
98 wakaba 1.2 }
99     my $hostnp = $host;
100     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
101     if ($hostnp =~ /[A-Z]/) {
102 wakaba 1.7 $onerror->(type => 'uppercase host', level => $ShouldLevel);
103 wakaba 1.2 ## should
104     }
105    
106     if ($host =~ /^\[/) {
107     #
108 wakaba 1.1 } else {
109 wakaba 1.2 $host = Encode::encode ('utf8', $host);
110     $host =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
111    
112     if ($host eq '') {
113     ## NOTE: Although not explicitly mentioned, an empty host
114     ## should be considered as an exception for the recommendation
115     ## that a host "should" be a DNS name.
116     } elsif ($host !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
117 wakaba 1.7 $onerror->(type => 'non-DNS host', level => $ShouldLevel);
118 wakaba 1.2 ## should
119     ## should be IDNA encoding if wish to maximize interoperability
120     } elsif (length $host > 255) {
121     ## NOTE: This length might be incorrect if there were percent-encoded
122     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
123 wakaba 1.7 $onerror->(type => 'long host', level => $ShouldLevel);
124 wakaba 1.2 ## should
125 wakaba 1.1 }
126    
127 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
128 wakaba 1.1
129 wakaba 1.2 ## must be UTF-8
130     unless ($host =~ /\A(?>
131     [\x00-\x7F] |
132     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
133     [\xE0][\xA0-\xBF][\x80-\xBF] |
134     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
135     [\xED][\x80-\x9F][\x80-\xBF] |
136     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
137     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
138     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
139     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
140     )*\z/x) {
141 wakaba 1.7 $onerror->(type => 'non UTF-8 host', level => $MustLevel);
142 wakaba 1.2 # must
143     }
144     }
145     }
146 wakaba 1.1
147 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
148     my $port = $uri_o->uri_port;
149     if (defined $port) {
150     if ($port =~ /\A([0-9]+)\z/) {
151     if ($DefaultPort->{$scheme_canon} == $1) {
152 wakaba 1.7 $onerror->(type => 'default port', level => $ShouldLevel);
153 wakaba 1.2 ## should
154     }
155     } elsif ($port eq '') {
156 wakaba 1.7 $onerror->(type => 'empty port', level => $ShouldLevel);
157 wakaba 1.2 ## should
158     }
159     }
160 wakaba 1.1
161 wakaba 1.2 ## RFC 3986 3.4.
162     ## ... says that "/" or "?" in query might be problematic for
163     ## old implementations, but also suggest that for readability percent-encoding
164     ## might not be good idea. It provides no recommendation on this issue.
165     ## Therefore, we do no check for this matter.
166    
167     ## RFC 3986 3.5.
168     ## ... says again that "/" or "?" in fragment might be problematic,
169     ## without any recommendation.
170     ## We again left this unchecked.
171    
172     ## RFC 3986 4.4.
173     ## Authors should not assume ... different, though equivalent,
174     ## URI will (or will not) be interpreted as a same-document reference ...
175     ## This is not testable.
176    
177     ## RFC 3986 5.4.2.
178     ## "scheme:relative" should be avoided
179     ## This is not testable without scheme specific information.
180 wakaba 1.1
181 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
182     my $path = $uri_o->uri_path;
183     if (defined $scheme) {
184     if (
185     $path =~ m!/\.\.! or
186     $path =~ m!/\./! or
187     $path =~ m!/\.\.\z! or
188     $path =~ m!/\.\z! or
189     $path =~ m!\A\.\./! or
190     $path =~ m!\A\./! or
191     $path eq '.,' or
192     $path eq '.'
193     ) {
194 wakaba 1.7 $onerror->(type => 'dot-segment', level => $ShouldLevel);
195 wakaba 1.2 ## should
196 wakaba 1.1 }
197     }
198    
199 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
200     my $authority = $uri_o->uri_authority;
201     if (defined $authority) {
202     if ($path eq '') {
203 wakaba 1.7 $onerror->(type => 'empty path', level => $ShouldLevel);
204 wakaba 1.2 ## should
205     }
206     }
207 wakaba 1.1
208 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
209     ## Scheme dependent default authority should be omitted
210 wakaba 1.1
211 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
212     if (defined $host and $host eq '' and
213     (defined $ui or defined $port)) {
214 wakaba 1.7 $onerror->(type => 'empty host', level => $ShouldLevel);
215 wakaba 1.2 ## should # when empty authority is allowed
216     }
217    
218     ## RFC 3986 7.5.
219     ## should not ... username or password that is intended to be secret
220     ## This is not testable.
221    
222     ## RFC 3987 4.1.
223     ## MUST be in full logical order
224     ## This is not testable.
225    
226     ## RFC 3987 4.1., 6.4.
227     ## URI scheme dependent syntax
228     ## MUST
229     ## TODO
230    
231     ## RFC 3987 4.2.
232     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
233     ## SHOULD NOT use both rtl and ltr characters
234     ## SHOULD start with rtl if using rtl characters
235     ## TODO
236    
237     ## RFC 3987 5.3.2.2.
238     ## SHOULD be NFC
239     ## NFKC may avoid even more problems
240     ## TODO
241    
242     ## RFC 3987 5.3.3.
243     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
244     ## SHOULD be normalized by Nameprep
245     ## TODO
246 wakaba 1.3
247     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
248 wakaba 1.1 } # check_iri_reference
249    
250 wakaba 1.8 sub check_rdf_uri_reference ($$$) {
251     require Message::URI::URIReference;
252     my $dom = Message::DOM::DOMImplementation->new;
253     my $uri_o = $dom->create_uri_reference ($_[1]);
254     my $uri_s = $uri_o->uri_reference;
255    
256     local $Error::Depth = $Error::Depth + 1;
257    
258     if ($uri_s =~ /[\x00-\x1F\x7F-\x9F]/) {
259     $_[2]->(type => 'syntax error:rdfuriref', level => $FactLevel,
260     position => $-[0]);
261     ## TODO: Add error type to the list.
262     }
263    
264     my $ascii_uri_o = $uri_o->get_uri_reference_3986; # same as RDF spec's one
265    
266     unless ($ascii_uri_o->is_uri) { ## TODO: is_uri_2396
267     $_[2]->(type => 'syntax error:uri2396', level => $FactLevel,
268     value => $ascii_uri_o->uri_reference);
269     ## TODO: Add error type to the list.
270     }
271    
272     ## TODO: Check against RFC 2396.
273     #Whatpm::URIChecker->check_iri_reference ($_[1], $_[2]);
274     } # check_rdf_uri_reference
275    
276 wakaba 1.1 1;
277 wakaba 1.8 ## $Date: 2007/11/23 12:01:20 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24