/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (hide annotations) (download)
Fri Nov 23 12:01:20 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +19 -17 lines
++ whatpm/Whatpm/ChangeLog	23 Nov 2007 12:01:06 -0000
	* URIChecker.pm: Make RFC 3986 should-level errors
	warnings (rather than SHOULD-level errors).

2007-11-23  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.7 our $MustLevel = 'm'; ## Non-RFC 2119 "must" (or fact)
11     our $ShouldLevel = 'w'; ## Non-RFC 2119 "should"
12    
13 wakaba 1.6 sub check_iri ($$$) {
14     require Message::URI::URIReference;
15     my $dom = Message::DOM::DOMImplementation->new;
16     my $uri_o = $dom->create_uri_reference ($_[1]);
17     my $uri_s = $uri_o->uri_reference;
18    
19     local $Error::Depth = $Error::Depth + 1;
20    
21     unless ($uri_o->is_iri_3987) {
22 wakaba 1.7 $_[2]->(type => 'syntax error:iri3987', level => $MustLevel);
23 wakaba 1.6 ## MUST
24     }
25    
26     Whatpm::URIChecker->check_iri_reference ($_[1], $_[2]);
27     } # check_iri
28    
29 wakaba 1.1 sub check_iri_reference ($$$) {
30     my $onerror = $_[2];
31    
32 wakaba 1.6 require Message::URI::URIReference;
33     my $dom = Message::DOM::DOMImplementation->new;
34 wakaba 1.1 my $uri_o = $dom->create_uri_reference ($_[1]);
35     my $uri_s = $uri_o->uri_reference;
36    
37     local $Error::Depth = $Error::Depth + 1;
38    
39 wakaba 1.2 ## RFC 3987 4.1.
40 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
41 wakaba 1.7 $onerror->(type => 'syntax error:iriref3987', level => $MustLevel);
42 wakaba 1.2 ## MUST
43 wakaba 1.1 }
44    
45 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
46     pos ($uri_s) = 0;
47 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
48     $onerror->(type => 'lowercase hexadecimal digit',
49 wakaba 1.7 position => $-[0] + 1, level => $ShouldLevel);
50 wakaba 1.1 ## shoult not
51     }
52    
53     ## RFC 3986 2.2.
54     ## URI producing applications should percent-encode ... reserved ...
55     ## unless ... allowed by the URI scheme .... --- This is not testable.
56    
57 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
58     pos ($uri_s) = 0;
59 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
60     $onerror->(type => 'percent-encoded unreserved',
61 wakaba 1.7 position => $-[0] + 1, level => $ShouldLevel);
62 wakaba 1.2 ## should
63     ## should
64 wakaba 1.1 }
65    
66     ## RFC 3986 2.4.
67     ## ... "%" ... must be percent-encoded as "%25" ...
68     ## --- Either syntax error or undetectable if followed by two hexadecimals
69    
70 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
71 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
72 wakaba 1.2 my $scheme_canon;
73 wakaba 1.1 if (defined $scheme) {
74 wakaba 1.4 $scheme_canon = Encode::encode ('utf8', $scheme);
75 wakaba 1.2 $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
76     if ($scheme_canon =~ tr/A-Z/a-z/) {
77 wakaba 1.7 $onerror->(type => 'uppercase scheme name', level => $ShouldLevel);
78 wakaba 1.2 ## should
79 wakaba 1.1 }
80     }
81    
82     ## Note that nothing prevent a conforming URI (if there is one)
83     ## using an unregistered URI scheme...
84    
85 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
86 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
87     if (defined $ui and $ui =~ /:/) {
88 wakaba 1.7 $onerror->(type => 'password', level => $ShouldLevel);
89 wakaba 1.1 ## deprecated
90     }
91    
92 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
93 wakaba 1.1 my $host = $uri_o->uri_host;
94     if (defined $host) {
95     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
96 wakaba 1.5 $onerror->(type => 'address format:'.$1, level => 'unsupported');
97 wakaba 1.2 }
98     my $hostnp = $host;
99     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
100     if ($hostnp =~ /[A-Z]/) {
101 wakaba 1.7 $onerror->(type => 'uppercase host', level => $ShouldLevel);
102 wakaba 1.2 ## should
103     }
104    
105     if ($host =~ /^\[/) {
106     #
107 wakaba 1.1 } else {
108 wakaba 1.2 $host = Encode::encode ('utf8', $host);
109     $host =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
110    
111     if ($host eq '') {
112     ## NOTE: Although not explicitly mentioned, an empty host
113     ## should be considered as an exception for the recommendation
114     ## that a host "should" be a DNS name.
115     } elsif ($host !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
116 wakaba 1.7 $onerror->(type => 'non-DNS host', level => $ShouldLevel);
117 wakaba 1.2 ## should
118     ## should be IDNA encoding if wish to maximize interoperability
119     } elsif (length $host > 255) {
120     ## NOTE: This length might be incorrect if there were percent-encoded
121     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
122 wakaba 1.7 $onerror->(type => 'long host', level => $ShouldLevel);
123 wakaba 1.2 ## should
124 wakaba 1.1 }
125    
126 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
127 wakaba 1.1
128 wakaba 1.2 ## must be UTF-8
129     unless ($host =~ /\A(?>
130     [\x00-\x7F] |
131     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
132     [\xE0][\xA0-\xBF][\x80-\xBF] |
133     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
134     [\xED][\x80-\x9F][\x80-\xBF] |
135     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
136     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
137     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
138     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
139     )*\z/x) {
140 wakaba 1.7 $onerror->(type => 'non UTF-8 host', level => $MustLevel);
141 wakaba 1.2 # must
142     }
143     }
144     }
145 wakaba 1.1
146 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
147     my $port = $uri_o->uri_port;
148     if (defined $port) {
149     if ($port =~ /\A([0-9]+)\z/) {
150     if ($DefaultPort->{$scheme_canon} == $1) {
151 wakaba 1.7 $onerror->(type => 'default port', level => $ShouldLevel);
152 wakaba 1.2 ## should
153     }
154     } elsif ($port eq '') {
155 wakaba 1.7 $onerror->(type => 'empty port', level => $ShouldLevel);
156 wakaba 1.2 ## should
157     }
158     }
159 wakaba 1.1
160 wakaba 1.2 ## RFC 3986 3.4.
161     ## ... says that "/" or "?" in query might be problematic for
162     ## old implementations, but also suggest that for readability percent-encoding
163     ## might not be good idea. It provides no recommendation on this issue.
164     ## Therefore, we do no check for this matter.
165    
166     ## RFC 3986 3.5.
167     ## ... says again that "/" or "?" in fragment might be problematic,
168     ## without any recommendation.
169     ## We again left this unchecked.
170    
171     ## RFC 3986 4.4.
172     ## Authors should not assume ... different, though equivalent,
173     ## URI will (or will not) be interpreted as a same-document reference ...
174     ## This is not testable.
175    
176     ## RFC 3986 5.4.2.
177     ## "scheme:relative" should be avoided
178     ## This is not testable without scheme specific information.
179 wakaba 1.1
180 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
181     my $path = $uri_o->uri_path;
182     if (defined $scheme) {
183     if (
184     $path =~ m!/\.\.! or
185     $path =~ m!/\./! or
186     $path =~ m!/\.\.\z! or
187     $path =~ m!/\.\z! or
188     $path =~ m!\A\.\./! or
189     $path =~ m!\A\./! or
190     $path eq '.,' or
191     $path eq '.'
192     ) {
193 wakaba 1.7 $onerror->(type => 'dot-segment', level => $ShouldLevel);
194 wakaba 1.2 ## should
195 wakaba 1.1 }
196     }
197    
198 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
199     my $authority = $uri_o->uri_authority;
200     if (defined $authority) {
201     if ($path eq '') {
202 wakaba 1.7 $onerror->(type => 'empty path', level => $ShouldLevel);
203 wakaba 1.2 ## should
204     }
205     }
206 wakaba 1.1
207 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
208     ## Scheme dependent default authority should be omitted
209 wakaba 1.1
210 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
211     if (defined $host and $host eq '' and
212     (defined $ui or defined $port)) {
213 wakaba 1.7 $onerror->(type => 'empty host', level => $ShouldLevel);
214 wakaba 1.2 ## should # when empty authority is allowed
215     }
216    
217     ## RFC 3986 7.5.
218     ## should not ... username or password that is intended to be secret
219     ## This is not testable.
220    
221     ## RFC 3987 4.1.
222     ## MUST be in full logical order
223     ## This is not testable.
224    
225     ## RFC 3987 4.1., 6.4.
226     ## URI scheme dependent syntax
227     ## MUST
228     ## TODO
229    
230     ## RFC 3987 4.2.
231     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
232     ## SHOULD NOT use both rtl and ltr characters
233     ## SHOULD start with rtl if using rtl characters
234     ## TODO
235    
236     ## RFC 3987 5.3.2.2.
237     ## SHOULD be NFC
238     ## NFKC may avoid even more problems
239     ## TODO
240    
241     ## RFC 3987 5.3.3.
242     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
243     ## SHOULD be normalized by Nameprep
244     ## TODO
245 wakaba 1.3
246     ## TODO: If it is a relative reference, then resolve and then check against scheme dependent requirements
247 wakaba 1.1 } # check_iri_reference
248    
249     1;
250 wakaba 1.7 ## $Date: 2007/08/05 07:12:45 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24