/[suikacvs]/markup/html/whatpm/Whatpm/URIChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/URIChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Fri May 25 14:16:29 2007 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +161 -63 lines
++ whatpm/t/ChangeLog	25 May 2007 14:16:22 -0000
	* URIChecker.t: New test entries.

2007-05-25  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	25 May 2007 14:16:00 -0000
	* URIChecker.pm: All recommendations from RFC 3986
	and RFC 3987 are listed (not all testable items are checked yet).

2007-05-25  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::URIChecker;
2     use strict;
3    
4 wakaba 1.2 require Encode;
5    
6     our $DefaultPort = {
7     http => 80,
8     };
9    
10 wakaba 1.1 sub check_iri_reference ($$$) {
11     my $onerror = $_[2];
12    
13     require Message::URI::URIReference; ## From manakai
14     my $dom = 'Message::DOM::DOMImplementation'; ## ISSUE: This is not a formal way to instantiate it.
15    
16     my $uri_o = $dom->create_uri_reference ($_[1]);
17     my $uri_s = $uri_o->uri_reference;
18    
19     local $Error::Depth = $Error::Depth + 1;
20    
21 wakaba 1.2 ## RFC 3987 4.1.
22 wakaba 1.1 unless ($uri_o->is_iri_reference_3987) {
23     $onerror->(type => 'syntax error', level => 'm');
24 wakaba 1.2 ## MUST
25 wakaba 1.1 }
26    
27 wakaba 1.2 ## RFC 3986 2.1., 6.2.2.1., RFC 3987 5.3.2.1.
28     pos ($uri_s) = 0;
29 wakaba 1.1 while ($uri_s =~ /%([a-f][0-9A-Fa-f]|[0-9A-F][a-f])/g) {
30     $onerror->(type => 'lowercase hexadecimal digit',
31     position => $-[0] + 1, level => 's');
32     ## shoult not
33     }
34    
35     ## RFC 3986 2.2.
36     ## URI producing applications should percent-encode ... reserved ...
37     ## unless ... allowed by the URI scheme .... --- This is not testable.
38    
39 wakaba 1.2 ## RFC 3986 2.3., 6.2.2.2., RFC 3987 5.3.2.3.
40     pos ($uri_s) = 0;
41 wakaba 1.1 while ($uri_s =~ /%(2[DdEe]|4[1-9A-Fa-f]|5[AaFf]|6[1-9A-Fa-f]|7[AaEe])/g) {
42     $onerror->(type => 'percent-encoded unreserved',
43     position => $-[0] + 1, level => 's');
44 wakaba 1.2 ## should
45     ## should
46 wakaba 1.1 }
47    
48     ## RFC 3986 2.4.
49     ## ... "%" ... must be percent-encoded as "%25" ...
50     ## --- Either syntax error or undetectable if followed by two hexadecimals
51    
52 wakaba 1.2 ## RFC 3986 3.1., 6.2.2.1., RFC 3987 5.3.2.1.
53 wakaba 1.1 my $scheme = $uri_o->uri_scheme;
54 wakaba 1.2 my $scheme_canon;
55 wakaba 1.1 if (defined $scheme) {
56 wakaba 1.2 $scheme_canon = Encode::decode ('utf8', $scheme);
57     $scheme_canon =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
58     if ($scheme_canon =~ tr/A-Z/a-z/) {
59     $onerror->(type => 'uppercase scheme name', level => 's');
60     ## should
61 wakaba 1.1 }
62     }
63    
64     ## Note that nothing prevent a conforming URI (if there is one)
65     ## using an unregistered URI scheme...
66    
67 wakaba 1.2 ## RFC 3986 3.2.1., 7.5.
68 wakaba 1.1 my $ui = $uri_o->uri_userinfo;
69     if (defined $ui and $ui =~ /:/) {
70     $onerror->(type => 'password', level => 's');
71     ## deprecated
72     }
73    
74 wakaba 1.2 ## RFC 3986 3.2.2., 6.2.2.1., RFC 3987 5.3.2.1.
75 wakaba 1.1 my $host = $uri_o->uri_host;
76     if (defined $host) {
77     if ($host =~ /^\[([vV][0-9A-Fa-f]+)\./) {
78     $onerror->(type => 'address format not supported:'.$1,
79     level => 'w');
80 wakaba 1.2 }
81     my $hostnp = $host;
82     $hostnp =~ s/%([0-9A-Fa-f][0-9A-Fa-f])//g;
83     if ($hostnp =~ /[A-Z]/) {
84     $onerror->(type => 'uppercase host',
85     level => 's');
86     ## should
87     }
88    
89     if ($host =~ /^\[/) {
90     #
91 wakaba 1.1 } else {
92 wakaba 1.2 $host = Encode::encode ('utf8', $host);
93     $host =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack 'C', hex $1/ge;
94    
95     if ($host eq '') {
96     ## NOTE: Although not explicitly mentioned, an empty host
97     ## should be considered as an exception for the recommendation
98     ## that a host "should" be a DNS name.
99     } elsif ($host !~ /\A(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)(?>\.(?>[A-Za-z0-9](?>[A-Za-z0-9-]{0,61}[A-Za-z0-9])?))*\.?\z/) {
100     $onerror->(type => 'non-DNS host', level => 's');
101     ## should
102     ## should be IDNA encoding if wish to maximize interoperability
103     } elsif (length $host > 255) {
104     ## NOTE: This length might be incorrect if there were percent-encoded
105     ## UTF-8 bytes; however, the above condition catches all non-ASCII.
106     $onerror->(type => 'long host', level => 's');
107     ## should
108 wakaba 1.1 }
109    
110 wakaba 1.2 ## FQDN should be followed by "." if necessary --- untestable
111 wakaba 1.1
112 wakaba 1.2 ## must be UTF-8
113     unless ($host =~ /\A(?>
114     [\x00-\x7F] |
115     [\xC2-\xDF][\x80-\xBF] | # UTF8-2
116     [\xE0][\xA0-\xBF][\x80-\xBF] |
117     [\xE1-\xEC][\x80-\xBF][\x80-\xBF] |
118     [\xED][\x80-\x9F][\x80-\xBF] |
119     [\xEE\xEF][\x80-\xBF][\x80-\xBF] | # UTF8-3
120     [\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF] |
121     [\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF] |
122     [\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF] # UTF8-4
123     )*\z/x) {
124     $onerror->(type => 'non UTF-8 host', level => 'm');
125     # must
126     }
127     }
128     }
129 wakaba 1.1
130 wakaba 1.2 ## RFC 3986 3.2., 3.2.3., 6.2.3., RFC 3987 5.3.3.
131     my $port = $uri_o->uri_port;
132     if (defined $port) {
133     if ($port =~ /\A([0-9]+)\z/) {
134     if ($DefaultPort->{$scheme_canon} == $1) {
135     $onerror->(type => 'default port', level => 's');
136     ## should
137     }
138     } elsif ($port eq '') {
139     $onerror->(type => 'empty port', level => 's');
140     ## should
141     }
142     }
143 wakaba 1.1
144 wakaba 1.2 ## RFC 3986 3.4.
145     ## ... says that "/" or "?" in query might be problematic for
146     ## old implementations, but also suggest that for readability percent-encoding
147     ## might not be good idea. It provides no recommendation on this issue.
148     ## Therefore, we do no check for this matter.
149    
150     ## RFC 3986 3.5.
151     ## ... says again that "/" or "?" in fragment might be problematic,
152     ## without any recommendation.
153     ## We again left this unchecked.
154    
155     ## RFC 3986 4.4.
156     ## Authors should not assume ... different, though equivalent,
157     ## URI will (or will not) be interpreted as a same-document reference ...
158     ## This is not testable.
159    
160     ## RFC 3986 5.4.2.
161     ## "scheme:relative" should be avoided
162     ## This is not testable without scheme specific information.
163 wakaba 1.1
164 wakaba 1.2 ## RFC 3986 6.2.2.3., RFC 3987 5.3.2.4.
165     my $path = $uri_o->uri_path;
166     if (defined $scheme) {
167     if (
168     $path =~ m!/\.\.! or
169     $path =~ m!/\./! or
170     $path =~ m!/\.\.\z! or
171     $path =~ m!/\.\z! or
172     $path =~ m!\A\.\./! or
173     $path =~ m!\A\./! or
174     $path eq '.,' or
175     $path eq '.'
176     ) {
177     $onerror->(type => 'dot-segment', level => 's');
178     ## should
179 wakaba 1.1 }
180     }
181    
182 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
183     my $authority = $uri_o->uri_authority;
184     if (defined $authority) {
185     if ($path eq '') {
186     $onerror->(type => 'empty path', level => 's');
187     ## should
188     }
189     }
190 wakaba 1.1
191 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
192     ## Scheme dependent default authority should be omitted
193 wakaba 1.1
194 wakaba 1.2 ## RFC 3986 6.2.3., RFC 3987 5.3.3.
195     if (defined $host and $host eq '' and
196     (defined $ui or defined $port)) {
197     $onerror->(type => 'empty host', level => 's');
198     ## should # when empty authority is allowed
199     }
200    
201     ## RFC 3986 7.5.
202     ## should not ... username or password that is intended to be secret
203     ## This is not testable.
204    
205     ## RFC 3987 4.1.
206     ## MUST be in full logical order
207     ## This is not testable.
208    
209     ## RFC 3987 4.1., 6.4.
210     ## URI scheme dependent syntax
211     ## MUST
212     ## TODO
213    
214     ## RFC 3987 4.2.
215     ## iuserinfo, ireg-name, isegment, isegment-nz, isegment-nz-nc, iquery, ifragment
216     ## SHOULD NOT use both rtl and ltr characters
217     ## SHOULD start with rtl if using rtl characters
218     ## TODO
219    
220     ## RFC 3987 5.3.2.2.
221     ## SHOULD be NFC
222     ## NFKC may avoid even more problems
223     ## TODO
224    
225     ## RFC 3987 5.3.3.
226     ## IDN (ireg-name or elsewhere) SHOULD be validated by ToASCII(UseSTD3ASCIIRules, AllowUnassigned)
227     ## SHOULD be normalized by Nameprep
228     ## TODO
229 wakaba 1.1 } # check_iri_reference
230    
231     1;
232 wakaba 1.2 ## $Date: 2007/05/25 12:13:55 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24