2 |
use strict; |
use strict; |
3 |
|
|
4 |
sub new ($) { |
sub new ($) { |
5 |
return bless {}, shift; |
return bless {urls => []}, shift; |
6 |
} # new |
} # new |
7 |
|
|
8 |
sub id_prefix ($) { '' } |
sub id_prefix ($) { '' } |
13 |
|
|
14 |
sub full_subdocument_index ($) { 0 } |
sub full_subdocument_index ($) { 0 } |
15 |
|
|
16 |
|
sub url ($) { |
17 |
|
my $self = shift; |
18 |
|
if (@{$self->{urls}}) { |
19 |
|
return $self->{urls}->[-1]; |
20 |
|
} else { |
21 |
|
return undef; |
22 |
|
} |
23 |
|
} # url |
24 |
|
|
25 |
|
sub add_url ($$) { |
26 |
|
my ($self, $url) = @_; |
27 |
|
push @{$self->{urls}}, ''.$url; |
28 |
|
} # add_url |
29 |
|
|
30 |
|
sub urls ($) { |
31 |
|
my $self = shift; |
32 |
|
return [@{$self->{urls}}]; |
33 |
|
} # urls |
34 |
|
|
35 |
|
sub get_document ($$$$) { |
36 |
|
my $self = shift->new; |
37 |
|
|
38 |
|
my ($cgi => $result => $out) = @_; |
39 |
|
|
40 |
|
$out->input ($self); |
41 |
|
|
42 |
|
require Encode; |
43 |
|
my $url_s = Encode::decode ('utf-8', $cgi->get_parameter ('uri')); |
44 |
|
my $url_o; |
45 |
|
if (defined $url_s and length $url_s) { |
46 |
|
require Message::DOM::DOMImplementation; |
47 |
|
my $dom = Message::DOM::DOMImplementation->new; |
48 |
|
|
49 |
|
$url_o = $dom->create_uri_reference ($url_s); |
50 |
|
$url_o->uri_fragment (undef); |
51 |
|
|
52 |
|
$self->add_url ($url_o->uri_reference); |
53 |
|
|
54 |
|
my $url_scheme = lc $url_o->uri_scheme; ## TODO: html5_url_scheme |
55 |
|
my $class = { |
56 |
|
http => 'WebHACC::Input::HTTP', |
57 |
|
}->{$url_scheme} || 'WebHACC::Input::UnsupportedURLSchemeError'; |
58 |
|
bless $self, $class; |
59 |
|
} else { |
60 |
|
bless $self, 'WebHACC::Input::Text'; |
61 |
|
} |
62 |
|
|
63 |
|
$self->_get_document ($cgi => $result => $out, $url_o); |
64 |
|
|
65 |
|
return $self unless defined $self->{s}; |
66 |
|
|
67 |
|
if (length $self->{s} > 1000_000) { |
68 |
|
$self->{error_status_text} = 'Entity-body too large'; |
69 |
|
delete $self->{s}; |
70 |
|
bless $self, 'WebHACC::Input::Error'; |
71 |
|
return $self; |
72 |
|
} |
73 |
|
|
74 |
|
require Whatpm::ContentType; |
75 |
|
($self->{official_type}, $self->{media_type}) |
76 |
|
= Whatpm::ContentType->get_sniffed_type |
77 |
|
(get_file_head => sub { |
78 |
|
return substr $self->{s}, 0, shift; |
79 |
|
}, |
80 |
|
http_content_type_byte => $self->{http_content_type_bytes}, |
81 |
|
supported_image_types => {}); |
82 |
|
|
83 |
|
my $input_format = $cgi->get_parameter ('i'); |
84 |
|
if (defined $input_format and length $input_format) { |
85 |
|
$self->{media_type_overridden} |
86 |
|
= (not defined $self->{media_type} or |
87 |
|
$input_format ne $self->{media_type}); |
88 |
|
$self->{media_type} = $input_format; |
89 |
|
} |
90 |
|
if (defined $self->{s} and not defined $self->{media_type}) { |
91 |
|
$self->{media_type} = 'text/html'; |
92 |
|
$self->{media_type_overridden} = 1; |
93 |
|
} |
94 |
|
|
95 |
|
if ($self->{media_type} eq 'text/xml') { |
96 |
|
unless (defined $self->{charset}) { |
97 |
|
$self->{charset} = 'us-ascii'; |
98 |
|
$self->{official_charset} = $self->{charset}; |
99 |
|
} elsif ($self->{charset_overridden} and $self->{charset} eq 'us-ascii') { |
100 |
|
$self->{charset_overridden} = 0; |
101 |
|
} |
102 |
|
} |
103 |
|
|
104 |
|
$self->{inner_html_element} = $cgi->get_parameter ('e'); |
105 |
|
|
106 |
|
return $self; |
107 |
|
} # get_document |
108 |
|
|
109 |
|
sub _get_document ($$$$) { |
110 |
|
die "$0: _get_document of " . ref $_[0]; |
111 |
|
} # _get_document |
112 |
|
|
113 |
sub generate_info_section ($$) { |
sub generate_info_section ($$) { |
114 |
my $self = shift; |
my $self = shift; |
115 |
|
|
119 |
$out->start_section (id => 'document-info', title => 'Information'); |
$out->start_section (id => 'document-info', title => 'Information'); |
120 |
$out->start_tag ('dl'); |
$out->start_tag ('dl'); |
121 |
|
|
122 |
$out->dt ('Request URL'); |
my $urls = $self->urls; |
|
$out->start_tag ('dd'); |
|
|
$out->url ($self->{request_uri}); |
|
123 |
|
|
124 |
$out->dt ('Document URL'); ## TODO: HTML5 "document's address"? |
$out->dt (@$urls == 1 ? 'URL' : 'URLs'); |
125 |
|
my $url = pop @$urls; |
126 |
|
for (@$urls) { |
127 |
|
$out->start_tag ('dd'); |
128 |
|
$out->url ($_); |
129 |
|
} |
130 |
$out->start_tag ('dd'); |
$out->start_tag ('dd'); |
131 |
$out->url ($self->{uri}, id => 'anchor-document-url'); |
$out->url ($url, id => 'anchor-document-url'); |
132 |
$out->script (q[ |
$out->script (q[ |
133 |
document.title = '<' |
document.title = '<' |
134 |
+ document.getElementById ('anchor-document-url').href + '> \\u2014 ' |
+ document.getElementById ('anchor-document-url').href + '> \\u2014 ' |
144 |
$out->start_tag ('dd'); |
$out->start_tag ('dd'); |
145 |
$out->code ($self->{media_type}, class => 'MIME', lang => 'en'); |
$out->code ($self->{media_type}, class => 'MIME', lang => 'en'); |
146 |
if ($self->{media_type_overridden}) { |
if ($self->{media_type_overridden}) { |
147 |
$out->html (' <em>(overridden)</em>'); |
$out->nl_text ('... overridden'); |
148 |
} elsif (defined $self->{official_type}) { |
} elsif (defined $self->{official_type}) { |
149 |
if ($self->{media_type} eq $self->{official_type}) { |
if ($self->{media_type} eq $self->{official_type}) { |
150 |
# |
# |
151 |
} else { |
} else { |
152 |
$out->html (' <em>(sniffed; official type is: '); |
$out->nl_text ('... sniffed, official type is #', |
153 |
$out->code ($self->{official_type}, class => 'MIME', lang => 'en'); |
text => $self->{official_type}); |
|
$out->html (')</em>'); |
|
154 |
} |
} |
155 |
} else { |
} else { |
156 |
$out->html ( '<em>(sniffed)</em>'); |
$out->nl_text ( '... sniffed'); |
157 |
} |
} |
158 |
|
|
159 |
$out->dt ('Character Encoding'); |
$out->dt ('Character Encoding'); |
161 |
if (defined $self->{charset}) { |
if (defined $self->{charset}) { |
162 |
$out->code ($self->{charset}, class => 'charset', lang => 'en'); |
$out->code ($self->{charset}, class => 'charset', lang => 'en'); |
163 |
} else { |
} else { |
164 |
$out->text ('(none)'); |
$out->nl_text ('(unknown)'); |
165 |
} |
} |
166 |
$out->html (' <em>overridden</em>') if $self->{charset_overridden}; |
$out->nl_text ('... overridden') if $self->{charset_overridden}; |
167 |
|
|
168 |
$out->dt ($self->{is_char_string} ? 'Character Length' : 'Byte Length'); |
$out->dt ($self->{is_char_string} ? 'Character Length' : 'Byte Length'); |
169 |
## TODO: formatting |
## TODO: formatting |
170 |
$out->start_tag ('dd'); |
$out->start_tag ('dd'); |
171 |
my $length = length $self->{s}; |
my $length = length $self->{s}; |
172 |
$out->text ($length . ($self->{is_char_string} ? ' character' : ' byte') . |
$out->text ($length . ' '); |
173 |
($length == 1 ? '' : 's')); |
$out->nl_text (($self->{is_char_string} ? 'character' : 'byte') . |
174 |
|
($length == 1 ? '' : 's')); |
175 |
} |
} |
176 |
|
|
177 |
$out->end_tag ('dl'); |
$out->end_tag ('dl'); |
226 |
$out->end_section; |
$out->end_section; |
227 |
} # generate_http_header_section |
} # generate_http_header_section |
228 |
|
|
229 |
|
package WebHACC::Input::HTTP; |
230 |
|
push our @ISA, 'WebHACC::Input'; |
231 |
|
|
232 |
|
{ |
233 |
|
my $HostPermit; |
234 |
|
sub host_permit ($) { |
235 |
|
return $HostPermit if $HostPermit; |
236 |
|
|
237 |
|
require Message::Util::HostPermit; |
238 |
|
$HostPermit = new Message::Util::HostPermit; |
239 |
|
$HostPermit->add_rule (<<'EOH'); |
240 |
|
Allow host=suika port=80 |
241 |
|
Deny host=suika |
242 |
|
Allow host=suika.fam.cx port=80 |
243 |
|
Deny host=suika.fam.cx |
244 |
|
Deny host=localhost |
245 |
|
Deny host=*.localdomain |
246 |
|
Deny ipv4=0.0.0.0/8 |
247 |
|
Deny ipv4=10.0.0.0/8 |
248 |
|
Deny ipv4=127.0.0.0/8 |
249 |
|
Deny ipv4=169.254.0.0/16 |
250 |
|
Deny ipv4=172.0.0.0/11 |
251 |
|
Deny ipv4=192.0.2.0/24 |
252 |
|
Deny ipv4=192.88.99.0/24 |
253 |
|
Deny ipv4=192.168.0.0/16 |
254 |
|
Deny ipv4=198.18.0.0/15 |
255 |
|
Deny ipv4=224.0.0.0/4 |
256 |
|
Deny ipv4=255.255.255.255/32 |
257 |
|
Deny ipv6=0::0/0 |
258 |
|
Allow host=* |
259 |
|
EOH |
260 |
|
return $HostPermit; |
261 |
|
} # host_permit |
262 |
|
} |
263 |
|
|
264 |
|
sub _get_document ($$$$$) { |
265 |
|
my ($self, $cgi => $result => $out, $url_o) = @_; |
266 |
|
|
267 |
|
unless ($self->host_permit->check ($url_o->uri_host, $url_o->uri_port || 80)) { |
268 |
|
$self->{error_status_text} = 'Connection to the host is forbidden'; |
269 |
|
return $self; |
270 |
|
} |
271 |
|
|
272 |
|
my $ua = WDCC::LWPUA->new; |
273 |
|
$ua->{wdcc_dom} = Message::DOM::DOMImplementation->new; |
274 |
|
$ua->{wdcc_host_permit} = $self->host_permit; |
275 |
|
$ua->agent ('Mozilla'); ## TODO: for now. |
276 |
|
$ua->parse_head (0); |
277 |
|
$ua->protocols_allowed ([qw/http/]); |
278 |
|
$ua->max_size (1000_000); |
279 |
|
my $req = HTTP::Request->new (GET => $url_o->uri_reference); |
280 |
|
$req->header ('Accept-Encoding' => 'identity, *; q=0'); |
281 |
|
my $res = $ua->request ($req); |
282 |
|
## TODO: 401 sets |is_success| true. |
283 |
|
## TODO: Don't follow redirect if error-page=true |
284 |
|
if ($res->is_success or $cgi->get_parameter ('error-page')) { |
285 |
|
$self->{base_uri} = $res->base; ## NOTE: It does check |Content-Base|, |Content-Location|, and <base>. ## TODO: Use our own code! |
286 |
|
my $new_url = $res->request->uri; |
287 |
|
$self->add_url ($new_url) if $new_url ne $self->url; |
288 |
|
|
289 |
|
## TODO: More strict parsing... |
290 |
|
my $ct = $self->{http_content_type_bytes} = $res->header ('Content-Type'); |
291 |
|
if (defined $ct and $ct =~ /;\s*charset\s*=\s*"?([^\s;"]+)"?/i) { |
292 |
|
$self->{charset} = lc $1; |
293 |
|
$self->{charset} =~ tr/\\//d; |
294 |
|
$self->{official_charset} = $self->{charset}; |
295 |
|
} |
296 |
|
|
297 |
|
my $input_charset = $cgi->get_parameter ('charset'); |
298 |
|
if (defined $input_charset and length $input_charset) { |
299 |
|
$self->{charset_overridden} |
300 |
|
= (not defined $self->{charset} or $self->{charset} ne $input_charset); |
301 |
|
$self->{charset} = $input_charset; |
302 |
|
} |
303 |
|
|
304 |
|
## TODO: Support for HTTP Content-Encoding |
305 |
|
|
306 |
|
$self->{s} = ''.$res->content; |
307 |
|
} else { |
308 |
|
$self->add_url ($res->request->uri); |
309 |
|
$self->{error_status_text} = $res->status_line; |
310 |
|
bless $self, 'WebHACC::Input::HTTPError'; |
311 |
|
} |
312 |
|
|
313 |
|
$self->{header_field} = []; |
314 |
|
$res->scan (sub { |
315 |
|
push @{$self->{header_field}}, [$_[0], $_[1]]; |
316 |
|
}); |
317 |
|
$self->{header_status_code} = $res->code; |
318 |
|
$self->{header_status_text} = $res->message; |
319 |
|
|
320 |
|
return $self; |
321 |
|
} # _get_document |
322 |
|
|
323 |
|
package WebHACC::Input::Text; |
324 |
|
push our @ISA, 'WebHACC::Input'; |
325 |
|
|
326 |
|
sub _get_document ($$$$) { |
327 |
|
my ($self, $cgi => $result => $out) = @_; |
328 |
|
|
329 |
|
$self->add_url (q<thismessage:/>); |
330 |
|
$self->{base_uri} = q<thismessage:/>; |
331 |
|
|
332 |
|
$self->{s} = ''.$cgi->get_parameter ('s'); |
333 |
|
$self->{charset} = ''.$cgi->get_parameter ('_charset_'); |
334 |
|
$self->{charset} =~ s/\s+//g; |
335 |
|
$self->{charset} = 'utf-8' if $self->{charset} eq ''; |
336 |
|
$self->{official_charset} = $self->{charset}; |
337 |
|
$self->{header_field} = []; |
338 |
|
|
339 |
|
return $self; |
340 |
|
} # _get_document |
341 |
|
|
342 |
package WebHACC::Input::Subdocument; |
package WebHACC::Input::Subdocument; |
343 |
push our @ISA, 'WebHACC::Input'; |
push our @ISA, 'WebHACC::Input'; |
344 |
|
|
349 |
} # new |
} # new |
350 |
|
|
351 |
sub id_prefix ($) { |
sub id_prefix ($) { |
352 |
return 'subdoc-' . shift->{subdocument_index} . '-'; |
return 'subdoc-' . shift->full_subdocument_index . '-'; |
353 |
} # id_prefix |
} # id_prefix |
354 |
|
|
355 |
sub nested ($) { 1 } |
sub nested ($) { 1 } |
376 |
|
|
377 |
my $index = $self->full_subdocument_index; |
my $index = $self->full_subdocument_index; |
378 |
$out->start_section (id => $self->id_prefix, |
$out->start_section (id => $self->id_prefix, |
379 |
title => qq[Subdocument #] . $index, |
title => qq[Subdocument #], |
380 |
short_title => 'Sub #' . $index); |
short_title => 'Sub #', |
381 |
|
text => $index); |
382 |
} # start_section |
} # start_section |
383 |
|
|
384 |
sub end_section ($$) { |
sub end_section ($$) { |
439 |
$out->end_section; |
$out->end_section; |
440 |
} # generate_transfer_sections |
} # generate_transfer_sections |
441 |
|
|
442 |
|
package WebHACC::Input::HTTPError; |
443 |
|
push our @ISA, 'WebHACC::Input::Error', 'WebHACC::Input::HTTP'; |
444 |
|
|
445 |
|
package WebHACC::Input::UnsupportedURLSchemeError; |
446 |
|
push our @ISA, 'WebHACC::Input::Error'; |
447 |
|
|
448 |
|
sub _get_document ($$$$) { |
449 |
|
my ($self, $cgi => $result => $out) = @_; |
450 |
|
|
451 |
|
$self->{error_status_text} = 'URL scheme not allowed'; |
452 |
|
|
453 |
|
return $self; |
454 |
|
} # _get_document |
455 |
|
|
456 |
|
package WDCC::LWPUA; |
457 |
|
require LWP::UserAgent; |
458 |
|
push our @ISA, 'LWP::UserAgent'; |
459 |
|
|
460 |
|
sub redirect_ok { |
461 |
|
my $ua = shift; |
462 |
|
unless ($ua->SUPER::redirect_ok (@_)) { |
463 |
|
return 0; |
464 |
|
} |
465 |
|
|
466 |
|
my $uris = $_[1]->header ('Location'); |
467 |
|
return 0 unless $uris; |
468 |
|
my $uri = $ua->{wdcc_dom}->create_uri_reference ($uris); |
469 |
|
unless ({ |
470 |
|
http => 1, |
471 |
|
}->{lc $uri->uri_scheme}) { ## TODO: html5_url_scheme |
472 |
|
return 0; |
473 |
|
} |
474 |
|
unless ($ua->{wdcc_host_permit}->check ($uri->uri_host, $uri->uri_port || 80)) { |
475 |
|
return 0; |
476 |
|
} |
477 |
|
return 1; |
478 |
|
} # redirect_ok |
479 |
|
|
480 |
1; |
1; |