2 |
use strict; |
use strict; |
3 |
|
|
4 |
sub new ($) { |
sub new ($) { |
5 |
return bless {id_prefix => ''}, shift; |
return bless {urls => []}, shift; |
6 |
} # new |
} # new |
7 |
|
|
8 |
sub id_prefix ($;$) { |
sub id_prefix ($) { '' } |
9 |
if (@_ > 1) { |
|
10 |
if (defined $_[1]) { |
sub nested ($) { 0 } |
11 |
$_[0]->{id_prefix} = ''.$_[1]; |
|
12 |
} else { |
sub subdocument_index ($) { 0 } |
13 |
$_[0]->{id_prefix} = ''; |
|
14 |
|
sub full_subdocument_index ($) { 0 } |
15 |
|
|
16 |
|
sub url ($) { |
17 |
|
my $self = shift; |
18 |
|
if (@{$self->{urls}}) { |
19 |
|
return $self->{urls}->[-1]; |
20 |
|
} else { |
21 |
|
return undef; |
22 |
|
} |
23 |
|
} # url |
24 |
|
|
25 |
|
sub add_url ($$) { |
26 |
|
my ($self, $url) = @_; |
27 |
|
push @{$self->{urls}}, ''.$url; |
28 |
|
} # add_url |
29 |
|
|
30 |
|
sub urls ($) { |
31 |
|
my $self = shift; |
32 |
|
return [@{$self->{urls}}]; |
33 |
|
} # urls |
34 |
|
|
35 |
|
sub get_document ($$$$) { |
36 |
|
my $self = shift->new; |
37 |
|
|
38 |
|
my ($cgi => $result => $out) = @_; |
39 |
|
|
40 |
|
$out->input ($self); |
41 |
|
|
42 |
|
require Encode; |
43 |
|
my $url_s = Encode::decode ('utf-8', $cgi->get_parameter ('uri')); |
44 |
|
my $url_o; |
45 |
|
if (defined $url_s and length $url_s) { |
46 |
|
require Message::DOM::DOMImplementation; |
47 |
|
my $dom = Message::DOM::DOMImplementation->new; |
48 |
|
|
49 |
|
$url_o = $dom->create_uri_reference ($url_s); |
50 |
|
$url_o->uri_fragment (undef); |
51 |
|
|
52 |
|
$self->add_url ($url_o->uri_reference); |
53 |
|
|
54 |
|
my $url_scheme = lc $url_o->uri_scheme; ## TODO: html5_url_scheme |
55 |
|
my $class = { |
56 |
|
http => 'WebHACC::Input::HTTP', |
57 |
|
}->{$url_scheme} || 'WebHACC::Input::UnsupportedURLSchemeError'; |
58 |
|
bless $self, $class; |
59 |
|
} else { |
60 |
|
bless $self, 'WebHACC::Input::Text'; |
61 |
|
} |
62 |
|
|
63 |
|
$self->_get_document ($cgi => $result => $out, $url_o); |
64 |
|
|
65 |
|
return $self unless defined $self->{s}; |
66 |
|
|
67 |
|
if (length $self->{s} > 1000_000) { |
68 |
|
$self->{error_status_text} = 'Entity-body too large'; |
69 |
|
delete $self->{s}; |
70 |
|
bless $self, 'WebHACC::Input::Error'; |
71 |
|
return $self; |
72 |
|
} |
73 |
|
|
74 |
|
require Whatpm::ContentType; |
75 |
|
($self->{official_type}, $self->{media_type}) |
76 |
|
= Whatpm::ContentType->get_sniffed_type |
77 |
|
(get_file_head => sub { |
78 |
|
return substr $self->{s}, 0, shift; |
79 |
|
}, |
80 |
|
http_content_type_byte => $self->{http_content_type_bytes}, |
81 |
|
supported_image_types => {}); |
82 |
|
|
83 |
|
my $input_format = $cgi->get_parameter ('i'); |
84 |
|
if (defined $input_format and length $input_format) { |
85 |
|
$self->{media_type_overridden} |
86 |
|
= (not defined $self->{media_type} or |
87 |
|
$input_format ne $self->{media_type}); |
88 |
|
$self->{media_type} = $input_format; |
89 |
|
} |
90 |
|
if (defined $self->{s} and not defined $self->{media_type}) { |
91 |
|
$self->{media_type} = 'text/html'; |
92 |
|
$self->{media_type_overridden} = 1; |
93 |
|
} |
94 |
|
|
95 |
|
if ($self->{media_type} eq 'text/xml') { |
96 |
|
unless (defined $self->{charset}) { |
97 |
|
$self->{charset} = 'us-ascii'; |
98 |
|
$self->{official_charset} = $self->{charset}; |
99 |
|
} elsif ($self->{charset_overridden} and $self->{charset} eq 'us-ascii') { |
100 |
|
$self->{charset_overridden} = 0; |
101 |
} |
} |
102 |
} |
} |
103 |
|
|
104 |
return $_[0]->{id_prefix}; |
$self->{inner_html_element} = $cgi->get_parameter ('e'); |
105 |
} # id_prefix |
|
106 |
|
return $self; |
107 |
|
} # get_document |
108 |
|
|
109 |
|
sub _get_document ($$$$) { |
110 |
|
die "$0: _get_document of " . ref $_[0]; |
111 |
|
} # _get_document |
112 |
|
|
113 |
|
sub generate_info_section ($$) { |
114 |
|
my $self = shift; |
115 |
|
|
116 |
|
my $result = shift; |
117 |
|
my $out = $result->output; |
118 |
|
|
119 |
|
$out->start_section (id => 'document-info', title => 'Information'); |
120 |
|
$out->start_tag ('dl'); |
121 |
|
|
122 |
|
my $urls = $self->urls; |
123 |
|
|
124 |
|
$out->dt (@$urls == 1 ? 'URL' : 'URLs'); |
125 |
|
my $url = pop @$urls; |
126 |
|
for (@$urls) { |
127 |
|
$out->start_tag ('dd'); |
128 |
|
$out->url ($_); |
129 |
|
} |
130 |
|
$out->start_tag ('dd'); |
131 |
|
$out->url ($url, id => 'anchor-document-url'); |
132 |
|
$out->script (q[ |
133 |
|
document.title = '<' |
134 |
|
+ document.getElementById ('anchor-document-url').href + '> \\u2014 ' |
135 |
|
+ document.title; |
136 |
|
]); |
137 |
|
|
138 |
|
if (defined $self->{s}) { |
139 |
|
$out->dt ('Base URL'); |
140 |
|
$out->start_tag ('dd'); |
141 |
|
$out->url ($self->{base_uri}); |
142 |
|
|
143 |
|
$out->dt ('Internet Media Type'); |
144 |
|
$out->start_tag ('dd'); |
145 |
|
$out->code ($self->{media_type}, class => 'MIME', lang => 'en'); |
146 |
|
if ($self->{media_type_overridden}) { |
147 |
|
$out->nl_text ('... overridden'); |
148 |
|
} elsif (defined $self->{official_type}) { |
149 |
|
if ($self->{media_type} eq $self->{official_type}) { |
150 |
|
# |
151 |
|
} else { |
152 |
|
$out->nl_text ('... sniffed, official type is #', |
153 |
|
text => $self->{official_type}); |
154 |
|
} |
155 |
|
} else { |
156 |
|
$out->nl_text ( '... sniffed'); |
157 |
|
} |
158 |
|
|
159 |
sub nested ($;$) { |
$out->dt ('Character Encoding'); |
160 |
if (@_ > 1) { |
$out->start_tag ('dd'); |
161 |
if ($_[1]) { |
if (defined $self->{charset}) { |
162 |
$_[0]->{nested} = 1; |
$out->code ($self->{charset}, class => 'charset', lang => 'en'); |
163 |
} else { |
} else { |
164 |
delete $_[0]->{nested}; |
$out->nl_text ('(unknown)'); |
165 |
} |
} |
166 |
|
$out->nl_text ('... overridden') if $self->{charset_overridden}; |
167 |
|
|
168 |
|
$out->dt ($self->{is_char_string} ? 'Character Length' : 'Byte Length'); |
169 |
|
## TODO: formatting |
170 |
|
$out->start_tag ('dd'); |
171 |
|
my $length = length $self->{s}; |
172 |
|
$out->text ($length . ' '); |
173 |
|
$out->nl_text (($self->{is_char_string} ? 'character' : 'byte') . |
174 |
|
($length == 1 ? '' : 's')); |
175 |
} |
} |
176 |
|
|
177 |
return $_[0]->{nested}; |
$out->end_tag ('dl'); |
178 |
} # nested |
$out->end_section; |
179 |
|
} # generate_info_section |
180 |
|
|
181 |
|
sub generate_transfer_sections ($$) { } |
182 |
|
|
183 |
|
package WebHACC::Input::HTTP; |
184 |
|
push our @ISA, 'WebHACC::Input'; |
185 |
|
|
186 |
|
{ |
187 |
|
my $HostPermit; |
188 |
|
sub host_permit ($) { |
189 |
|
return $HostPermit if $HostPermit; |
190 |
|
|
191 |
|
require Message::Util::HostPermit; |
192 |
|
$HostPermit = new Message::Util::HostPermit; |
193 |
|
$HostPermit->add_rule (<<'EOH'); |
194 |
|
Allow host=suika port=80 |
195 |
|
Deny host=suika |
196 |
|
Allow host=suika.fam.cx port=80 |
197 |
|
Deny host=suika.fam.cx |
198 |
|
Deny host=localhost |
199 |
|
Deny host=*.localdomain |
200 |
|
Deny ipv4=0.0.0.0/8 |
201 |
|
Deny ipv4=10.0.0.0/8 |
202 |
|
Deny ipv4=127.0.0.0/8 |
203 |
|
Deny ipv4=169.254.0.0/16 |
204 |
|
Deny ipv4=172.0.0.0/11 |
205 |
|
Deny ipv4=192.0.2.0/24 |
206 |
|
Deny ipv4=192.88.99.0/24 |
207 |
|
Deny ipv4=192.168.0.0/16 |
208 |
|
Deny ipv4=198.18.0.0/15 |
209 |
|
Deny ipv4=224.0.0.0/4 |
210 |
|
Deny ipv4=255.255.255.255/32 |
211 |
|
Deny ipv6=0::0/0 |
212 |
|
Allow host=* |
213 |
|
EOH |
214 |
|
return $HostPermit; |
215 |
|
} # host_permit |
216 |
|
} |
217 |
|
|
218 |
|
sub _get_document ($$$$$) { |
219 |
|
my ($self, $cgi => $result => $out, $url_o) = @_; |
220 |
|
|
221 |
|
unless ($self->host_permit->check ($url_o->uri_host, $url_o->uri_port || 80)) { |
222 |
|
$self->{error_status_text} = 'Connection to the host is forbidden'; |
223 |
|
bless $self, 'WebHACC::Input::Error'; |
224 |
|
return $self; |
225 |
|
} |
226 |
|
|
227 |
|
my $ua = WDCC::LWPUA->new; |
228 |
|
$ua->{wdcc_dom} = Message::DOM::DOMImplementation->new; |
229 |
|
$ua->{wdcc_host_permit} = $self->host_permit; |
230 |
|
$ua->agent ('Mozilla'); ## TODO: for now. |
231 |
|
$ua->parse_head (0); |
232 |
|
$ua->protocols_allowed ([qw/http/]); |
233 |
|
$ua->max_size (1000_000); |
234 |
|
my $req = HTTP::Request->new (GET => $url_o->uri_reference); |
235 |
|
$req->header ('Accept-Encoding' => 'identity, *; q=0'); |
236 |
|
my $res = $ua->request ($req); |
237 |
|
## TODO: 401 sets |is_success| true. |
238 |
|
## TODO: Don't follow redirect if error-page=true |
239 |
|
if ($res->is_success or $cgi->get_parameter ('error-page')) { |
240 |
|
$self->{base_uri} = $res->base; ## NOTE: It does check |Content-Base|, |Content-Location|, and <base>. ## TODO: Use our own code! |
241 |
|
my $new_url = $res->request->uri; |
242 |
|
$self->add_url ($new_url) if $new_url ne $self->url; |
243 |
|
|
244 |
|
## TODO: More strict parsing... |
245 |
|
my $ct = $self->{http_content_type_bytes} = $res->header ('Content-Type'); |
246 |
|
if (defined $ct and $ct =~ /;\s*charset\s*=\s*"?([^\s;"]+)"?/i) { |
247 |
|
$self->{charset} = lc $1; |
248 |
|
$self->{charset} =~ tr/\\//d; |
249 |
|
$self->{official_charset} = $self->{charset}; |
250 |
|
} |
251 |
|
|
252 |
|
my $input_charset = $cgi->get_parameter ('charset'); |
253 |
|
if (defined $input_charset and length $input_charset) { |
254 |
|
$self->{charset_overridden} |
255 |
|
= (not defined $self->{charset} or $self->{charset} ne $input_charset); |
256 |
|
$self->{charset} = $input_charset; |
257 |
|
} |
258 |
|
|
259 |
|
## TODO: Support for HTTP Content-Encoding |
260 |
|
|
261 |
|
$self->{s} = ''.$res->content; |
262 |
|
} else { |
263 |
|
$self->add_url ($res->request->uri); |
264 |
|
$self->{error_status_text} = $res->status_line; |
265 |
|
bless $self, 'WebHACC::Input::HTTPError'; |
266 |
|
} |
267 |
|
|
268 |
|
$self->{header_field} = []; |
269 |
|
$res->scan (sub { |
270 |
|
push @{$self->{header_field}}, [$_[0], $_[1]]; |
271 |
|
}); |
272 |
|
$self->{header_status_code} = $res->code; |
273 |
|
$self->{header_status_text} = $res->message; |
274 |
|
|
275 |
|
return $self; |
276 |
|
} # _get_document |
277 |
|
|
278 |
sub generate_transfer_sections ($$) { |
sub generate_transfer_sections ($$) { |
279 |
my $self = shift; |
my $self = shift; |
280 |
my $result = shift; |
my $result = shift; |
281 |
|
|
282 |
|
$result->layer_uncertain ('transfer'); |
283 |
|
|
284 |
$self->generate_http_header_section ($result); |
$self->generate_http_header_section ($result); |
285 |
} # generate_transfer_sections |
} # generate_transfer_sections |
325 |
$out->end_section; |
$out->end_section; |
326 |
} # generate_http_header_section |
} # generate_http_header_section |
327 |
|
|
328 |
|
package WebHACC::Input::Text; |
329 |
|
push our @ISA, 'WebHACC::Input'; |
330 |
|
|
331 |
|
sub _get_document ($$$$) { |
332 |
|
my ($self, $cgi => $result => $out) = @_; |
333 |
|
|
334 |
|
$self->add_url (q<thismessage:/>); |
335 |
|
$self->{base_uri} = q<thismessage:/>; |
336 |
|
|
337 |
|
$self->{s} = ''.$cgi->get_parameter ('s'); |
338 |
|
$self->{charset} = ''.$cgi->get_parameter ('_charset_'); |
339 |
|
$self->{charset} =~ s/\s+//g; |
340 |
|
$self->{charset} = 'utf-8' if $self->{charset} eq ''; |
341 |
|
$self->{official_charset} = $self->{charset}; |
342 |
|
$self->{header_field} = []; |
343 |
|
|
344 |
|
return $self; |
345 |
|
} # _get_document |
346 |
|
|
347 |
|
package WebHACC::Input::Subdocument; |
348 |
|
push our @ISA, 'WebHACC::Input'; |
349 |
|
|
350 |
|
sub new ($$) { |
351 |
|
my $self = bless {}, shift; |
352 |
|
$self->{subdocument_index} = shift; |
353 |
|
return $self; |
354 |
|
} # new |
355 |
|
|
356 |
|
sub id_prefix ($) { |
357 |
|
my $self = shift; |
358 |
|
return $self->{parent_input}->id_prefix . |
359 |
|
'subdoc-' . $self->{subdocument_index} . '-'; |
360 |
|
} # id_prefix |
361 |
|
|
362 |
|
sub nested ($) { 1 } |
363 |
|
|
364 |
|
sub subdocument_index ($) { |
365 |
|
return shift->{subdocument_index}; |
366 |
|
} # subdocument_index |
367 |
|
|
368 |
|
sub full_subdocument_index ($) { |
369 |
|
my $self = shift; |
370 |
|
my $parent = $self->{parent_input}->full_subdocument_index; |
371 |
|
if ($parent) { |
372 |
|
return $parent . '.' . $self->{subdocument_index}; |
373 |
|
} else { |
374 |
|
return $self->{subdocument_index}; |
375 |
|
} |
376 |
|
} # full_subdocument_index |
377 |
|
|
378 |
|
sub start_section ($$) { |
379 |
|
my $self = shift; |
380 |
|
|
381 |
|
my $result = shift; |
382 |
|
my $out = $result->output; |
383 |
|
|
384 |
|
my $index = $self->subdocument_index; |
385 |
|
$out->start_section (id => my $id = 'subdoc-' . $index . '-', |
386 |
|
title => qq[Subdocument #], |
387 |
|
short_title => 'Sub #', |
388 |
|
role => 'subdoc', |
389 |
|
text => $self->full_subdocument_index); |
390 |
|
$out->script (q[ insertNavSections ('] . $out->input->id_prefix . $id . q[') ]); |
391 |
|
} # start_section |
392 |
|
|
393 |
|
sub end_section ($$) { |
394 |
|
$_[1]->output->end_section; |
395 |
|
} # end_section |
396 |
|
|
397 |
|
sub generate_info_section ($$) { |
398 |
|
my $self = shift; |
399 |
|
|
400 |
|
my $result = shift; |
401 |
|
my $out = $result->output; |
402 |
|
|
403 |
|
$out->start_section (id => 'document-info', title => 'Information'); |
404 |
|
$out->start_tag ('dl'); |
405 |
|
|
406 |
|
$out->dt ('Internet Media Type'); |
407 |
|
$out->start_tag ('dd'); |
408 |
|
$out->code ($self->{media_type}, code => 'MIME', lang => 'en'); |
409 |
|
|
410 |
|
if (defined $self->{container_node}) { |
411 |
|
$out->dt ('Container Node'); |
412 |
|
$out->start_tag ('dd'); |
413 |
|
my $original_input = $out->input; |
414 |
|
$out->input ($self->{parent_input}); |
415 |
|
$out->node_link ($self->{container_node}); |
416 |
|
$out->input ($original_input); |
417 |
|
} |
418 |
|
|
419 |
|
$out->dt ('Base URL'); |
420 |
|
$out->start_tag ('dd'); |
421 |
|
$out->url ($self->{base_uri}); |
422 |
|
|
423 |
|
$out->end_tag ('dl'); |
424 |
|
$out->end_section; |
425 |
|
} # generate_info_section |
426 |
|
|
427 |
package WebHACC::Input::Error; |
package WebHACC::Input::Error; |
428 |
push our @ISA, 'WebHACC::Input'; |
push our @ISA, 'WebHACC::Input'; |
429 |
|
|
430 |
sub generate_transfer_sections ($$) { |
sub generate_transfer_sections ($$) { |
431 |
my $self = shift; |
my $self = shift; |
|
|
|
|
$self->SUPER::generate_transfer_sections (@_); |
|
432 |
|
|
433 |
my $result = shift; |
my $result = shift; |
434 |
my $out = $result->output; |
my $out = $result->output; |
435 |
|
|
436 |
$out->start_section (id => 'transfer-errors', title => 'Transfer Errors'); |
$out->start_section (role => 'transfer-errors'); |
437 |
|
|
438 |
$out->start_tag ('dl'); |
$out->start_tag ('dl'); |
439 |
$result->add_error (layer => 'transfer', |
$result->add_error (layer => 'transfer', |
446 |
$out->end_section; |
$out->end_section; |
447 |
} # generate_transfer_sections |
} # generate_transfer_sections |
448 |
|
|
449 |
|
package WebHACC::Input::HTTPError; |
450 |
|
push our @ISA, 'WebHACC::Input::Error', 'WebHACC::Input::HTTP'; |
451 |
|
|
452 |
|
sub generate_transfer_sections ($$) { |
453 |
|
my $self = shift; |
454 |
|
|
455 |
|
my $result = shift; |
456 |
|
|
457 |
|
$self->WebHACC::Input::Error->generate_transfer_sections ($result); |
458 |
|
$self->WebHACC::Input::HTTP->generate_transfer_sections ($result); |
459 |
|
|
460 |
|
} # generate_transfer_sections |
461 |
|
|
462 |
|
package WebHACC::Input::UnsupportedURLSchemeError; |
463 |
|
push our @ISA, 'WebHACC::Input::Error'; |
464 |
|
|
465 |
|
sub _get_document ($$$$) { |
466 |
|
my ($self, $cgi => $result => $out) = @_; |
467 |
|
|
468 |
|
$self->{error_status_text} = 'URL scheme not allowed'; |
469 |
|
|
470 |
|
return $self; |
471 |
|
} # _get_document |
472 |
|
|
473 |
|
package WDCC::LWPUA; |
474 |
|
require LWP::UserAgent; |
475 |
|
push our @ISA, 'LWP::UserAgent'; |
476 |
|
|
477 |
|
sub redirect_ok { |
478 |
|
my $ua = shift; |
479 |
|
unless ($ua->SUPER::redirect_ok (@_)) { |
480 |
|
return 0; |
481 |
|
} |
482 |
|
|
483 |
|
my $uris = $_[1]->header ('Location'); |
484 |
|
return 0 unless $uris; |
485 |
|
my $uri = $ua->{wdcc_dom}->create_uri_reference ($uris); |
486 |
|
unless ({ |
487 |
|
http => 1, |
488 |
|
}->{lc $uri->uri_scheme}) { ## TODO: html5_url_scheme |
489 |
|
return 0; |
490 |
|
} |
491 |
|
unless ($ua->{wdcc_host_permit}->check ($uri->uri_host, $uri->uri_port || 80)) { |
492 |
|
return 0; |
493 |
|
} |
494 |
|
return 1; |
495 |
|
} # redirect_ok |
496 |
|
|
497 |
1; |
1; |