| 1 | package WebHACC::Input; | 
| 2 | use strict; | 
| 3 |  | 
| 4 | sub new ($) { | 
| 5 | return bless {urls => []}, shift; | 
| 6 | } # new | 
| 7 |  | 
| 8 | sub id_prefix ($) { '' } | 
| 9 |  | 
| 10 | sub nested ($) { 0 } | 
| 11 |  | 
| 12 | sub subdocument_index ($) { 0 } | 
| 13 |  | 
| 14 | sub full_subdocument_index ($) { 0 } | 
| 15 |  | 
| 16 | sub url ($) { | 
| 17 | my $self = shift; | 
| 18 | if (@{$self->{urls}}) { | 
| 19 | return $self->{urls}->[-1]; | 
| 20 | } else { | 
| 21 | return undef; | 
| 22 | } | 
| 23 | } # url | 
| 24 |  | 
| 25 | sub add_url ($$) { | 
| 26 | my ($self, $url) = @_; | 
| 27 | push @{$self->{urls}}, ''.$url; | 
| 28 | } # add_url | 
| 29 |  | 
| 30 | sub urls ($) { | 
| 31 | my $self = shift; | 
| 32 | return [@{$self->{urls}}]; | 
| 33 | } # urls | 
| 34 |  | 
| 35 | sub get_document ($$$$) { | 
| 36 | my $self = shift->new; | 
| 37 |  | 
| 38 | my ($cgi => $result => $out) = @_; | 
| 39 |  | 
| 40 | $out->input ($self); | 
| 41 |  | 
| 42 | require Encode; | 
| 43 | my $url_s = Encode::decode ('utf-8', $cgi->get_parameter ('uri')); | 
| 44 | my $url_o; | 
| 45 | if (defined $url_s and length $url_s) { | 
| 46 | require Message::DOM::DOMImplementation; | 
| 47 | my $dom = Message::DOM::DOMImplementation->new; | 
| 48 |  | 
| 49 | $url_o = $dom->create_uri_reference ($url_s); | 
| 50 | $url_o->uri_fragment (undef); | 
| 51 |  | 
| 52 | $self->add_url ($url_o->uri_reference); | 
| 53 |  | 
| 54 | my $url_scheme = lc $url_o->uri_scheme; ## TODO: html5_url_scheme | 
| 55 | my $class = { | 
| 56 | http => 'WebHACC::Input::HTTP', | 
| 57 | }->{$url_scheme} || 'WebHACC::Input::UnsupportedURLSchemeError'; | 
| 58 | bless $self, $class; | 
| 59 | } else { | 
| 60 | bless $self, 'WebHACC::Input::Text'; | 
| 61 | } | 
| 62 |  | 
| 63 | $self->_get_document ($cgi => $result => $out, $url_o); | 
| 64 |  | 
| 65 | return $self unless defined $self->{s}; | 
| 66 |  | 
| 67 | if (length $self->{s} > 1000_000) { | 
| 68 | $self->{error_status_text} = 'Entity-body too large'; | 
| 69 | delete $self->{s}; | 
| 70 | bless $self, 'WebHACC::Input::Error'; | 
| 71 | return $self; | 
| 72 | } | 
| 73 |  | 
| 74 | require Whatpm::ContentType; | 
| 75 | ($self->{official_type}, $self->{media_type}) | 
| 76 | = Whatpm::ContentType->get_sniffed_type | 
| 77 | (get_file_head => sub { | 
| 78 | return substr $self->{s}, 0, shift; | 
| 79 | }, | 
| 80 | http_content_type_byte => $self->{http_content_type_bytes}, | 
| 81 | supported_image_types => {}); | 
| 82 |  | 
| 83 | my $input_format = $cgi->get_parameter ('i'); | 
| 84 | if (defined $input_format and length $input_format) { | 
| 85 | $self->{media_type_overridden} | 
| 86 | = (not defined $self->{media_type} or | 
| 87 | $input_format ne $self->{media_type}); | 
| 88 | $self->{media_type} = $input_format; | 
| 89 | } | 
| 90 | if (defined $self->{s} and not defined $self->{media_type}) { | 
| 91 | $self->{media_type} = 'text/html'; | 
| 92 | $self->{media_type_overridden} = 1; | 
| 93 | } | 
| 94 |  | 
| 95 | if ($self->{media_type} eq 'text/xml') { | 
| 96 | unless (defined $self->{charset}) { | 
| 97 | $self->{charset} = 'us-ascii'; | 
| 98 | $self->{official_charset} = $self->{charset}; | 
| 99 | } elsif ($self->{charset_overridden} and $self->{charset} eq 'us-ascii') { | 
| 100 | $self->{charset_overridden} = 0; | 
| 101 | } | 
| 102 | } | 
| 103 |  | 
| 104 | $self->{inner_html_element} = $cgi->get_parameter ('e'); | 
| 105 |  | 
| 106 | return $self; | 
| 107 | } # get_document | 
| 108 |  | 
| 109 | sub _get_document ($$$$) { | 
| 110 | die "$0: _get_document of " . ref $_[0]; | 
| 111 | } # _get_document | 
| 112 |  | 
| 113 | sub generate_info_section ($$) { | 
| 114 | my $self = shift; | 
| 115 |  | 
| 116 | my $result = shift; | 
| 117 | my $out = $result->output; | 
| 118 |  | 
| 119 | $out->start_section (id => 'document-info', title => 'Information'); | 
| 120 | $out->start_tag ('dl'); | 
| 121 |  | 
| 122 | my $urls = $self->urls; | 
| 123 |  | 
| 124 | $out->dt (@$urls == 1 ? 'URL' : 'URLs'); | 
| 125 | my $url = pop @$urls; | 
| 126 | for (@$urls) { | 
| 127 | $out->start_tag ('dd'); | 
| 128 | $out->url ($_); | 
| 129 | } | 
| 130 | $out->start_tag ('dd'); | 
| 131 | $out->url ($url, id => 'anchor-document-url'); | 
| 132 | $out->script (q[ | 
| 133 | document.title = '<' | 
| 134 | + document.getElementById ('anchor-document-url').href + '> \\u2014 ' | 
| 135 | + document.title; | 
| 136 | ]); | 
| 137 |  | 
| 138 | if (defined $self->{s}) { | 
| 139 | $out->dt ('Base URL'); | 
| 140 | $out->start_tag ('dd'); | 
| 141 | $out->url ($self->{base_uri}); | 
| 142 |  | 
| 143 | $out->dt ('Internet Media Type'); | 
| 144 | $out->start_tag ('dd'); | 
| 145 | $out->code ($self->{media_type}, class => 'MIME', lang => 'en'); | 
| 146 | if ($self->{media_type_overridden}) { | 
| 147 | $out->nl_text ('... overridden'); | 
| 148 | } elsif (defined $self->{official_type}) { | 
| 149 | if ($self->{media_type} eq $self->{official_type}) { | 
| 150 | # | 
| 151 | } else { | 
| 152 | $out->nl_text ('... sniffed, official type is #', | 
| 153 | text => $self->{official_type}); | 
| 154 | } | 
| 155 | } else { | 
| 156 | $out->nl_text ( '... sniffed'); | 
| 157 | } | 
| 158 |  | 
| 159 | $out->dt ('Character Encoding'); | 
| 160 | $out->start_tag ('dd'); | 
| 161 | if (defined $self->{charset}) { | 
| 162 | $out->code ($self->{charset}, class => 'charset', lang => 'en'); | 
| 163 | } else { | 
| 164 | $out->nl_text ('(unknown)'); | 
| 165 | } | 
| 166 | $out->nl_text ('... overridden') if $self->{charset_overridden}; | 
| 167 |  | 
| 168 | $out->dt ($self->{is_char_string} ? 'Character Length' : 'Byte Length'); | 
| 169 | ## TODO: formatting | 
| 170 | $out->start_tag ('dd'); | 
| 171 | my $length = length $self->{s}; | 
| 172 | $out->text ($length . ' '); | 
| 173 | $out->nl_text (($self->{is_char_string} ? 'character' : 'byte') . | 
| 174 | ($length == 1 ? '' : 's')); | 
| 175 | } | 
| 176 |  | 
| 177 | $out->end_tag ('dl'); | 
| 178 | $out->end_section; | 
| 179 | } # generate_info_section | 
| 180 |  | 
| 181 | sub generate_transfer_sections ($$) { } | 
| 182 |  | 
| 183 | package WebHACC::Input::HTTP; | 
| 184 | push our @ISA, 'WebHACC::Input'; | 
| 185 |  | 
| 186 | { | 
| 187 | my $HostPermit; | 
| 188 | sub host_permit ($) { | 
| 189 | return $HostPermit if $HostPermit; | 
| 190 |  | 
| 191 | require Message::Util::HostPermit; | 
| 192 | $HostPermit = new Message::Util::HostPermit; | 
| 193 | $HostPermit->add_rule (<<'EOH'); | 
| 194 | Allow host=suika port=80 | 
| 195 | Deny host=suika | 
| 196 | Allow host=suika.fam.cx port=80 | 
| 197 | Deny host=suika.fam.cx | 
| 198 | Deny host=localhost | 
| 199 | Deny host=*.localdomain | 
| 200 | Deny ipv4=0.0.0.0/8 | 
| 201 | Deny ipv4=10.0.0.0/8 | 
| 202 | Deny ipv4=127.0.0.0/8 | 
| 203 | Deny ipv4=169.254.0.0/16 | 
| 204 | Deny ipv4=172.0.0.0/11 | 
| 205 | Deny ipv4=192.0.2.0/24 | 
| 206 | Deny ipv4=192.88.99.0/24 | 
| 207 | Deny ipv4=192.168.0.0/16 | 
| 208 | Deny ipv4=198.18.0.0/15 | 
| 209 | Deny ipv4=224.0.0.0/4 | 
| 210 | Deny ipv4=255.255.255.255/32 | 
| 211 | Deny ipv6=0::0/0 | 
| 212 | Allow host=* | 
| 213 | EOH | 
| 214 | return $HostPermit; | 
| 215 | } # host_permit | 
| 216 | } | 
| 217 |  | 
| 218 | sub _get_document ($$$$$) { | 
| 219 | my ($self, $cgi => $result => $out, $url_o) = @_; | 
| 220 |  | 
| 221 | unless ($self->host_permit->check ($url_o->uri_host, $url_o->uri_port || 80)) { | 
| 222 | $self->{error_status_text} = 'Connection to the host is forbidden'; | 
| 223 | bless $self, 'WebHACC::Input::Error'; | 
| 224 | return $self; | 
| 225 | } | 
| 226 |  | 
| 227 | my $ua = WDCC::LWPUA->new; | 
| 228 | $ua->{wdcc_dom} = Message::DOM::DOMImplementation->new; | 
| 229 | $ua->{wdcc_host_permit} = $self->host_permit; | 
| 230 | $ua->agent ('Mozilla'); ## TODO: for now. | 
| 231 | $ua->parse_head (0); | 
| 232 | $ua->protocols_allowed ([qw/http/]); | 
| 233 | $ua->max_size (1000_000); | 
| 234 | my $req = HTTP::Request->new (GET => $url_o->uri_reference); | 
| 235 | $req->header ('Accept-Encoding' => 'identity, *; q=0'); | 
| 236 | my $res = $ua->request ($req); | 
| 237 | ## TODO: 401 sets |is_success| true. | 
| 238 | ## TODO: Don't follow redirect if error-page=true | 
| 239 | if ($res->is_success or $cgi->get_parameter ('error-page')) { | 
| 240 | $self->{base_uri} = $res->base; ## NOTE: It does check |Content-Base|, |Content-Location|, and <base>. ## TODO: Use our own code! | 
| 241 | my $new_url = $res->request->uri; | 
| 242 | $self->add_url ($new_url) if $new_url ne $self->url; | 
| 243 |  | 
| 244 | ## TODO: More strict parsing... | 
| 245 | my $ct = $self->{http_content_type_bytes} = $res->header ('Content-Type'); | 
| 246 | if (defined $ct and $ct =~ /;\s*charset\s*=\s*"?([^\s;"]+)"?/i) { | 
| 247 | $self->{charset} = lc $1; | 
| 248 | $self->{charset} =~ tr/\\//d; | 
| 249 | $self->{official_charset} = $self->{charset}; | 
| 250 | } | 
| 251 |  | 
| 252 | my $input_charset = $cgi->get_parameter ('charset'); | 
| 253 | if (defined $input_charset and length $input_charset) { | 
| 254 | $self->{charset_overridden} | 
| 255 | = (not defined $self->{charset} or $self->{charset} ne $input_charset); | 
| 256 | $self->{charset} = $input_charset; | 
| 257 | } | 
| 258 |  | 
| 259 | ## TODO: Support for HTTP Content-Encoding | 
| 260 |  | 
| 261 | $self->{s} = ''.$res->content; | 
| 262 | } else { | 
| 263 | $self->add_url ($res->request->uri); | 
| 264 | $self->{error_status_text} = $res->status_line; | 
| 265 | bless $self, 'WebHACC::Input::HTTPError'; | 
| 266 | } | 
| 267 |  | 
| 268 | $self->{header_field} = []; | 
| 269 | $res->scan (sub { | 
| 270 | push @{$self->{header_field}}, [$_[0], $_[1]]; | 
| 271 | }); | 
| 272 | $self->{header_status_code} = $res->code; | 
| 273 | $self->{header_status_text} = $res->message; | 
| 274 |  | 
| 275 | return $self; | 
| 276 | } # _get_document | 
| 277 |  | 
| 278 | sub generate_transfer_sections ($$) { | 
| 279 | my $self = shift; | 
| 280 | my $result = shift; | 
| 281 |  | 
| 282 | $result->layer_uncertain ('transfer'); | 
| 283 |  | 
| 284 | $self->generate_http_header_section ($result); | 
| 285 | } # generate_transfer_sections | 
| 286 |  | 
| 287 | sub generate_http_header_section ($$) { | 
| 288 | my ($self, $result) = @_; | 
| 289 |  | 
| 290 | return unless defined $self->{header_status_code} or | 
| 291 | defined $self->{header_status_text} or | 
| 292 | @{$self->{header_field} or []}; | 
| 293 |  | 
| 294 | my $out = $result->output; | 
| 295 |  | 
| 296 | $out->start_section (id => 'source-header', title => 'HTTP Header'); | 
| 297 | $out->html (qq[<p><strong>Note</strong>: Due to the limitation of the | 
| 298 | network library in use, the content of this section might | 
| 299 | not be the real header.</p> | 
| 300 |  | 
| 301 | <table><tbody> | 
| 302 | ]); | 
| 303 |  | 
| 304 | if (defined $self->{header_status_code}) { | 
| 305 | $out->html (qq[<tr><th scope="row">Status code</th>]); | 
| 306 | $out->start_tag ('td'); | 
| 307 | $out->code ($self->{header_status_code}); | 
| 308 | } | 
| 309 | if (defined $self->{header_status_text}) { | 
| 310 | $out->html (qq[<tr><th scope="row">Status text</th>]); | 
| 311 | $out->start_tag ('td'); | 
| 312 | $out->code ($self->{header_status_text}); | 
| 313 | } | 
| 314 |  | 
| 315 | for (@{$self->{header_field}}) { | 
| 316 | $out->start_tag ('tr'); | 
| 317 | $out->start_tag ('th', scope => 'row'); | 
| 318 | $out->code ($_->[0]); | 
| 319 | $out->start_tag ('td'); | 
| 320 | $out->code ($_->[1]); | 
| 321 | } | 
| 322 |  | 
| 323 | $out->end_tag ('table'); | 
| 324 |  | 
| 325 | $out->end_section; | 
| 326 | } # generate_http_header_section | 
| 327 |  | 
| 328 | package WebHACC::Input::Text; | 
| 329 | push our @ISA, 'WebHACC::Input'; | 
| 330 |  | 
| 331 | sub _get_document ($$$$) { | 
| 332 | my ($self, $cgi => $result => $out) = @_; | 
| 333 |  | 
| 334 | $self->add_url (q<thismessage:/>); | 
| 335 | $self->{base_uri} = q<thismessage:/>; | 
| 336 |  | 
| 337 | $self->{s} = ''.$cgi->get_parameter ('s'); | 
| 338 | $self->{charset} = ''.$cgi->get_parameter ('_charset_'); | 
| 339 | $self->{charset} =~ s/\s+//g; | 
| 340 | $self->{charset} = 'utf-8' if $self->{charset} eq ''; | 
| 341 | $self->{official_charset} = $self->{charset}; | 
| 342 | $self->{header_field} = []; | 
| 343 |  | 
| 344 | return $self; | 
| 345 | } # _get_document | 
| 346 |  | 
| 347 | package WebHACC::Input::Subdocument; | 
| 348 | push our @ISA, 'WebHACC::Input'; | 
| 349 |  | 
| 350 | sub new ($$) { | 
| 351 | my $self = shift->SUPER::new; | 
| 352 | $self->{subdocument_index} = shift; | 
| 353 | return $self; | 
| 354 | } # new | 
| 355 |  | 
| 356 | sub id_prefix ($) { | 
| 357 | my $self = shift; | 
| 358 | return $self->{parent_input}->id_prefix . | 
| 359 | 'subdoc-' . $self->{subdocument_index} . '-'; | 
| 360 | } # id_prefix | 
| 361 |  | 
| 362 | sub nested ($) { 1 } | 
| 363 |  | 
| 364 | sub subdocument_index ($) { | 
| 365 | return shift->{subdocument_index}; | 
| 366 | } # subdocument_index | 
| 367 |  | 
| 368 | sub full_subdocument_index ($) { | 
| 369 | my $self = shift; | 
| 370 | my $parent = $self->{parent_input}->full_subdocument_index; | 
| 371 | if ($parent) { | 
| 372 | return $parent . '.' . $self->{subdocument_index}; | 
| 373 | } else { | 
| 374 | return $self->{subdocument_index}; | 
| 375 | } | 
| 376 | } # full_subdocument_index | 
| 377 |  | 
| 378 | sub start_section ($$) { | 
| 379 | my $self = shift; | 
| 380 |  | 
| 381 | my $result = shift; | 
| 382 | my $out = $result->output; | 
| 383 |  | 
| 384 | my $index = $self->subdocument_index; | 
| 385 | $out->start_section (id => my $id = 'subdoc-' . $index . '-', | 
| 386 | title => qq[Subdocument #], | 
| 387 | short_title => 'Sub #', | 
| 388 | role => 'subdoc', | 
| 389 | text => $self->full_subdocument_index); | 
| 390 | $out->script (q[ insertNavSections ('] . $out->input->id_prefix . $id . q[') ]); | 
| 391 | } # start_section | 
| 392 |  | 
| 393 | sub end_section ($$) { | 
| 394 | $_[1]->output->end_section; | 
| 395 | } # end_section | 
| 396 |  | 
| 397 | sub generate_info_section ($$) { | 
| 398 | my $self = shift; | 
| 399 |  | 
| 400 | my $result = shift; | 
| 401 | my $out = $result->output; | 
| 402 |  | 
| 403 | $out->start_section (id => 'document-info', title => 'Information'); | 
| 404 | $out->start_tag ('dl'); | 
| 405 |  | 
| 406 | $out->dt ('Internet Media Type'); | 
| 407 | $out->start_tag ('dd'); | 
| 408 | $out->code ($self->{media_type}, code => 'MIME', lang => 'en'); | 
| 409 |  | 
| 410 | if (defined $self->{container_node}) { | 
| 411 | $out->dt ('Container Node'); | 
| 412 | $out->start_tag ('dd'); | 
| 413 | my $original_input = $out->input; | 
| 414 | $out->input ($self->{parent_input}); | 
| 415 | $out->node_link ($self->{container_node}); | 
| 416 | $out->input ($original_input); | 
| 417 | } | 
| 418 |  | 
| 419 | $out->dt ('Base URL'); | 
| 420 | $out->start_tag ('dd'); | 
| 421 | $out->url ($self->{base_uri}); | 
| 422 |  | 
| 423 | $out->end_tag ('dl'); | 
| 424 | $out->end_section; | 
| 425 | } # generate_info_section | 
| 426 |  | 
| 427 | package WebHACC::Input::Error; | 
| 428 | push our @ISA, 'WebHACC::Input'; | 
| 429 |  | 
| 430 | sub generate_transfer_sections ($$) { | 
| 431 | my $self = shift; | 
| 432 |  | 
| 433 | my $result = shift; | 
| 434 | my $out = $result->output; | 
| 435 |  | 
| 436 | $out->start_section (role => 'transfer-errors'); | 
| 437 | $out->start_error_list (role => 'transfer-errors'); | 
| 438 |  | 
| 439 | $result->layer_applicable ('transfer'); | 
| 440 | $result->add_error (layer => 'transfer', | 
| 441 | level => 'u', | 
| 442 | type => 'resource retrieval error', | 
| 443 | url => $self->{request_uri}, | 
| 444 | text => $self->{error_status_text}); | 
| 445 |  | 
| 446 | $out->end_error_list (role => 'transfer-errors'); | 
| 447 | $out->end_section; | 
| 448 | } # generate_transfer_sections | 
| 449 |  | 
| 450 | package WebHACC::Input::HTTPError; | 
| 451 | push our @ISA, 'WebHACC::Input::Error', 'WebHACC::Input::HTTP'; | 
| 452 |  | 
| 453 | sub generate_transfer_sections ($$) { | 
| 454 | my $self = shift; | 
| 455 |  | 
| 456 | my $result = shift; | 
| 457 |  | 
| 458 | $self->WebHACC::Input::Error::generate_transfer_sections ($result); | 
| 459 | $self->WebHACC::Input::HTTP::generate_transfer_sections ($result); | 
| 460 |  | 
| 461 | } # generate_transfer_sections | 
| 462 |  | 
| 463 | package WebHACC::Input::UnsupportedURLSchemeError; | 
| 464 | push our @ISA, 'WebHACC::Input::Error'; | 
| 465 |  | 
| 466 | sub _get_document ($$$$) { | 
| 467 | my ($self, $cgi => $result => $out) = @_; | 
| 468 |  | 
| 469 | $self->{error_status_text} = 'URL scheme not allowed'; | 
| 470 |  | 
| 471 | return $self; | 
| 472 | } # _get_document | 
| 473 |  | 
| 474 | package WDCC::LWPUA; | 
| 475 | require LWP::UserAgent; | 
| 476 | push our @ISA, 'LWP::UserAgent'; | 
| 477 |  | 
| 478 | sub redirect_ok { | 
| 479 | my $ua = shift; | 
| 480 | unless ($ua->SUPER::redirect_ok (@_)) { | 
| 481 | return 0; | 
| 482 | } | 
| 483 |  | 
| 484 | my $uris = $_[1]->header ('Location'); | 
| 485 | return 0 unless $uris; | 
| 486 | my $uri = $ua->{wdcc_dom}->create_uri_reference ($uris); | 
| 487 | unless ({ | 
| 488 | http => 1, | 
| 489 | }->{lc $uri->uri_scheme}) { ## TODO: html5_url_scheme | 
| 490 | return 0; | 
| 491 | } | 
| 492 | unless ($ua->{wdcc_host_permit}->check ($uri->uri_host, $uri->uri_port || 80)) { | 
| 493 | return 0; | 
| 494 | } | 
| 495 | return 1; | 
| 496 | } # redirect_ok | 
| 497 |  | 
| 498 | 1; |