#!/usr/bin/perl
use strict;
use lib q;
use Encode;
use Whatpm::HTML;
use Whatpm::NanoDOM;
use File::Path;
our $UA;
our $data_dir_name;
my @WGET = (qw/wget/, '-U' => $UA, '-t' => 5);
my $list_dir_name = $data_dir_name . q
;
my $log_dir_name = $data_dir_name . q;
my $get_log_file_name = $log_dir_name . q;
mkpath $list_dir_name;
mkpath $log_dir_name;
open STDERR, '>>', $get_log_file_name;
our @keyword;
for my $i (0..$#keyword) {
my $qkeyword = Encode::encode ('utf-8', $keyword[$i]);
$qkeyword =~ s/([^0-9A-Za-z_.~-])/sprintf '%%%02X', ord $1/ge;
my $top10_file_name_stem = qq<$list_dir_name$i-top10>;
my $last_file_name_stem = qq<$list_dir_name$i-last>;
my $last10_file_name_stem = qq<$list_dir_name$i-last10>;
my $page_dir_name = qq<$data_dir_name$i/>;
mkpath $page_dir_name;
qkeyword_to_searchresult ($qkeyword, 0 => $top10_file_name_stem);
searchresult_to_urilist ($top10_file_name_stem);
urilist_to_files ($top10_file_name_stem => $page_dir_name . q);
qkeyword_to_searchresult ($qkeyword, 990 => $last_file_name_stem);
searchresultlast_to_urilistlast10
($qkeyword, $last_file_name_stem => $last10_file_name_stem);
urilist_to_files ($last10_file_name_stem => $page_dir_name . q);
}
sub qkeyword_to_searchresult ($$$) {
my ($qkeyword, $start => $file_name_stem) = @_;
my $uri = qq;
$uri .= q<&start=> . $start if $start;
system @WGET, '-O' => "$file_name_stem.html", $uri;
} # keyword_to_searchresult
sub searchresultlast_to_urilistlast10 ($$) {
my ($qkeyword, $last_file_name_stem => $last10_file_name_stem) = @_;
searchresult_to_urilist ($last_file_name_stem);
open my $file, '<', "$last_file_name_stem.list"
or die "$0: $last_file_name_stem.list: $!";
my @uri = <$file>;
my $start = 990 - (10 - @uri);
if ($start != 990) {
qkeyword_to_searchresult ($qkeyword, $start => $last10_file_name_stem);
searchresult_to_urilist ($last10_file_name_stem);
} else {
system 'cp', "$last_file_name_stem.list" => "$last10_file_name_stem.list";
}
} # searchresultlast_to_urilistlast10
sub searchresult_to_urilist ($) {
my $file_name_stem = shift;
open my $html, '<', "$file_name_stem.html"
or die "$0: $file_name_stem.html: $!";
local $/ = undef;
my $doc = Whatpm::HTML->parse_string
(scalar <$html> => Whatpm::NanoDOM::Document->new, sub { });
close $html;
my @uri;
my @node = @{$doc->child_nodes};
while (@node) {
my $node = shift @node;
if ($node->node_type == 1) {
if ($node->namespace_uri eq q and
$node->manakai_local_name eq 'a') {
if ($node->get_attribute_ns (undef, 'class') eq 'l') {
push @uri, $node->get_attribute_ns (undef, 'href');
}
}
}
push @node, @{$node->child_nodes};
}
open my $urilist_file, '>', "$file_name_stem.list"
or die "$0: $file_name_stem.list: $!";
print $urilist_file join "\n", @uri;
print $urilist_file "\n";
close $urilist_file;
} # searchresult_to_urilist
sub urilist_to_files ($) {
my ($urilist_file_name_stem, $file_name_prefix) = @_;
open my $urilist_file, '<', "$urilist_file_name_stem.list"
or die "$0: $urilist_file_name_stem.list: $!";
my $i = 0;
while (<$urilist_file>) {
tr/\x0D\x0A//d;
get_uri ($_ => "$file_name_prefix$i.html");
$i++;
}
close $urilist_file;
} # urilist_to_files
sub get_uri ($$) {
my ($uri => $file_name) = @_;
system @WGET, '-s', '-O' => $file_name, $uri;
} # get_uri
=head1 AUTHOR
Wakaba .
=head1 LICENSE
Copyright 2007 Wakaba
This library is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
=cut
1;
## $Date: 2007/06/02 12:12:28 $