#!/usr/bin/perl use strict; use lib q; use Encode; use Whatpm::HTML; use Whatpm::NanoDOM; use File::Path; our $UA; our $data_dir_name; my @WGET = (qw/wget/, '-U' => $UA, '-t' => 5); my $list_dir_name = $data_dir_name . q; my $log_dir_name = $data_dir_name . q; my $get_log_file_name = $log_dir_name . q; mkpath $list_dir_name; mkpath $log_dir_name; open STDERR, '>>', $get_log_file_name; our @keyword; for my $i (0..$#keyword) { my $qkeyword = Encode::encode ('utf-8', $keyword[$i]); $qkeyword =~ s/([^0-9A-Za-z_.~-])/sprintf '%%%02X', ord $1/ge; my $top10_file_name_stem = qq<$list_dir_name$i-top10>; my $last_file_name_stem = qq<$list_dir_name$i-last>; my $last10_file_name_stem = qq<$list_dir_name$i-last10>; my $page_dir_name = qq<$data_dir_name$i/>; mkpath $page_dir_name; qkeyword_to_searchresult ($qkeyword, 0 => $top10_file_name_stem); searchresult_to_urilist ($top10_file_name_stem); urilist_to_files ($top10_file_name_stem => $page_dir_name . q); qkeyword_to_searchresult ($qkeyword, 990 => $last_file_name_stem); searchresultlast_to_urilistlast10 ($qkeyword, $last_file_name_stem => $last10_file_name_stem); urilist_to_files ($last10_file_name_stem => $page_dir_name . q); } sub qkeyword_to_searchresult ($$$) { my ($qkeyword, $start => $file_name_stem) = @_; my $uri = qq; $uri .= q<&start=> . $start if $start; system @WGET, '-O' => "$file_name_stem.html", $uri; } # keyword_to_searchresult sub searchresultlast_to_urilistlast10 ($$) { my ($qkeyword, $last_file_name_stem => $last10_file_name_stem) = @_; searchresult_to_urilist ($last_file_name_stem); open my $file, '<', "$last_file_name_stem.list" or die "$0: $last_file_name_stem.list: $!"; my @uri = <$file>; my $start = 990 - (10 - @uri); if ($start != 990) { qkeyword_to_searchresult ($qkeyword, $start => $last10_file_name_stem); searchresult_to_urilist ($last10_file_name_stem); } else { system 'cp', "$last_file_name_stem.list" => "$last10_file_name_stem.list"; } } # searchresultlast_to_urilistlast10 sub searchresult_to_urilist ($) { my $file_name_stem = shift; open my $html, '<', "$file_name_stem.html" or die "$0: $file_name_stem.html: $!"; local $/ = undef; my $doc = Whatpm::HTML->parse_string (scalar <$html> => Whatpm::NanoDOM::Document->new, sub { }); close $html; my @uri; my @node = @{$doc->child_nodes}; while (@node) { my $node = shift @node; if ($node->node_type == 1) { if ($node->namespace_uri eq q and $node->manakai_local_name eq 'a') { if ($node->get_attribute_ns (undef, 'class') eq 'l') { push @uri, $node->get_attribute_ns (undef, 'href'); } } } push @node, @{$node->child_nodes}; } open my $urilist_file, '>', "$file_name_stem.list" or die "$0: $file_name_stem.list: $!"; print $urilist_file join "\n", @uri; print $urilist_file "\n"; close $urilist_file; } # searchresult_to_urilist sub urilist_to_files ($) { my ($urilist_file_name_stem, $file_name_prefix) = @_; open my $urilist_file, '<', "$urilist_file_name_stem.list" or die "$0: $urilist_file_name_stem.list: $!"; my $i = 0; while (<$urilist_file>) { tr/\x0D\x0A//d; get_uri ($_ => "$file_name_prefix$i.html"); $i++; } close $urilist_file; } # urilist_to_files sub get_uri ($$) { my ($uri => $file_name) = @_; system @WGET, '-s', '-O' => $file_name, $uri; } # get_uri =head1 AUTHOR Wakaba . =head1 LICENSE Copyright 2007 Wakaba This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; ## $Date: 2007/06/02 12:12:28 $