/[suikacvs]/webroot/www/ja1200/get/get.pl
Suika

Contents of /webroot/www/ja1200/get/get.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download)
Sat Jun 2 12:12:28 2007 UTC (18 years ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
New

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4     use lib q</home/httpd/html/www/markup/html/whatpm/>;
5    
6     use Encode;
7     use Whatpm::HTML;
8     use Whatpm::NanoDOM;
9     use File::Path;
10    
11     our $UA;
12     our $data_dir_name;
13     my @WGET = (qw/wget/, '-U' => $UA, '-t' => 5);
14     my $list_dir_name = $data_dir_name . q<list/>;
15     my $log_dir_name = $data_dir_name . q<log/>;
16     my $get_log_file_name = $log_dir_name . q<get.log>;
17    
18     mkpath $list_dir_name;
19     mkpath $log_dir_name;
20    
21     open STDERR, '>>', $get_log_file_name;
22    
23     our @keyword;
24    
25     for my $i (0..$#keyword) {
26     my $qkeyword = Encode::encode ('utf-8', $keyword[$i]);
27     $qkeyword =~ s/([^0-9A-Za-z_.~-])/sprintf '%%%02X', ord $1/ge;
28    
29     my $top10_file_name_stem = qq<$list_dir_name$i-top10>;
30     my $last_file_name_stem = qq<$list_dir_name$i-last>;
31     my $last10_file_name_stem = qq<$list_dir_name$i-last10>;
32     my $page_dir_name = qq<$data_dir_name$i/>;
33     mkpath $page_dir_name;
34    
35     qkeyword_to_searchresult ($qkeyword, 0 => $top10_file_name_stem);
36     searchresult_to_urilist ($top10_file_name_stem);
37     urilist_to_files ($top10_file_name_stem => $page_dir_name . q<t->);
38    
39     qkeyword_to_searchresult ($qkeyword, 990 => $last_file_name_stem);
40     searchresultlast_to_urilistlast10
41     ($qkeyword, $last_file_name_stem => $last10_file_name_stem);
42     urilist_to_files ($last10_file_name_stem => $page_dir_name . q<l->);
43     }
44    
45     sub qkeyword_to_searchresult ($$$) {
46     my ($qkeyword, $start => $file_name_stem) = @_;
47     my $uri = qq<http://www.google.co.jp/search?q=$qkeyword&ie=utf-8&oe=utf-8&hl=ja&lr=lang_ja>;
48     $uri .= q<&start=> . $start if $start;
49     system @WGET, '-O' => "$file_name_stem.html", $uri;
50     } # keyword_to_searchresult
51    
52     sub searchresultlast_to_urilistlast10 ($$) {
53     my ($qkeyword, $last_file_name_stem => $last10_file_name_stem) = @_;
54     searchresult_to_urilist ($last_file_name_stem);
55     open my $file, '<', "$last_file_name_stem.list"
56     or die "$0: $last_file_name_stem.list: $!";
57     my @uri = <$file>;
58     my $start = 990 - (10 - @uri);
59     if ($start != 990) {
60     qkeyword_to_searchresult ($qkeyword, $start => $last10_file_name_stem);
61     searchresult_to_urilist ($last10_file_name_stem);
62     } else {
63     system 'cp', "$last_file_name_stem.list" => "$last10_file_name_stem.list";
64     }
65     } # searchresultlast_to_urilistlast10
66    
67     sub searchresult_to_urilist ($) {
68     my $file_name_stem = shift;
69    
70     open my $html, '<', "$file_name_stem.html"
71     or die "$0: $file_name_stem.html: $!";
72     local $/ = undef;
73     my $doc = Whatpm::HTML->parse_string
74     (scalar <$html> => Whatpm::NanoDOM::Document->new, sub { });
75     close $html;
76    
77     my @uri;
78     my @node = @{$doc->child_nodes};
79     while (@node) {
80     my $node = shift @node;
81     if ($node->node_type == 1) {
82     if ($node->namespace_uri eq q<http://www.w3.org/1999/xhtml> and
83     $node->manakai_local_name eq 'a') {
84     if ($node->get_attribute_ns (undef, 'class') eq 'l') {
85     push @uri, $node->get_attribute_ns (undef, 'href');
86     }
87     }
88     }
89     push @node, @{$node->child_nodes};
90     }
91    
92     open my $urilist_file, '>', "$file_name_stem.list"
93     or die "$0: $file_name_stem.list: $!";
94     print $urilist_file join "\n", @uri;
95     print $urilist_file "\n";
96     close $urilist_file;
97     } # searchresult_to_urilist
98    
99     sub urilist_to_files ($) {
100     my ($urilist_file_name_stem, $file_name_prefix) = @_;
101     open my $urilist_file, '<', "$urilist_file_name_stem.list"
102     or die "$0: $urilist_file_name_stem.list: $!";
103     my $i = 0;
104     while (<$urilist_file>) {
105     tr/\x0D\x0A//d;
106     get_uri ($_ => "$file_name_prefix$i.html");
107     $i++;
108     }
109     close $urilist_file;
110     } # urilist_to_files
111    
112     sub get_uri ($$) {
113     my ($uri => $file_name) = @_;
114     system @WGET, '-s', '-O' => $file_name, $uri;
115     } # get_uri
116    
117     =head1 AUTHOR
118    
119     Wakaba <w@suika.fam.cx>.
120    
121     =head1 LICENSE
122    
123     Copyright 2007 Wakaba <w@suika.fam.cx>
124    
125     This library is free software; you can redistribute it
126     and/or modify it under the same terms as Perl itself.
127    
128     =cut
129    
130     1;
131     ## $Date:$
132    

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24