/[suikacvs]/webroot/www/ja1200/get/get.pl
Suika

Contents of /webroot/www/ja1200/get/get.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Sat Jun 2 12:12:28 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
New

1 #!/usr/bin/perl
2 use strict;
3
4 use lib q</home/httpd/html/www/markup/html/whatpm/>;
5
6 use Encode;
7 use Whatpm::HTML;
8 use Whatpm::NanoDOM;
9 use File::Path;
10
11 our $UA;
12 our $data_dir_name;
13 my @WGET = (qw/wget/, '-U' => $UA, '-t' => 5);
14 my $list_dir_name = $data_dir_name . q<list/>;
15 my $log_dir_name = $data_dir_name . q<log/>;
16 my $get_log_file_name = $log_dir_name . q<get.log>;
17
18 mkpath $list_dir_name;
19 mkpath $log_dir_name;
20
21 open STDERR, '>>', $get_log_file_name;
22
23 our @keyword;
24
25 for my $i (0..$#keyword) {
26 my $qkeyword = Encode::encode ('utf-8', $keyword[$i]);
27 $qkeyword =~ s/([^0-9A-Za-z_.~-])/sprintf '%%%02X', ord $1/ge;
28
29 my $top10_file_name_stem = qq<$list_dir_name$i-top10>;
30 my $last_file_name_stem = qq<$list_dir_name$i-last>;
31 my $last10_file_name_stem = qq<$list_dir_name$i-last10>;
32 my $page_dir_name = qq<$data_dir_name$i/>;
33 mkpath $page_dir_name;
34
35 qkeyword_to_searchresult ($qkeyword, 0 => $top10_file_name_stem);
36 searchresult_to_urilist ($top10_file_name_stem);
37 urilist_to_files ($top10_file_name_stem => $page_dir_name . q<t->);
38
39 qkeyword_to_searchresult ($qkeyword, 990 => $last_file_name_stem);
40 searchresultlast_to_urilistlast10
41 ($qkeyword, $last_file_name_stem => $last10_file_name_stem);
42 urilist_to_files ($last10_file_name_stem => $page_dir_name . q<l->);
43 }
44
45 sub qkeyword_to_searchresult ($$$) {
46 my ($qkeyword, $start => $file_name_stem) = @_;
47 my $uri = qq<http://www.google.co.jp/search?q=$qkeyword&ie=utf-8&oe=utf-8&hl=ja&lr=lang_ja>;
48 $uri .= q<&start=> . $start if $start;
49 system @WGET, '-O' => "$file_name_stem.html", $uri;
50 } # keyword_to_searchresult
51
52 sub searchresultlast_to_urilistlast10 ($$) {
53 my ($qkeyword, $last_file_name_stem => $last10_file_name_stem) = @_;
54 searchresult_to_urilist ($last_file_name_stem);
55 open my $file, '<', "$last_file_name_stem.list"
56 or die "$0: $last_file_name_stem.list: $!";
57 my @uri = <$file>;
58 my $start = 990 - (10 - @uri);
59 if ($start != 990) {
60 qkeyword_to_searchresult ($qkeyword, $start => $last10_file_name_stem);
61 searchresult_to_urilist ($last10_file_name_stem);
62 } else {
63 system 'cp', "$last_file_name_stem.list" => "$last10_file_name_stem.list";
64 }
65 } # searchresultlast_to_urilistlast10
66
67 sub searchresult_to_urilist ($) {
68 my $file_name_stem = shift;
69
70 open my $html, '<', "$file_name_stem.html"
71 or die "$0: $file_name_stem.html: $!";
72 local $/ = undef;
73 my $doc = Whatpm::HTML->parse_string
74 (scalar <$html> => Whatpm::NanoDOM::Document->new, sub { });
75 close $html;
76
77 my @uri;
78 my @node = @{$doc->child_nodes};
79 while (@node) {
80 my $node = shift @node;
81 if ($node->node_type == 1) {
82 if ($node->namespace_uri eq q<http://www.w3.org/1999/xhtml> and
83 $node->manakai_local_name eq 'a') {
84 if ($node->get_attribute_ns (undef, 'class') eq 'l') {
85 push @uri, $node->get_attribute_ns (undef, 'href');
86 }
87 }
88 }
89 push @node, @{$node->child_nodes};
90 }
91
92 open my $urilist_file, '>', "$file_name_stem.list"
93 or die "$0: $file_name_stem.list: $!";
94 print $urilist_file join "\n", @uri;
95 print $urilist_file "\n";
96 close $urilist_file;
97 } # searchresult_to_urilist
98
99 sub urilist_to_files ($) {
100 my ($urilist_file_name_stem, $file_name_prefix) = @_;
101 open my $urilist_file, '<', "$urilist_file_name_stem.list"
102 or die "$0: $urilist_file_name_stem.list: $!";
103 my $i = 0;
104 while (<$urilist_file>) {
105 tr/\x0D\x0A//d;
106 get_uri ($_ => "$file_name_prefix$i.html");
107 $i++;
108 }
109 close $urilist_file;
110 } # urilist_to_files
111
112 sub get_uri ($$) {
113 my ($uri => $file_name) = @_;
114 system @WGET, '-s', '-O' => $file_name, $uri;
115 } # get_uri
116
117 =head1 AUTHOR
118
119 Wakaba <w@suika.fam.cx>.
120
121 =head1 LICENSE
122
123 Copyright 2007 Wakaba <w@suika.fam.cx>
124
125 This library is free software; you can redistribute it
126 and/or modify it under the same terms as Perl itself.
127
128 =cut
129
130 1;
131 ## $Date:$
132

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24