/[suikacvs]/webroot/www/ja1200/get/gettop.pl
Suika

Contents of /webroot/www/ja1200/get/gettop.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Sat Jun 9 07:56:19 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
New scripts for keitai sites; Report for attribute values

1 #!/usr/bin/perl
2 use strict;
3
4 use lib q</home/httpd/html/www/markup/html/whatpm/>;
5
6 use Encode;
7 use Whatpm::HTML;
8 use Whatpm::NanoDOM;
9 use File::Path;
10
11 our $UA;
12 our $data_dir_name;
13 my @WGET = (qw/wget/, '-U' => $UA, '-t' => 5);
14 my $list_dir_name = $data_dir_name . q<list/>;
15 my $log_dir_name = $data_dir_name . q<log/>;
16 my $get_log_file_name = $log_dir_name . q<get.log>;
17
18 mkpath $list_dir_name;
19 mkpath $log_dir_name;
20
21 open STDERR, '>>', $get_log_file_name;
22
23 our @keyword;
24
25 for my $i (0..$#keyword) {
26 my $qkeyword = Encode::encode ('utf-8', $keyword[$i]);
27 $qkeyword =~ s/([^0-9A-Za-z_.~-])/sprintf '%%%02X', ord $1/ge;
28
29 my $top10_file_name_stem = qq<$list_dir_name$i-top10>;
30 my $page_dir_name = qq<$data_dir_name$i/>;
31 mkpath $page_dir_name;
32
33 for my $start (0, 10, 20, 30, 40, 50, 60, 70, 80, 90) {
34 my $file_name_stem = qq<$list_dir_name$i-$start>;
35 qkeyword_to_searchresult ($qkeyword, $start => $file_name_stem);
36 searchresult_to_urilist ($file_name_stem);
37 urilist_to_files ($file_name_stem => $page_dir_name . $start . q<->);
38 }
39 }
40
41 our $UseGoogleMobile;
42 sub qkeyword_to_searchresult ($$$) {
43 my ($qkeyword, $start => $file_name_stem) = @_;
44 my $uri = $UseGoogleMobile
45 ? qq<http://www.google.co.jp/m/search?site=mobile&q=$qkeyword&sa=N>
46 : qq<http://www.google.co.jp/search?q=$qkeyword&ie=utf-8&oe=utf-8&hl=ja&lr=lang_ja>;
47 $uri .= q<&start=> . $start if $start;
48 system @WGET, '-O' => "$file_name_stem.html", $uri;
49 } # keyword_to_searchresult
50
51 sub searchresult_to_urilist ($) {
52 my $file_name_stem = shift;
53
54 open my $html, '<', "$file_name_stem.html"
55 or die "$0: $file_name_stem.html: $!";
56 local $/ = undef;
57 my $doc = Whatpm::HTML->parse_string
58 (scalar <$html> => Whatpm::NanoDOM::Document->new, sub { });
59 close $html;
60
61 my @uri;
62 my @node = @{$doc->child_nodes};
63 while (@node) {
64 my $node = shift @node;
65 if ($node->node_type == 1) {
66 if ($node->namespace_uri eq q<http://www.w3.org/1999/xhtml> and
67 $node->manakai_local_name eq 'a') {
68 if ($node->has_attribute_ns (undef, 'accesskey')) { ## Google Mobile
69 push @uri, $node->get_attribute_ns (undef, 'href');
70 }
71 }
72 }
73 push @node, @{$node->child_nodes};
74 }
75
76 open my $urilist_file, '>', "$file_name_stem.list"
77 or die "$0: $file_name_stem.list: $!";
78 print $urilist_file join "\n", @uri;
79 print $urilist_file "\n";
80 close $urilist_file;
81 } # searchresult_to_urilist
82
83 sub urilist_to_files ($) {
84 my ($urilist_file_name_stem, $file_name_prefix) = @_;
85 open my $urilist_file, '<', "$urilist_file_name_stem.list"
86 or die "$0: $urilist_file_name_stem.list: $!";
87 my $i = 0;
88 while (<$urilist_file>) {
89 tr/\x0D\x0A//d;
90 get_uri ($_ => "$file_name_prefix$i.html");
91 $i++;
92 }
93 close $urilist_file;
94 } # urilist_to_files
95
96 sub get_uri ($$) {
97 my ($uri => $file_name) = @_;
98 system @WGET, '-s', '-O' => $file_name, $uri;
99 } # get_uri
100
101 =head1 AUTHOR
102
103 Wakaba <w@suika.fam.cx>.
104
105 =head1 LICENSE
106
107 Copyright 2007 Wakaba <w@suika.fam.cx>
108
109 This library is free software; you can redistribute it
110 and/or modify it under the same terms as Perl itself.
111
112 =cut
113
114 1;
115 ## $Date: 2007/06/02 12:12:28 $
116

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24