1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
|
4 |
use lib qw[/home/httpd/html/www/markup/html/whatpm/]; |
5 |
|
6 |
use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/; |
7 |
use Whatpm::HTML; |
8 |
|
9 |
our $target = shift; |
10 |
our $code = sub { |
11 |
my ($entity, $file_name) = @_; |
12 |
my $http_charset; |
13 |
if ($entity->{field}->{'content-type'}) { |
14 |
if ($entity->{field}->{'content-type'}->[0] =~ /charset\s*=\s*([^\s;]+)/i) { |
15 |
my $charset = $1; |
16 |
if ($charset =~ /euc/i) { |
17 |
$http_charset = 'euc-jp'; |
18 |
} elsif ($charset =~ /shift/i or $charset =~ /sjis/i or |
19 |
$charset =~ /ms932/i or $charset =~ /cp943/i or |
20 |
$charset =~ /windows-31j/i) { |
21 |
$http_charset = 'shift_jis'; |
22 |
} elsif ($charset =~ /none/i or $charset =~ /unknown/i or |
23 |
$charset =~ /jp/i or $charset eq '0') { |
24 |
$http_charset = 'shift_jis' if $target =~ /keitai/i; |
25 |
} else { |
26 |
$http_charset = $charset; |
27 |
} |
28 |
} else { |
29 |
$http_charset = 'shift_jis' if $target =~ /keitai/; |
30 |
} |
31 |
} |
32 |
my $decoder = $http_charset |
33 |
? Encode::find_encoding ($http_charset) |
34 |
: Encode::Guess->guess ($entity->{body}); |
35 |
unless (ref $decoder) { |
36 |
warn "$0: $file_name: $decoder\n"; |
37 |
$decoder = Encode::find_encoding ('shiftjis'); |
38 |
} |
39 |
my $data = $decoder->decode ($entity->{body}, 0); |
40 |
tokenize ($data); |
41 |
}; |
42 |
|
43 |
my $Total; |
44 |
my $Page; |
45 |
|
46 |
$SIG{INT} = \&print_result; |
47 |
|
48 |
require 'foreach.pl'; |
49 |
|
50 |
print_result (); |
51 |
|
52 |
sub tokenize ($) { |
53 |
my $s = \($_[0]); |
54 |
my $p = Whatpm::HTML->new; |
55 |
my $i = 0; |
56 |
$p->{set_next_input_character} = sub { |
57 |
my $self = shift; |
58 |
$self->{next_input_character} = -1 and return if $i >= length $$s; |
59 |
$self->{next_input_character} = ord substr $$s, $i++, 1; |
60 |
|
61 |
if ($self->{next_input_character} == 0x000D) { # CR |
62 |
if ($i >= length $$s) { |
63 |
# |
64 |
} else { |
65 |
my $next_char = ord substr $$s, $i++, 1; |
66 |
if ($next_char == 0x000A) { # LF |
67 |
# |
68 |
} else { |
69 |
push @{$self->{char}}, $next_char; |
70 |
} |
71 |
} |
72 |
$self->{next_input_character} = 0x000A; # LF # MUST |
73 |
} elsif ($self->{next_input_character} > 0x10FFFF) { |
74 |
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
75 |
} elsif ($self->{next_input_character} == 0x0000) { # NULL |
76 |
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
77 |
} |
78 |
}; |
79 |
|
80 |
my @token; |
81 |
$p->{parse_error} = sub { |
82 |
push @token, 'ParseError'; |
83 |
}; |
84 |
|
85 |
$p->_initialize_tokenizer; |
86 |
|
87 |
my $start_tag_name = {}; |
88 |
my $end_tag_name = {}; |
89 |
my $attr; |
90 |
my $value; |
91 |
my $value2; # case insensitive |
92 |
|
93 |
while (1) { |
94 |
my $token = $p->_get_next_token; |
95 |
last if $token->{type} eq 'end-of-file'; |
96 |
|
97 |
if ($token->{type} eq 'start tag') { |
98 |
if ({ |
99 |
title => 1, |
100 |
textarea => 1, |
101 |
}->{$token->{tag_name}}) { |
102 |
$p->{content_model} = Whatpm::HTML::RCDATA_CONTENT_MODEL (); |
103 |
} elsif ({ |
104 |
style => 1, |
105 |
script => 1, |
106 |
xmp => 1, |
107 |
noframes => 1, |
108 |
noembed => 1, |
109 |
noscript => 1, |
110 |
iframe => 1, |
111 |
}->{$token->{tag_name}}) { |
112 |
$p->{content_model} = Whatpm::HTML::CDATA_CONTENT_MODEL (); |
113 |
} elsif ($token->{tag_name} eq 'plaintext') { |
114 |
$p->{content_model} = Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (); |
115 |
} |
116 |
$p->{last_emitted_start_tag_name} = $token->{tag_name}; |
117 |
} |
118 |
|
119 |
if ($token->{type} eq 'start tag') { |
120 |
$start_tag_name->{$token->{tag_name}}++; |
121 |
$start_tag_name->{'*'}++; |
122 |
for my $attr_name (keys %{$token->{attributes}}) { |
123 |
$attr->{$token->{tag_name}}->{$attr_name}++; |
124 |
$attr->{'*'}->{$attr_name}++; |
125 |
if (my $v = { |
126 |
a => {charset => 2, directkey => 1, hreflang => 2, |
127 |
ijam => 1, media => 2, name => 2, |
128 |
rel => 2, rev => 2, shape => 2, target => 2, type => 2}, |
129 |
applet => {align => 2, name => 2}, |
130 |
area => {charset => 2, hreflang => 2, media => 2, name => 2, |
131 |
rel => 2, rev => 2, shape => 2, target => 2, type => 2}, |
132 |
base => {target => 2}, |
133 |
basefont => {color => 2, face => 2, size => 1}, |
134 |
bgsound => {loop => 1}, |
135 |
body => {bgproperties => 2, scroll => 2}, |
136 |
button => {name => 2, type => 2}, |
137 |
caption => {align => 2}, |
138 |
col => {align => 2, span => 1, valign => 2}, |
139 |
colgroup => {align => 2, span => 1, valign => 2}, |
140 |
del => {datetime => 1}, |
141 |
div => {align => 2, mode => 2}, |
142 |
embed => {align => 2, allowscriptaccess => 2, autostart => 2, |
143 |
loop => 2, name => 2, quality => 2, |
144 |
showcontrols => 2, type => 2}, |
145 |
font => {face => 2, size => 1}, |
146 |
form => {accept => 2, 'accept-charset' => 2, enctype => 2, |
147 |
method => 2, name => 2}, |
148 |
frame => {name => 2, scrolling => 2}, |
149 |
h1 => {align => 2}, |
150 |
h2 => {align => 2}, |
151 |
h3 => {align => 2}, |
152 |
h4 => {align => 2}, |
153 |
h5 => {align => 2}, |
154 |
h6 => {align => 2}, |
155 |
head => {profile => 1}, |
156 |
hr => {align => 2}, |
157 |
html => {version => 1, xmlns => 1}, |
158 |
iframe => {align => 2, name => 2, scrolling => 2}, |
159 |
ilayer => {name => 2, visibility => 2}, |
160 |
img => {align => 2, alt => 1, border => 1, copyright => 2, |
161 |
galleryimg => 2, localsrc => 2, name => 2, |
162 |
naturalsizeflag => 2, nosave => 2}, |
163 |
input => {accept => 2, 'accept-charset' => 2, autocomplete => 2, |
164 |
autosave => 2, emptyok => 2, enctype => 2, format => 1, |
165 |
inputmode => 2, istyle => 1, localsrc => 2, method => 2, |
166 |
mode => 2, name => 2, placeholder => 1, |
167 |
results => 1, target => 2, type => 2}, |
168 |
ins => {datetime => 1}, |
169 |
layer => {name => 2, visibility => 2}, |
170 |
li => {type => 1, value => 1}, |
171 |
link => {charset => 2, hreflang => 2, media => 2, |
172 |
rel => 2, rev => 2, target => 2, type => 2, |
173 |
xmlns => 1}, |
174 |
marquee => {align => 2, behavior => 2, direction => 2, loop => 1}, |
175 |
map => {name => 2}, |
176 |
meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2, |
177 |
url => 1}, |
178 |
object => {align => 2, classid => 2, |
179 |
codebase => 1, codetype => 2, copyright => 2, |
180 |
name => 2, standby => 1, type => 2}, |
181 |
ol => {start => 1, type => 1}, |
182 |
option => {mode => 2}, |
183 |
p => {align => 2, mode => 2, wrap => 1}, |
184 |
param => {name => 2, valuetype => 2}, |
185 |
pre => {wrap => 1}, |
186 |
rt => {rbspan => 1}, |
187 |
script => {charset => 2, event => 1, for => 1, type => 2}, |
188 |
select => {name => 2}, |
189 |
spacer => {type => 2}, |
190 |
style => {media => 2, type => 2}, |
191 |
table => {align => 2, border => 1, frame => 2, noborder => 1, |
192 |
summary => 1}, |
193 |
tbody => {align => 2, valign => 2}, |
194 |
td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, |
195 |
rowspan => 1, scope => 2, valign => 2}, |
196 |
textarea => {autocomplete => 2, istyle => 2, mode => 2, |
197 |
name => 2, wrap => 2}, |
198 |
tfoot => {align => 2, valign => 2}, |
199 |
th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, |
200 |
rowspan => 1, scope => 2, valign => 2}, |
201 |
thead => {align => 2, valign => 2}, |
202 |
ul => {type => 2}, |
203 |
xml => {charset => 2}, |
204 |
}->{$token->{tag_name}}->{$attr_name}) { |
205 |
$value->{$token->{tag_name}}->{$attr_name} |
206 |
->{$token->{attributes}->{$attr_name}->{value}}++; |
207 |
$value2->{$token->{tag_name}}->{$attr_name} |
208 |
->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; |
209 |
} elsif (my $v = { |
210 |
accesskey => 2, |
211 |
class => 2, |
212 |
dir => 2, |
213 |
id => 2, |
214 |
lang => 2, |
215 |
language => 2, |
216 |
role => 1, |
217 |
tabindex => 1, |
218 |
'xml:lang' => 2, |
219 |
}->{$attr_name}) { |
220 |
$value->{'*'}->{$attr_name} |
221 |
->{$token->{attributes}->{$attr_name}->{value}}++; |
222 |
$value2->{'*'}->{$attr_name} |
223 |
->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; |
224 |
} |
225 |
} |
226 |
} elsif ($token->{type} eq 'end tag') { |
227 |
$end_tag_name->{$token->{tag_name}}++; |
228 |
} |
229 |
} |
230 |
|
231 |
for my $tag_name (keys %$start_tag_name) { |
232 |
if ($start_tag_name->{$tag_name}) { |
233 |
$Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name}; |
234 |
$Page->{start_tag}->{$tag_name}++; |
235 |
} |
236 |
|
237 |
for my $attr_name (keys %{$attr->{$tag_name} or {}}) { |
238 |
if ($attr->{$tag_name}->{$attr_name}) { |
239 |
$Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name}; |
240 |
$Page->{attr}->{$tag_name}->{$attr_name}++; |
241 |
} |
242 |
|
243 |
for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) { |
244 |
$Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value}; |
245 |
$Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++; |
246 |
} |
247 |
|
248 |
for my $attr_value (keys %{$value2->{$tag_name}->{$attr_name} or {}}) { |
249 |
$Total->{value2}->{$tag_name}->{$attr_name}->{$attr_value} |
250 |
+= $value2->{$tag_name}->{$attr_name}->{$attr_value}; |
251 |
$Page->{value2}->{$tag_name}->{$attr_name}->{$attr_value}++; |
252 |
} |
253 |
} |
254 |
} |
255 |
|
256 |
for (keys %$end_tag_name) { |
257 |
if ($end_tag_name->{$_}) { |
258 |
$Total->{end_tag}->{$_} += $end_tag_name->{$_}; |
259 |
$Page->{end_tag}->{$_}++; |
260 |
} |
261 |
} |
262 |
} # tokenize |
263 |
|
264 |
sub print_result () { |
265 |
delete $SIG{INT}; |
266 |
use Data::Dumper; |
267 |
$Data::Dumper::Sortkeys = 1; |
268 |
print Dumper ([$Page, $Total]); |
269 |
exit; |
270 |
} # print_result |
271 |
|
272 |
=head1 AUTHOR |
273 |
|
274 |
Wakaba <w@suika.fam.cx>. |
275 |
|
276 |
=head1 LICENSE |
277 |
|
278 |
Copyright 2007 Wakaba <w@suika.fam.cx> |
279 |
|
280 |
This library is free software; you can redistribute it |
281 |
and/or modify it under the same terms as Perl itself. |
282 |
|
283 |
=cut |
284 |
|
285 |
1; |
286 |
## $Date: 2007/06/06 14:47:59 $ |
287 |
|