1 |
wakaba |
1.1 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
use lib qw[/home/httpd/html/www/markup/html/whatpm/]; |
5 |
|
|
|
6 |
|
|
use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/; |
7 |
|
|
use Whatpm::HTML; |
8 |
|
|
|
9 |
|
|
our $target = shift; |
10 |
|
|
our $code = sub { |
11 |
|
|
my $entity = shift; |
12 |
|
|
eval { |
13 |
|
|
my $data = Encode::decode ('Guess', $entity->{body}); |
14 |
|
|
tokenize ($data); |
15 |
|
|
}; |
16 |
|
|
}; |
17 |
|
|
|
18 |
|
|
my $Total; |
19 |
|
|
my $Page; |
20 |
|
|
|
21 |
|
|
$SIG{INT} = \&print_result; |
22 |
|
|
|
23 |
|
|
require 'foreach.pl'; |
24 |
|
|
|
25 |
|
|
print_result (); |
26 |
|
|
|
27 |
|
|
sub tokenize ($) { |
28 |
|
|
my $s = \($_[0]); |
29 |
|
|
my $p = Whatpm::HTML->new; |
30 |
|
|
my $i = 0; |
31 |
|
|
$p->{set_next_input_character} = sub { |
32 |
|
|
my $self = shift; |
33 |
|
|
$self->{next_input_character} = -1 and return if $i >= length $$s; |
34 |
|
|
$self->{next_input_character} = ord substr $$s, $i++, 1; |
35 |
|
|
|
36 |
|
|
if ($self->{next_input_character} == 0x000D) { # CR |
37 |
|
|
if ($i >= length $$s) { |
38 |
|
|
# |
39 |
|
|
} else { |
40 |
|
|
my $next_char = ord substr $$s, $i++, 1; |
41 |
|
|
if ($next_char == 0x000A) { # LF |
42 |
|
|
# |
43 |
|
|
} else { |
44 |
|
|
push @{$self->{char}}, $next_char; |
45 |
|
|
} |
46 |
|
|
} |
47 |
|
|
$self->{next_input_character} = 0x000A; # LF # MUST |
48 |
|
|
} elsif ($self->{next_input_character} > 0x10FFFF) { |
49 |
|
|
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
50 |
|
|
} elsif ($self->{next_input_character} == 0x0000) { # NULL |
51 |
|
|
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
52 |
|
|
} |
53 |
|
|
}; |
54 |
|
|
|
55 |
|
|
my @token; |
56 |
|
|
$p->{parse_error} = sub { |
57 |
|
|
push @token, 'ParseError'; |
58 |
|
|
}; |
59 |
|
|
|
60 |
|
|
$p->_initialize_tokenizer; |
61 |
|
|
|
62 |
|
|
my $start_tag_name = {}; |
63 |
|
|
my $end_tag_name = {}; |
64 |
|
|
my $attr; |
65 |
|
|
my $value; |
66 |
wakaba |
1.2 |
my $value2; # case insensitive |
67 |
wakaba |
1.1 |
|
68 |
|
|
while (1) { |
69 |
|
|
my $token = $p->_get_next_token; |
70 |
|
|
last if $token->{type} eq 'end-of-file'; |
71 |
|
|
|
72 |
|
|
if ($token->{type} eq 'start tag') { |
73 |
|
|
if ({ |
74 |
|
|
title => 1, |
75 |
|
|
textarea => 1, |
76 |
|
|
}->{$token->{tag_name}}) { |
77 |
|
|
$p->{content_model_flag} = 'RCDATA'; |
78 |
|
|
} elsif ({ |
79 |
|
|
style => 1, |
80 |
|
|
script => 1, |
81 |
|
|
xmp => 1, |
82 |
|
|
noframes => 1, |
83 |
|
|
noembed => 1, |
84 |
|
|
noscript => 1, |
85 |
|
|
iframe => 1, |
86 |
|
|
}->{$token->{tag_name}}) { |
87 |
|
|
$p->{content_model_flag} = 'CDATA'; |
88 |
|
|
} elsif ($token->{tag_name} eq 'plaintext') { |
89 |
|
|
$p->{content_model_flag} = 'PLAINTEXT'; |
90 |
|
|
} |
91 |
|
|
$p->{last_emitted_start_tag_name} = $token->{tag_name}; |
92 |
|
|
} |
93 |
|
|
|
94 |
|
|
if ($token->{type} eq 'start tag') { |
95 |
|
|
$start_tag_name->{$token->{tag_name}}++; |
96 |
wakaba |
1.3 |
$start_tag_name->{'*'}++; |
97 |
wakaba |
1.1 |
for my $attr_name (keys %{$token->{attributes}}) { |
98 |
|
|
$attr->{$token->{tag_name}}->{$attr_name}++; |
99 |
wakaba |
1.3 |
$attr->{'*'}->{$attr_name}++; |
100 |
wakaba |
1.2 |
if (my $v = { |
101 |
|
|
a => {charset => 2, hreflang => 2, media => 2, name => 2, |
102 |
|
|
rel => 2, rev => 2, shape => 2, target => 2, type => 2}, |
103 |
wakaba |
1.3 |
applet => {align => 2, name => 2}, |
104 |
|
|
area => {charset => 2, hreflang => 2, media => 2, name => 2, |
105 |
wakaba |
1.2 |
rel => 2, rev => 2, shape => 2, target => 2, type => 2}, |
106 |
|
|
base => {target => 2}, |
107 |
|
|
basefont => {color => 2, face => 2, size => 1}, |
108 |
wakaba |
1.1 |
bgsound => {loop => 1}, |
109 |
wakaba |
1.2 |
body => {bgproperties => 2, scroll => 2}, |
110 |
wakaba |
1.3 |
button => {name => 2, type => 2}, |
111 |
wakaba |
1.2 |
caption => {align => 2}, |
112 |
|
|
col => {align => 2, span => 1, valign => 2}, |
113 |
|
|
colgroup => {align => 2, span => 1, valign => 2}, |
114 |
wakaba |
1.1 |
del => {datetime => 1}, |
115 |
wakaba |
1.2 |
div => {align => 2}, |
116 |
|
|
embed => {align => 2, allowscriptaccess => 2, autostart => 2, |
117 |
wakaba |
1.3 |
loop => 2, name => 2, showcontrols => 2, type => 2}, |
118 |
wakaba |
1.2 |
font => {face => 2, size => 1}, |
119 |
|
|
form => {accept => 2, 'accept-charset' => 2, enctype => 2, |
120 |
wakaba |
1.3 |
method => 2, name => 2}, |
121 |
|
|
frame => {name => 2, scrolling => 2}, |
122 |
wakaba |
1.2 |
h1 => {align => 2}, |
123 |
|
|
h2 => {align => 2}, |
124 |
|
|
h3 => {align => 2}, |
125 |
|
|
h4 => {align => 2}, |
126 |
|
|
h5 => {align => 2}, |
127 |
|
|
h6 => {align => 2}, |
128 |
wakaba |
1.1 |
head => {profile => 1}, |
129 |
wakaba |
1.2 |
hr => {align => 2}, |
130 |
wakaba |
1.1 |
html => {version => 1, xmlns => 1}, |
131 |
wakaba |
1.3 |
iframe => {align => 2, name => 2, scrolling => 2}, |
132 |
|
|
ilayer => {name => 2, visibility => 2}, |
133 |
|
|
img => {align => 2, alt => 1, border => 1, name => 2}, |
134 |
wakaba |
1.2 |
input => {accept => 2, 'accept-charset' => 2, autocomplete => 2, |
135 |
|
|
enctype => 2, inputmode => 2, istyle => 1, method => 2, |
136 |
wakaba |
1.3 |
name => 2, target => 2, type => 2}, |
137 |
wakaba |
1.1 |
ins => {datetime => 1}, |
138 |
wakaba |
1.3 |
layer => {name => 2, visibility => 2}, |
139 |
wakaba |
1.1 |
li => {type => 1, value => 1}, |
140 |
wakaba |
1.2 |
link => {charset => 2, hreflang => 2, media => 2, |
141 |
|
|
rel => 2, rev => 2, target => 2, type => 2, |
142 |
wakaba |
1.1 |
xmlns => 1}, |
143 |
wakaba |
1.2 |
marquee => {align => 2, behavior => 2, direction => 2, loop => 1}, |
144 |
wakaba |
1.3 |
map => {name => 2}, |
145 |
wakaba |
1.2 |
meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2, |
146 |
wakaba |
1.1 |
url => 1}, |
147 |
wakaba |
1.3 |
object => {align => 2, classid => 2, |
148 |
|
|
codetype => 2, name => 2, standby => 1, type => 2}, |
149 |
wakaba |
1.1 |
ol => {start => 1, type => 1}, |
150 |
wakaba |
1.2 |
p => {align => 2, wrap => 1}, |
151 |
wakaba |
1.3 |
param => {name => 2, valuetype => 2}, |
152 |
wakaba |
1.1 |
pre => {wrap => 1}, |
153 |
wakaba |
1.2 |
rt => {rbspan => 1}, |
154 |
|
|
script => {charset => 2, event => 1, for => 1, type => 2}, |
155 |
wakaba |
1.3 |
select => {name => 2}, |
156 |
wakaba |
1.2 |
spacer => {type => 2}, |
157 |
|
|
style => {media => 2, type => 2}, |
158 |
|
|
table => {align => 2, border => 1, frame => 2, noborder => 1, |
159 |
|
|
summary => 1}, |
160 |
|
|
tbody => {align => 2, valign => 2}, |
161 |
|
|
td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, |
162 |
|
|
rowspan => 1, scope => 2, valign => 2}, |
163 |
wakaba |
1.3 |
textarea => {autocomplete => 2, name => 2, wrap => 1}, |
164 |
wakaba |
1.2 |
tfoot => {align => 2, valign => 2}, |
165 |
|
|
th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, |
166 |
|
|
rowspan => 1, scope => 2, valign => 2}, |
167 |
|
|
thead => {align => 2, valign => 2}, |
168 |
wakaba |
1.3 |
ul => {type => 2}, |
169 |
wakaba |
1.2 |
xml => {charset => 2}, |
170 |
wakaba |
1.1 |
}->{$token->{tag_name}}->{$attr_name}) { |
171 |
|
|
$value->{$token->{tag_name}}->{$attr_name} |
172 |
|
|
->{$token->{attributes}->{$attr_name}->{value}}++; |
173 |
wakaba |
1.2 |
$value2->{$token->{tag_name}}->{$attr_name} |
174 |
|
|
->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; |
175 |
|
|
} elsif (my $v = { |
176 |
|
|
accesskey => 2, |
177 |
|
|
class => 2, |
178 |
|
|
dir => 2, |
179 |
|
|
id => 2, |
180 |
|
|
lang => 2, |
181 |
|
|
language => 2, |
182 |
wakaba |
1.1 |
role => 1, |
183 |
|
|
tabindex => 1, |
184 |
wakaba |
1.2 |
'xml:lang' => 2, |
185 |
wakaba |
1.1 |
}->{$attr_name}) { |
186 |
|
|
$value->{'*'}->{$attr_name} |
187 |
|
|
->{$token->{attributes}->{$attr_name}->{value}}++; |
188 |
wakaba |
1.2 |
$value2->{'*'}->{$attr_name} |
189 |
|
|
->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; |
190 |
wakaba |
1.1 |
} |
191 |
|
|
} |
192 |
|
|
} elsif ($token->{type} eq 'end tag') { |
193 |
|
|
$end_tag_name->{$token->{tag_name}}++; |
194 |
|
|
} |
195 |
|
|
} |
196 |
|
|
|
197 |
|
|
for my $tag_name (keys %$start_tag_name) { |
198 |
|
|
if ($start_tag_name->{$tag_name}) { |
199 |
|
|
$Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name}; |
200 |
|
|
$Page->{start_tag}->{$tag_name}++; |
201 |
|
|
} |
202 |
|
|
|
203 |
|
|
for my $attr_name (keys %{$attr->{$tag_name} or {}}) { |
204 |
|
|
if ($attr->{$tag_name}->{$attr_name}) { |
205 |
|
|
$Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name}; |
206 |
|
|
$Page->{attr}->{$tag_name}->{$attr_name}++; |
207 |
|
|
} |
208 |
|
|
|
209 |
|
|
for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) { |
210 |
|
|
$Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value}; |
211 |
|
|
$Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++; |
212 |
|
|
} |
213 |
wakaba |
1.3 |
|
214 |
|
|
for my $attr_value (keys %{$value2->{$tag_name}->{$attr_name} or {}}) { |
215 |
|
|
$Total->{value2}->{$tag_name}->{$attr_name}->{$attr_value} |
216 |
|
|
+= $value2->{$tag_name}->{$attr_name}->{$attr_value}; |
217 |
|
|
$Page->{value2}->{$tag_name}->{$attr_name}->{$attr_value}++; |
218 |
|
|
} |
219 |
wakaba |
1.2 |
} |
220 |
|
|
} |
221 |
wakaba |
1.1 |
|
222 |
|
|
for (keys %$end_tag_name) { |
223 |
|
|
if ($end_tag_name->{$_}) { |
224 |
|
|
$Total->{end_tag}->{$_} += $end_tag_name->{$_}; |
225 |
|
|
$Page->{end_tag}->{$_}++; |
226 |
|
|
} |
227 |
|
|
} |
228 |
|
|
} # tokenize |
229 |
|
|
|
230 |
|
|
sub print_result () { |
231 |
|
|
delete $SIG{INT}; |
232 |
|
|
use Data::Dumper; |
233 |
|
|
$Data::Dumper::Sortkeys = 1; |
234 |
|
|
print Dumper ([$Page, $Total]); |
235 |
|
|
exit; |
236 |
|
|
} # print_result |
237 |
|
|
|
238 |
|
|
=head1 AUTHOR |
239 |
|
|
|
240 |
|
|
Wakaba <w@suika.fam.cx>. |
241 |
|
|
|
242 |
|
|
=head1 LICENSE |
243 |
|
|
|
244 |
|
|
Copyright 2007 Wakaba <w@suika.fam.cx> |
245 |
|
|
|
246 |
|
|
This library is free software; you can redistribute it |
247 |
|
|
and/or modify it under the same terms as Perl itself. |
248 |
|
|
|
249 |
|
|
=cut |
250 |
|
|
|
251 |
|
|
1; |
252 |
wakaba |
1.3 |
## $Date: 2007/06/05 14:18:40 $ |
253 |
wakaba |
1.1 |
|