/[suikacvs]/webroot/www/ja1200/stat/htmltokenize.pl
Suika

Contents of /webroot/www/ja1200/stat/htmltokenize.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download)
Sat Jul 21 05:26:48 2007 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.3: +53 -19 lines
File MIME type: text/plain
Whatpm::HTML content model flag syntax has been changed

1 #!/usr/bin/perl
2 use strict;
3
4 use lib qw[/home/httpd/html/www/markup/html/whatpm/];
5
6 use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/;
7 use Whatpm::HTML;
8
9 our $target = shift;
10 our $code = sub {
11 my ($entity, $file_name) = @_;
12 my $http_charset;
13 if ($entity->{field}->{'content-type'}) {
14 if ($entity->{field}->{'content-type'}->[0] =~ /charset\s*=\s*([^\s;]+)/i) {
15 my $charset = $1;
16 if ($charset =~ /euc/i) {
17 $http_charset = 'euc-jp';
18 } elsif ($charset =~ /shift/i or $charset =~ /sjis/i or
19 $charset =~ /ms932/i or $charset =~ /cp943/i or
20 $charset =~ /windows-31j/i) {
21 $http_charset = 'shift_jis';
22 } elsif ($charset =~ /none/i or $charset =~ /unknown/i or
23 $charset =~ /jp/i or $charset eq '0') {
24 $http_charset = 'shift_jis' if $target =~ /keitai/i;
25 } else {
26 $http_charset = $charset;
27 }
28 } else {
29 $http_charset = 'shift_jis' if $target =~ /keitai/;
30 }
31 }
32 my $decoder = $http_charset
33 ? Encode::find_encoding ($http_charset)
34 : Encode::Guess->guess ($entity->{body});
35 unless (ref $decoder) {
36 warn "$0: $file_name: $decoder\n";
37 $decoder = Encode::find_encoding ('shiftjis');
38 }
39 my $data = $decoder->decode ($entity->{body}, 0);
40 tokenize ($data);
41 };
42
43 my $Total;
44 my $Page;
45
46 $SIG{INT} = \&print_result;
47
48 require 'foreach.pl';
49
50 print_result ();
51
52 sub tokenize ($) {
53 my $s = \($_[0]);
54 my $p = Whatpm::HTML->new;
55 my $i = 0;
56 $p->{set_next_input_character} = sub {
57 my $self = shift;
58 $self->{next_input_character} = -1 and return if $i >= length $$s;
59 $self->{next_input_character} = ord substr $$s, $i++, 1;
60
61 if ($self->{next_input_character} == 0x000D) { # CR
62 if ($i >= length $$s) {
63 #
64 } else {
65 my $next_char = ord substr $$s, $i++, 1;
66 if ($next_char == 0x000A) { # LF
67 #
68 } else {
69 push @{$self->{char}}, $next_char;
70 }
71 }
72 $self->{next_input_character} = 0x000A; # LF # MUST
73 } elsif ($self->{next_input_character} > 0x10FFFF) {
74 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
75 } elsif ($self->{next_input_character} == 0x0000) { # NULL
76 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
77 }
78 };
79
80 my @token;
81 $p->{parse_error} = sub {
82 push @token, 'ParseError';
83 };
84
85 $p->_initialize_tokenizer;
86
87 my $start_tag_name = {};
88 my $end_tag_name = {};
89 my $attr;
90 my $value;
91 my $value2; # case insensitive
92
93 while (1) {
94 my $token = $p->_get_next_token;
95 last if $token->{type} eq 'end-of-file';
96
97 if ($token->{type} eq 'start tag') {
98 if ({
99 title => 1,
100 textarea => 1,
101 }->{$token->{tag_name}}) {
102 $p->{content_model} = Whatpm::HTML::RCDATA_CONTENT_MODEL ();
103 } elsif ({
104 style => 1,
105 script => 1,
106 xmp => 1,
107 noframes => 1,
108 noembed => 1,
109 noscript => 1,
110 iframe => 1,
111 }->{$token->{tag_name}}) {
112 $p->{content_model} = Whatpm::HTML::CDATA_CONTENT_MODEL ();
113 } elsif ($token->{tag_name} eq 'plaintext') {
114 $p->{content_model} = Whatpm::HTML::PLAINTEXT_CONTENT_MODEL ();
115 }
116 $p->{last_emitted_start_tag_name} = $token->{tag_name};
117 }
118
119 if ($token->{type} eq 'start tag') {
120 $start_tag_name->{$token->{tag_name}}++;
121 $start_tag_name->{'*'}++;
122 for my $attr_name (keys %{$token->{attributes}}) {
123 $attr->{$token->{tag_name}}->{$attr_name}++;
124 $attr->{'*'}->{$attr_name}++;
125 if (my $v = {
126 a => {charset => 2, directkey => 1, hreflang => 2,
127 ijam => 1, media => 2, name => 2,
128 rel => 2, rev => 2, shape => 2, target => 2, type => 2},
129 applet => {align => 2, name => 2},
130 area => {charset => 2, hreflang => 2, media => 2, name => 2,
131 rel => 2, rev => 2, shape => 2, target => 2, type => 2},
132 base => {target => 2},
133 basefont => {color => 2, face => 2, size => 1},
134 bgsound => {loop => 1},
135 body => {bgproperties => 2, scroll => 2},
136 button => {name => 2, type => 2},
137 caption => {align => 2},
138 col => {align => 2, span => 1, valign => 2},
139 colgroup => {align => 2, span => 1, valign => 2},
140 del => {datetime => 1},
141 div => {align => 2, mode => 2},
142 embed => {align => 2, allowscriptaccess => 2, autostart => 2,
143 loop => 2, name => 2, quality => 2,
144 showcontrols => 2, type => 2},
145 font => {face => 2, size => 1},
146 form => {accept => 2, 'accept-charset' => 2, enctype => 2,
147 method => 2, name => 2},
148 frame => {name => 2, scrolling => 2},
149 h1 => {align => 2},
150 h2 => {align => 2},
151 h3 => {align => 2},
152 h4 => {align => 2},
153 h5 => {align => 2},
154 h6 => {align => 2},
155 head => {profile => 1},
156 hr => {align => 2},
157 html => {version => 1, xmlns => 1},
158 iframe => {align => 2, name => 2, scrolling => 2},
159 ilayer => {name => 2, visibility => 2},
160 img => {align => 2, alt => 1, border => 1, copyright => 2,
161 galleryimg => 2, localsrc => 2, name => 2,
162 naturalsizeflag => 2, nosave => 2},
163 input => {accept => 2, 'accept-charset' => 2, autocomplete => 2,
164 autosave => 2, emptyok => 2, enctype => 2, format => 1,
165 inputmode => 2, istyle => 1, localsrc => 2, method => 2,
166 mode => 2, name => 2, placeholder => 1,
167 results => 1, target => 2, type => 2},
168 ins => {datetime => 1},
169 layer => {name => 2, visibility => 2},
170 li => {type => 1, value => 1},
171 link => {charset => 2, hreflang => 2, media => 2,
172 rel => 2, rev => 2, target => 2, type => 2,
173 xmlns => 1},
174 marquee => {align => 2, behavior => 2, direction => 2, loop => 1},
175 map => {name => 2},
176 meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2,
177 url => 1},
178 object => {align => 2, classid => 2,
179 codebase => 1, codetype => 2, copyright => 2,
180 name => 2, standby => 1, type => 2},
181 ol => {start => 1, type => 1},
182 option => {mode => 2},
183 p => {align => 2, mode => 2, wrap => 1},
184 param => {name => 2, valuetype => 2},
185 pre => {wrap => 1},
186 rt => {rbspan => 1},
187 script => {charset => 2, event => 1, for => 1, type => 2},
188 select => {name => 2},
189 spacer => {type => 2},
190 style => {media => 2, type => 2},
191 table => {align => 2, border => 1, frame => 2, noborder => 1,
192 summary => 1},
193 tbody => {align => 2, valign => 2},
194 td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
195 rowspan => 1, scope => 2, valign => 2},
196 textarea => {autocomplete => 2, istyle => 2, mode => 2,
197 name => 2, wrap => 2},
198 tfoot => {align => 2, valign => 2},
199 th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
200 rowspan => 1, scope => 2, valign => 2},
201 thead => {align => 2, valign => 2},
202 ul => {type => 2},
203 xml => {charset => 2},
204 }->{$token->{tag_name}}->{$attr_name}) {
205 $value->{$token->{tag_name}}->{$attr_name}
206 ->{$token->{attributes}->{$attr_name}->{value}}++;
207 $value2->{$token->{tag_name}}->{$attr_name}
208 ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
209 } elsif (my $v = {
210 accesskey => 2,
211 class => 2,
212 dir => 2,
213 id => 2,
214 lang => 2,
215 language => 2,
216 role => 1,
217 tabindex => 1,
218 'xml:lang' => 2,
219 }->{$attr_name}) {
220 $value->{'*'}->{$attr_name}
221 ->{$token->{attributes}->{$attr_name}->{value}}++;
222 $value2->{'*'}->{$attr_name}
223 ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
224 }
225 }
226 } elsif ($token->{type} eq 'end tag') {
227 $end_tag_name->{$token->{tag_name}}++;
228 }
229 }
230
231 for my $tag_name (keys %$start_tag_name) {
232 if ($start_tag_name->{$tag_name}) {
233 $Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name};
234 $Page->{start_tag}->{$tag_name}++;
235 }
236
237 for my $attr_name (keys %{$attr->{$tag_name} or {}}) {
238 if ($attr->{$tag_name}->{$attr_name}) {
239 $Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name};
240 $Page->{attr}->{$tag_name}->{$attr_name}++;
241 }
242
243 for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) {
244 $Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value};
245 $Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++;
246 }
247
248 for my $attr_value (keys %{$value2->{$tag_name}->{$attr_name} or {}}) {
249 $Total->{value2}->{$tag_name}->{$attr_name}->{$attr_value}
250 += $value2->{$tag_name}->{$attr_name}->{$attr_value};
251 $Page->{value2}->{$tag_name}->{$attr_name}->{$attr_value}++;
252 }
253 }
254 }
255
256 for (keys %$end_tag_name) {
257 if ($end_tag_name->{$_}) {
258 $Total->{end_tag}->{$_} += $end_tag_name->{$_};
259 $Page->{end_tag}->{$_}++;
260 }
261 }
262 } # tokenize
263
264 sub print_result () {
265 delete $SIG{INT};
266 use Data::Dumper;
267 $Data::Dumper::Sortkeys = 1;
268 print Dumper ([$Page, $Total]);
269 exit;
270 } # print_result
271
272 =head1 AUTHOR
273
274 Wakaba <w@suika.fam.cx>.
275
276 =head1 LICENSE
277
278 Copyright 2007 Wakaba <w@suika.fam.cx>
279
280 This library is free software; you can redistribute it
281 and/or modify it under the same terms as Perl itself.
282
283 =cut
284
285 1;
286 ## $Date: 2007/06/06 14:47:59 $
287

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24