/[suikacvs]/webroot/www/ja1200/stat/htmltokenize.pl
Suika

Contents of /webroot/www/ja1200/stat/htmltokenize.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Tue Jun 5 14:18:40 2007 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +79 -58 lines
File MIME type: text/plain
New

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4     use lib qw[/home/httpd/html/www/markup/html/whatpm/];
5    
6     use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/;
7     use Whatpm::HTML;
8    
9     our $target = shift;
10     our $code = sub {
11     my $entity = shift;
12     eval {
13     my $data = Encode::decode ('Guess', $entity->{body});
14     tokenize ($data);
15     };
16     };
17    
18     my $Total;
19     my $Page;
20    
21     $SIG{INT} = \&print_result;
22    
23     require 'foreach.pl';
24    
25     print_result ();
26    
27     sub tokenize ($) {
28     my $s = \($_[0]);
29     my $p = Whatpm::HTML->new;
30     my $i = 0;
31     $p->{set_next_input_character} = sub {
32     my $self = shift;
33     $self->{next_input_character} = -1 and return if $i >= length $$s;
34     $self->{next_input_character} = ord substr $$s, $i++, 1;
35    
36     if ($self->{next_input_character} == 0x000D) { # CR
37     if ($i >= length $$s) {
38     #
39     } else {
40     my $next_char = ord substr $$s, $i++, 1;
41     if ($next_char == 0x000A) { # LF
42     #
43     } else {
44     push @{$self->{char}}, $next_char;
45     }
46     }
47     $self->{next_input_character} = 0x000A; # LF # MUST
48     } elsif ($self->{next_input_character} > 0x10FFFF) {
49     $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
50     } elsif ($self->{next_input_character} == 0x0000) { # NULL
51     $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
52     }
53     };
54    
55     my @token;
56     $p->{parse_error} = sub {
57     push @token, 'ParseError';
58     };
59    
60     $p->_initialize_tokenizer;
61    
62     my $start_tag_name = {};
63     my $end_tag_name = {};
64     my $attr;
65     my $value;
66 wakaba 1.2 my $value2; # case insensitive
67 wakaba 1.1
68     while (1) {
69     my $token = $p->_get_next_token;
70     last if $token->{type} eq 'end-of-file';
71    
72     if ($token->{type} eq 'start tag') {
73     if ({
74     title => 1,
75     textarea => 1,
76     }->{$token->{tag_name}}) {
77     $p->{content_model_flag} = 'RCDATA';
78     } elsif ({
79     style => 1,
80     script => 1,
81     xmp => 1,
82     noframes => 1,
83     noembed => 1,
84     noscript => 1,
85     iframe => 1,
86     }->{$token->{tag_name}}) {
87     $p->{content_model_flag} = 'CDATA';
88     } elsif ($token->{tag_name} eq 'plaintext') {
89     $p->{content_model_flag} = 'PLAINTEXT';
90     }
91     $p->{last_emitted_start_tag_name} = $token->{tag_name};
92     }
93    
94     if ($token->{type} eq 'start tag') {
95     $start_tag_name->{$token->{tag_name}}++;
96     for my $attr_name (keys %{$token->{attributes}}) {
97     $attr->{$token->{tag_name}}->{$attr_name}++;
98 wakaba 1.2 if (my $v = {
99     a => {charset => 2, hreflang => 2, media => 2, name => 2,
100     rel => 2, rev => 2, shape => 2, target => 2, type => 2},
101     applet => {align => 2},
102     area => {charset => 2, hreflang => 2, media => 2,
103     rel => 2, rev => 2, shape => 2, target => 2, type => 2},
104     base => {target => 2},
105     basefont => {color => 2, face => 2, size => 1},
106 wakaba 1.1 bgsound => {loop => 1},
107 wakaba 1.2 body => {bgproperties => 2, scroll => 2},
108     button => {type => 2},
109     caption => {align => 2},
110     col => {align => 2, span => 1, valign => 2},
111     colgroup => {align => 2, span => 1, valign => 2},
112 wakaba 1.1 del => {datetime => 1},
113 wakaba 1.2 div => {align => 2},
114     embed => {align => 2, allowscriptaccess => 2, autostart => 2,
115     loop => 2, showcontrols => 2, type => 2},
116     font => {face => 2, size => 1},
117     form => {accept => 2, 'accept-charset' => 2, enctype => 2,
118     method => 2},
119     frame => {scrolling => 2},
120     h1 => {align => 2},
121     h2 => {align => 2},
122     h3 => {align => 2},
123     h4 => {align => 2},
124     h5 => {align => 2},
125     h6 => {align => 2},
126 wakaba 1.1 head => {profile => 1},
127 wakaba 1.2 hr => {align => 2},
128 wakaba 1.1 html => {version => 1, xmlns => 1},
129 wakaba 1.2 iframe => {align => 2, scrolling => 2},
130     ilayer => {visibility => 2},
131     img => {align => 2, border => 1},
132     input => {accept => 2, 'accept-charset' => 2, autocomplete => 2,
133     enctype => 2, inputmode => 2, istyle => 1, method => 2,
134     target => 2, type => 2},
135 wakaba 1.1 ins => {datetime => 1},
136 wakaba 1.2 layer => {visibility => 2},
137 wakaba 1.1 li => {type => 1, value => 1},
138 wakaba 1.2 link => {charset => 2, hreflang => 2, media => 2,
139     rel => 2, rev => 2, target => 2, type => 2,
140 wakaba 1.1 xmlns => 1},
141 wakaba 1.2 marquee => {align => 2, behavior => 2, direction => 2, loop => 1},
142     meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2,
143 wakaba 1.1 url => 1},
144 wakaba 1.2 object => {align => 2, classid => 1,
145     codetype => 2, standby => 1, type => 2},
146 wakaba 1.1 ol => {start => 1, type => 1},
147 wakaba 1.2 p => {align => 2, wrap => 1},
148     param => {valuetype => 2},
149 wakaba 1.1 pre => {wrap => 1},
150 wakaba 1.2 rt => {rbspan => 1},
151     script => {charset => 2, event => 1, for => 1, type => 2},
152     spacer => {type => 2},
153     style => {media => 2, type => 2},
154     table => {align => 2, border => 1, frame => 2, noborder => 1,
155     summary => 1},
156     tbody => {align => 2, valign => 2},
157     td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
158     rowspan => 1, scope => 2, valign => 2},
159     textarea => {autocomplete => 2, wrap => 1},
160     tfoot => {align => 2, valign => 2},
161     th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
162     rowspan => 1, scope => 2, valign => 2},
163     thead => {align => 2, valign => 2},
164 wakaba 1.1 ul => {type => 1},
165 wakaba 1.2 xml => {charset => 2},
166 wakaba 1.1 }->{$token->{tag_name}}->{$attr_name}) {
167     $value->{$token->{tag_name}}->{$attr_name}
168     ->{$token->{attributes}->{$attr_name}->{value}}++;
169 wakaba 1.2 $value2->{$token->{tag_name}}->{$attr_name}
170     ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
171     } elsif (my $v = {
172     accesskey => 2,
173     class => 2,
174     dir => 2,
175     id => 2,
176     lang => 2,
177     language => 2,
178 wakaba 1.1 role => 1,
179     tabindex => 1,
180 wakaba 1.2 'xml:lang' => 2,
181 wakaba 1.1 }->{$attr_name}) {
182     $value->{'*'}->{$attr_name}
183     ->{$token->{attributes}->{$attr_name}->{value}}++;
184 wakaba 1.2 $value2->{'*'}->{$attr_name}
185     ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
186 wakaba 1.1 }
187     }
188     } elsif ($token->{type} eq 'end tag') {
189     $end_tag_name->{$token->{tag_name}}++;
190     }
191     }
192    
193     for my $tag_name (keys %$start_tag_name) {
194     if ($start_tag_name->{$tag_name}) {
195     $Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name};
196     $Page->{start_tag}->{$tag_name}++;
197     }
198    
199     for my $attr_name (keys %{$attr->{$tag_name} or {}}) {
200     if ($attr->{$tag_name}->{$attr_name}) {
201     $Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name};
202     $Page->{attr}->{$tag_name}->{$attr_name}++;
203     }
204    
205     for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) {
206     $Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value};
207     $Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++;
208     }
209     }
210     }
211    
212     for my $attr_name (keys %{$value->{'*'} or {}}) {
213     for my $attr_value (keys %{$value->{'*'}->{$attr_name} or {}}) {
214     $Total->{value}->{'*'}->{$attr_name}->{$attr_value}
215     += $value->{'*'}->{$attr_name}->{$attr_value};
216     $Page->{value}->{'*'}->{$attr_name}->{$attr_value}++;
217     }
218     }
219 wakaba 1.2
220     for my $attr_name (keys %{$value2->{'*'} or {}}) {
221     for my $attr_value (keys %{$value2->{'*'}->{$attr_name} or {}}) {
222     $Total->{value2}->{'*'}->{$attr_name}->{$attr_value}
223     += $value2->{'*'}->{$attr_name}->{$attr_value};
224     $Page->{value2}->{'*'}->{$attr_name}->{$attr_value}++;
225     }
226     }
227 wakaba 1.1
228     for (keys %$end_tag_name) {
229     if ($end_tag_name->{$_}) {
230     $Total->{end_tag}->{$_} += $end_tag_name->{$_};
231     $Page->{end_tag}->{$_}++;
232     }
233     }
234     } # tokenize
235    
236     sub print_result () {
237     delete $SIG{INT};
238     use Data::Dumper;
239     $Data::Dumper::Sortkeys = 1;
240     print Dumper ([$Page, $Total]);
241     exit;
242     } # print_result
243    
244     =head1 AUTHOR
245    
246     Wakaba <w@suika.fam.cx>.
247    
248     =head1 LICENSE
249    
250     Copyright 2007 Wakaba <w@suika.fam.cx>
251    
252     This library is free software; you can redistribute it
253     and/or modify it under the same terms as Perl itself.
254    
255     =cut
256    
257     1;
258 wakaba 1.2 ## $Date: 2007/06/02 12:12:28 $
259 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24