/[suikacvs]/webroot/www/ja1200/stat/htmltokenize.pl
Suika

Contents of /webroot/www/ja1200/stat/htmltokenize.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Wed Jun 6 14:47:59 2007 UTC (17 years, 5 months ago) by wakaba
Branch: MAIN
Changes since 1.2: +27 -33 lines
File MIME type: text/plain
Case-insensitive attribute value mode

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4     use lib qw[/home/httpd/html/www/markup/html/whatpm/];
5    
6     use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/;
7     use Whatpm::HTML;
8    
9     our $target = shift;
10     our $code = sub {
11     my $entity = shift;
12     eval {
13     my $data = Encode::decode ('Guess', $entity->{body});
14     tokenize ($data);
15     };
16     };
17    
18     my $Total;
19     my $Page;
20    
21     $SIG{INT} = \&print_result;
22    
23     require 'foreach.pl';
24    
25     print_result ();
26    
27     sub tokenize ($) {
28     my $s = \($_[0]);
29     my $p = Whatpm::HTML->new;
30     my $i = 0;
31     $p->{set_next_input_character} = sub {
32     my $self = shift;
33     $self->{next_input_character} = -1 and return if $i >= length $$s;
34     $self->{next_input_character} = ord substr $$s, $i++, 1;
35    
36     if ($self->{next_input_character} == 0x000D) { # CR
37     if ($i >= length $$s) {
38     #
39     } else {
40     my $next_char = ord substr $$s, $i++, 1;
41     if ($next_char == 0x000A) { # LF
42     #
43     } else {
44     push @{$self->{char}}, $next_char;
45     }
46     }
47     $self->{next_input_character} = 0x000A; # LF # MUST
48     } elsif ($self->{next_input_character} > 0x10FFFF) {
49     $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
50     } elsif ($self->{next_input_character} == 0x0000) { # NULL
51     $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
52     }
53     };
54    
55     my @token;
56     $p->{parse_error} = sub {
57     push @token, 'ParseError';
58     };
59    
60     $p->_initialize_tokenizer;
61    
62     my $start_tag_name = {};
63     my $end_tag_name = {};
64     my $attr;
65     my $value;
66 wakaba 1.2 my $value2; # case insensitive
67 wakaba 1.1
68     while (1) {
69     my $token = $p->_get_next_token;
70     last if $token->{type} eq 'end-of-file';
71    
72     if ($token->{type} eq 'start tag') {
73     if ({
74     title => 1,
75     textarea => 1,
76     }->{$token->{tag_name}}) {
77     $p->{content_model_flag} = 'RCDATA';
78     } elsif ({
79     style => 1,
80     script => 1,
81     xmp => 1,
82     noframes => 1,
83     noembed => 1,
84     noscript => 1,
85     iframe => 1,
86     }->{$token->{tag_name}}) {
87     $p->{content_model_flag} = 'CDATA';
88     } elsif ($token->{tag_name} eq 'plaintext') {
89     $p->{content_model_flag} = 'PLAINTEXT';
90     }
91     $p->{last_emitted_start_tag_name} = $token->{tag_name};
92     }
93    
94     if ($token->{type} eq 'start tag') {
95     $start_tag_name->{$token->{tag_name}}++;
96 wakaba 1.3 $start_tag_name->{'*'}++;
97 wakaba 1.1 for my $attr_name (keys %{$token->{attributes}}) {
98     $attr->{$token->{tag_name}}->{$attr_name}++;
99 wakaba 1.3 $attr->{'*'}->{$attr_name}++;
100 wakaba 1.2 if (my $v = {
101     a => {charset => 2, hreflang => 2, media => 2, name => 2,
102     rel => 2, rev => 2, shape => 2, target => 2, type => 2},
103 wakaba 1.3 applet => {align => 2, name => 2},
104     area => {charset => 2, hreflang => 2, media => 2, name => 2,
105 wakaba 1.2 rel => 2, rev => 2, shape => 2, target => 2, type => 2},
106     base => {target => 2},
107     basefont => {color => 2, face => 2, size => 1},
108 wakaba 1.1 bgsound => {loop => 1},
109 wakaba 1.2 body => {bgproperties => 2, scroll => 2},
110 wakaba 1.3 button => {name => 2, type => 2},
111 wakaba 1.2 caption => {align => 2},
112     col => {align => 2, span => 1, valign => 2},
113     colgroup => {align => 2, span => 1, valign => 2},
114 wakaba 1.1 del => {datetime => 1},
115 wakaba 1.2 div => {align => 2},
116     embed => {align => 2, allowscriptaccess => 2, autostart => 2,
117 wakaba 1.3 loop => 2, name => 2, showcontrols => 2, type => 2},
118 wakaba 1.2 font => {face => 2, size => 1},
119     form => {accept => 2, 'accept-charset' => 2, enctype => 2,
120 wakaba 1.3 method => 2, name => 2},
121     frame => {name => 2, scrolling => 2},
122 wakaba 1.2 h1 => {align => 2},
123     h2 => {align => 2},
124     h3 => {align => 2},
125     h4 => {align => 2},
126     h5 => {align => 2},
127     h6 => {align => 2},
128 wakaba 1.1 head => {profile => 1},
129 wakaba 1.2 hr => {align => 2},
130 wakaba 1.1 html => {version => 1, xmlns => 1},
131 wakaba 1.3 iframe => {align => 2, name => 2, scrolling => 2},
132     ilayer => {name => 2, visibility => 2},
133     img => {align => 2, alt => 1, border => 1, name => 2},
134 wakaba 1.2 input => {accept => 2, 'accept-charset' => 2, autocomplete => 2,
135     enctype => 2, inputmode => 2, istyle => 1, method => 2,
136 wakaba 1.3 name => 2, target => 2, type => 2},
137 wakaba 1.1 ins => {datetime => 1},
138 wakaba 1.3 layer => {name => 2, visibility => 2},
139 wakaba 1.1 li => {type => 1, value => 1},
140 wakaba 1.2 link => {charset => 2, hreflang => 2, media => 2,
141     rel => 2, rev => 2, target => 2, type => 2,
142 wakaba 1.1 xmlns => 1},
143 wakaba 1.2 marquee => {align => 2, behavior => 2, direction => 2, loop => 1},
144 wakaba 1.3 map => {name => 2},
145 wakaba 1.2 meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2,
146 wakaba 1.1 url => 1},
147 wakaba 1.3 object => {align => 2, classid => 2,
148     codetype => 2, name => 2, standby => 1, type => 2},
149 wakaba 1.1 ol => {start => 1, type => 1},
150 wakaba 1.2 p => {align => 2, wrap => 1},
151 wakaba 1.3 param => {name => 2, valuetype => 2},
152 wakaba 1.1 pre => {wrap => 1},
153 wakaba 1.2 rt => {rbspan => 1},
154     script => {charset => 2, event => 1, for => 1, type => 2},
155 wakaba 1.3 select => {name => 2},
156 wakaba 1.2 spacer => {type => 2},
157     style => {media => 2, type => 2},
158     table => {align => 2, border => 1, frame => 2, noborder => 1,
159     summary => 1},
160     tbody => {align => 2, valign => 2},
161     td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
162     rowspan => 1, scope => 2, valign => 2},
163 wakaba 1.3 textarea => {autocomplete => 2, name => 2, wrap => 1},
164 wakaba 1.2 tfoot => {align => 2, valign => 2},
165     th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1,
166     rowspan => 1, scope => 2, valign => 2},
167     thead => {align => 2, valign => 2},
168 wakaba 1.3 ul => {type => 2},
169 wakaba 1.2 xml => {charset => 2},
170 wakaba 1.1 }->{$token->{tag_name}}->{$attr_name}) {
171     $value->{$token->{tag_name}}->{$attr_name}
172     ->{$token->{attributes}->{$attr_name}->{value}}++;
173 wakaba 1.2 $value2->{$token->{tag_name}}->{$attr_name}
174     ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
175     } elsif (my $v = {
176     accesskey => 2,
177     class => 2,
178     dir => 2,
179     id => 2,
180     lang => 2,
181     language => 2,
182 wakaba 1.1 role => 1,
183     tabindex => 1,
184 wakaba 1.2 'xml:lang' => 2,
185 wakaba 1.1 }->{$attr_name}) {
186     $value->{'*'}->{$attr_name}
187     ->{$token->{attributes}->{$attr_name}->{value}}++;
188 wakaba 1.2 $value2->{'*'}->{$attr_name}
189     ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1;
190 wakaba 1.1 }
191     }
192     } elsif ($token->{type} eq 'end tag') {
193     $end_tag_name->{$token->{tag_name}}++;
194     }
195     }
196    
197     for my $tag_name (keys %$start_tag_name) {
198     if ($start_tag_name->{$tag_name}) {
199     $Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name};
200     $Page->{start_tag}->{$tag_name}++;
201     }
202    
203     for my $attr_name (keys %{$attr->{$tag_name} or {}}) {
204     if ($attr->{$tag_name}->{$attr_name}) {
205     $Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name};
206     $Page->{attr}->{$tag_name}->{$attr_name}++;
207     }
208    
209     for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) {
210     $Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value};
211     $Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++;
212     }
213 wakaba 1.3
214     for my $attr_value (keys %{$value2->{$tag_name}->{$attr_name} or {}}) {
215     $Total->{value2}->{$tag_name}->{$attr_name}->{$attr_value}
216     += $value2->{$tag_name}->{$attr_name}->{$attr_value};
217     $Page->{value2}->{$tag_name}->{$attr_name}->{$attr_value}++;
218     }
219 wakaba 1.2 }
220     }
221 wakaba 1.1
222     for (keys %$end_tag_name) {
223     if ($end_tag_name->{$_}) {
224     $Total->{end_tag}->{$_} += $end_tag_name->{$_};
225     $Page->{end_tag}->{$_}++;
226     }
227     }
228     } # tokenize
229    
230     sub print_result () {
231     delete $SIG{INT};
232     use Data::Dumper;
233     $Data::Dumper::Sortkeys = 1;
234     print Dumper ([$Page, $Total]);
235     exit;
236     } # print_result
237    
238     =head1 AUTHOR
239    
240     Wakaba <w@suika.fam.cx>.
241    
242     =head1 LICENSE
243    
244     Copyright 2007 Wakaba <w@suika.fam.cx>
245    
246     This library is free software; you can redistribute it
247     and/or modify it under the same terms as Perl itself.
248    
249     =cut
250    
251     1;
252 wakaba 1.3 ## $Date: 2007/06/05 14:18:40 $
253 wakaba 1.1

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24