#!/usr/bin/perl use strict; use lib qw[/home/httpd/html/www/markup/html/whatpm/]; use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/; use Whatpm::HTML; our $target = shift; our $code = sub { my ($entity, $file_name) = @_; my $http_charset; if ($entity->{field}->{'content-type'}) { if ($entity->{field}->{'content-type'}->[0] =~ /charset\s*=\s*([^\s;]+)/i) { my $charset = $1; if ($charset =~ /euc/i) { $http_charset = 'euc-jp'; } elsif ($charset =~ /shift/i or $charset =~ /sjis/i or $charset =~ /ms932/i or $charset =~ /cp943/i or $charset =~ /windows-31j/i) { $http_charset = 'shift_jis'; } elsif ($charset =~ /none/i or $charset =~ /unknown/i or $charset =~ /jp/i or $charset eq '0') { $http_charset = 'shift_jis' if $target =~ /keitai/i; } else { $http_charset = $charset; } } else { $http_charset = 'shift_jis' if $target =~ /keitai/; } } my $decoder = $http_charset ? Encode::find_encoding ($http_charset) : Encode::Guess->guess ($entity->{body}); unless (ref $decoder) { warn "$0: $file_name: $decoder\n"; $decoder = Encode::find_encoding ('shiftjis'); } my $data = $decoder->decode ($entity->{body}, 0); tokenize ($data); }; my $Total; my $Page; $SIG{INT} = \&print_result; require 'foreach.pl'; print_result (); sub tokenize ($) { my $s = \($_[0]); my $p = Whatpm::HTML->new; my $i = 0; $p->{set_next_input_character} = sub { my $self = shift; $self->{next_input_character} = -1 and return if $i >= length $$s; $self->{next_input_character} = ord substr $$s, $i++, 1; if ($self->{next_input_character} == 0x000D) { # CR if ($i >= length $$s) { # } else { my $next_char = ord substr $$s, $i++, 1; if ($next_char == 0x000A) { # LF # } else { push @{$self->{char}}, $next_char; } } $self->{next_input_character} = 0x000A; # LF # MUST } elsif ($self->{next_input_character} > 0x10FFFF) { $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST } elsif ($self->{next_input_character} == 0x0000) { # NULL $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST } }; my @token; $p->{parse_error} = sub { push @token, 'ParseError'; }; $p->_initialize_tokenizer; my $start_tag_name = {}; my $end_tag_name = {}; my $attr; my $value; my $value2; # case insensitive while (1) { my $token = $p->_get_next_token; last if $token->{type} eq 'end-of-file'; if ($token->{type} eq 'start tag') { if ({ title => 1, textarea => 1, }->{$token->{tag_name}}) { $p->{content_model} = Whatpm::HTML::RCDATA_CONTENT_MODEL (); } elsif ({ style => 1, script => 1, xmp => 1, noframes => 1, noembed => 1, noscript => 1, iframe => 1, }->{$token->{tag_name}}) { $p->{content_model} = Whatpm::HTML::CDATA_CONTENT_MODEL (); } elsif ($token->{tag_name} eq 'plaintext') { $p->{content_model} = Whatpm::HTML::PLAINTEXT_CONTENT_MODEL (); } $p->{last_emitted_start_tag_name} = $token->{tag_name}; } if ($token->{type} eq 'start tag') { $start_tag_name->{$token->{tag_name}}++; $start_tag_name->{'*'}++; for my $attr_name (keys %{$token->{attributes}}) { $attr->{$token->{tag_name}}->{$attr_name}++; $attr->{'*'}->{$attr_name}++; if (my $v = { a => {charset => 2, directkey => 1, hreflang => 2, ijam => 1, media => 2, name => 2, rel => 2, rev => 2, shape => 2, target => 2, type => 2}, applet => {align => 2, name => 2}, area => {charset => 2, hreflang => 2, media => 2, name => 2, rel => 2, rev => 2, shape => 2, target => 2, type => 2}, base => {target => 2}, basefont => {color => 2, face => 2, size => 1}, bgsound => {loop => 1}, body => {bgproperties => 2, scroll => 2}, button => {name => 2, type => 2}, caption => {align => 2}, col => {align => 2, span => 1, valign => 2}, colgroup => {align => 2, span => 1, valign => 2}, del => {datetime => 1}, div => {align => 2, mode => 2}, embed => {align => 2, allowscriptaccess => 2, autostart => 2, loop => 2, name => 2, quality => 2, showcontrols => 2, type => 2}, font => {face => 2, size => 1}, form => {accept => 2, 'accept-charset' => 2, enctype => 2, method => 2, name => 2}, frame => {name => 2, scrolling => 2}, h1 => {align => 2}, h2 => {align => 2}, h3 => {align => 2}, h4 => {align => 2}, h5 => {align => 2}, h6 => {align => 2}, head => {profile => 1}, hr => {align => 2}, html => {version => 1, xmlns => 1}, iframe => {align => 2, name => 2, scrolling => 2}, ilayer => {name => 2, visibility => 2}, img => {align => 2, alt => 1, border => 1, copyright => 2, galleryimg => 2, localsrc => 2, name => 2, naturalsizeflag => 2, nosave => 2}, input => {accept => 2, 'accept-charset' => 2, autocomplete => 2, autosave => 2, emptyok => 2, enctype => 2, format => 1, inputmode => 2, istyle => 1, localsrc => 2, method => 2, mode => 2, name => 2, placeholder => 1, results => 1, target => 2, type => 2}, ins => {datetime => 1}, layer => {name => 2, visibility => 2}, li => {type => 1, value => 1}, link => {charset => 2, hreflang => 2, media => 2, rel => 2, rev => 2, target => 2, type => 2, xmlns => 1}, marquee => {align => 2, behavior => 2, direction => 2, loop => 1}, map => {name => 2}, meta => {charset => 2, 'http-equiv' => 2, name => 2, scheme => 2, url => 1}, object => {align => 2, classid => 2, codebase => 1, codetype => 2, copyright => 2, name => 2, standby => 1, type => 2}, ol => {start => 1, type => 1}, option => {mode => 2}, p => {align => 2, mode => 2, wrap => 1}, param => {name => 2, valuetype => 2}, pre => {wrap => 1}, rt => {rbspan => 1}, script => {charset => 2, event => 1, for => 1, type => 2}, select => {name => 2}, spacer => {type => 2}, style => {media => 2, type => 2}, table => {align => 2, border => 1, frame => 2, noborder => 1, summary => 1}, tbody => {align => 2, valign => 2}, td => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, rowspan => 1, scope => 2, valign => 2}, textarea => {autocomplete => 2, istyle => 2, mode => 2, name => 2, wrap => 2}, tfoot => {align => 2, valign => 2}, th => {abbr => 1, align => 2, axis => 1, colspan => 1, headers => 1, rowspan => 1, scope => 2, valign => 2}, thead => {align => 2, valign => 2}, ul => {type => 2}, xml => {charset => 2}, }->{$token->{tag_name}}->{$attr_name}) { $value->{$token->{tag_name}}->{$attr_name} ->{$token->{attributes}->{$attr_name}->{value}}++; $value2->{$token->{tag_name}}->{$attr_name} ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; } elsif (my $v = { accesskey => 2, class => 2, dir => 2, id => 2, lang => 2, language => 2, role => 1, tabindex => 1, 'xml:lang' => 2, }->{$attr_name}) { $value->{'*'}->{$attr_name} ->{$token->{attributes}->{$attr_name}->{value}}++; $value2->{'*'}->{$attr_name} ->{lc $token->{attributes}->{$attr_name}->{value}}++ if $v > 1; } } } elsif ($token->{type} eq 'end tag') { $end_tag_name->{$token->{tag_name}}++; } } for my $tag_name (keys %$start_tag_name) { if ($start_tag_name->{$tag_name}) { $Total->{start_tag}->{$tag_name} += $start_tag_name->{$tag_name}; $Page->{start_tag}->{$tag_name}++; } for my $attr_name (keys %{$attr->{$tag_name} or {}}) { if ($attr->{$tag_name}->{$attr_name}) { $Total->{attr}->{$tag_name}->{$attr_name} += $attr->{$tag_name}->{$attr_name}; $Page->{attr}->{$tag_name}->{$attr_name}++; } for my $attr_value (keys %{$value->{$tag_name}->{$attr_name} or {}}) { $Total->{value}->{$tag_name}->{$attr_name}->{$attr_value} += $value->{$tag_name}->{$attr_name}->{$attr_value}; $Page->{value}->{$tag_name}->{$attr_name}->{$attr_value}++; } for my $attr_value (keys %{$value2->{$tag_name}->{$attr_name} or {}}) { $Total->{value2}->{$tag_name}->{$attr_name}->{$attr_value} += $value2->{$tag_name}->{$attr_name}->{$attr_value}; $Page->{value2}->{$tag_name}->{$attr_name}->{$attr_value}++; } } } for (keys %$end_tag_name) { if ($end_tag_name->{$_}) { $Total->{end_tag}->{$_} += $end_tag_name->{$_}; $Page->{end_tag}->{$_}++; } } } # tokenize sub print_result () { delete $SIG{INT}; use Data::Dumper; $Data::Dumper::Sortkeys = 1; print Dumper ([$Page, $Total]); exit; } # print_result =head1 AUTHOR Wakaba . =head1 LICENSE Copyright 2007 Wakaba This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; ## $Date: 2007/07/21 05:26:48 $