/[suikacvs]/webroot/www/ja1200/stat/htmlcomp.pl
Suika

Contents of /webroot/www/ja1200/stat/htmlcomp.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (show annotations) (download)
Sat Jun 9 07:56:19 2007 UTC (16 years, 11 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
File MIME type: text/plain
New scripts for keitai sites; Report for attribute values

1 #!/usr/bin/perl
2 use strict;
3
4 use lib qw[/home/httpd/html/www/markup/html/whatpm/];
5
6 use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/;
7 use Whatpm::HTML;
8
9 $| = 1;
10
11 my $current_category = 0;
12 our $target = shift;
13 our $code = sub {
14 my ($entity, $file_name, $category) = @_;
15 if ($current_category != $category) {
16 print "\n\n";
17 $current_category = $category;
18 }
19 eval {
20 my $data = Encode::decode ('Guess', $entity->{body});
21 tokenize ($data);
22 };
23 };
24
25 require 'foreach.pl';
26
27 sub tokenize ($) {
28 my %data;
29
30 my $s = \($_[0]);
31 my $p = Whatpm::HTML->new;
32 my $i = 0;
33 $p->{set_next_input_character} = sub {
34 my $self = shift;
35 $self->{next_input_character} = -1 and return if $i >= length $$s;
36 $self->{next_input_character} = ord substr $$s, $i++, 1;
37
38 if ($self->{next_input_character} == 0x000D) { # CR
39 if ($i >= length $$s) {
40 #
41 } else {
42 my $next_char = ord substr $$s, $i++, 1;
43 if ($next_char == 0x000A) { # LF
44 #
45 } else {
46 push @{$self->{char}}, $next_char;
47 }
48 }
49 $self->{next_input_character} = 0x000A; # LF # MUST
50 } elsif ($self->{next_input_character} > 0x10FFFF) {
51 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
52 } elsif ($self->{next_input_character} == 0x0000) { # NULL
53 $self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
54 }
55 };
56
57 $p->{parse_error} = sub { };
58
59 $p->_initialize_tokenizer;
60
61 my $start_tag_name = {};
62 my $end_tag_name = {};
63 my $attr;
64 my $value;
65 my $value2; # case insensitive
66
67 while (1) {
68 my $token = $p->_get_next_token;
69 last if $token->{type} eq 'end-of-file';
70
71 if ($token->{type} eq 'start tag') {
72 if ({
73 title => 1,
74 textarea => 1,
75 }->{$token->{tag_name}}) {
76 $p->{content_model_flag} = 'RCDATA';
77 } elsif ({
78 style => 1,
79 script => 1,
80 xmp => 1,
81 noframes => 1,
82 noembed => 1,
83 noscript => 1,
84 iframe => 1,
85 }->{$token->{tag_name}}) {
86 $p->{content_model_flag} = 'CDATA';
87 } elsif ($token->{tag_name} eq 'plaintext') {
88 $p->{content_model_flag} = 'PLAINTEXT';
89 }
90 $p->{last_emitted_start_tag_name} = $token->{tag_name};
91 }
92
93 if ($token->{type} eq 'start tag') {
94 if ({
95 div => 1, span => 1,
96 td => 1, th => 1,
97 br => 1,
98 b => 1, i => 1, s => 1, strike => 1, u => 1, tt => 1,
99 big => 1, small => 1,
100 font => 1, basefont => 1,
101 marquee => 1, blink => 1,
102 hr => 1,
103 center => 1,
104 nobr => 1,
105 layer => 1, ilayer => 1,
106 spacer => 1,
107 multicol => 1,
108 }->{$token->{tag_name}}) {
109 $data{presentational}++;
110 } elsif ({
111 img => 1, object => 1, applet => 1, embed => 1, iframe => 1,
112 frame => 1, script => 1, canvas => 1, video => 1, audio => 1,
113 bgsound => 1,
114 input => 1, select => 1, textarea => 1, output => 1,
115 datalist => 1, datagrid => 1,
116 area => 1,
117 }->{$token->{tag_name}}) {
118 $data{object}++;
119 } elsif ({
120 h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1,
121 li => 1,
122 dt => 1, dd => 1,
123 meta => 1, link => 1,
124 acronym => 1, abbr => 1,
125 cite => 1, ins => 1, del => 1, dfn => 1,
126 strong => 1, em => 1,
127 address => 1,
128 caption => 1, thead => 1, tfoot => 1, col => 1, colgroup => 1,
129 }->{$token->{tag_name}}) {
130 $data{semantic}++;
131 }
132 } elsif ($token->{type} eq 'character') {
133 $token->{data} =~ s/\s+//g;
134 $data{text} += length $token->{data};
135 }
136 }
137
138 my $x = 1000 / ($data{text} or 1);
139 print $data{$_} * $x, "\t" for qw/presentational semantic object/;
140 print "\n";
141 } # tokenize
142
143 =head1 AUTHOR
144
145 Wakaba <w@suika.fam.cx>.
146
147 =head1 LICENSE
148
149 Copyright 2007 Wakaba <w@suika.fam.cx>
150
151 This library is free software; you can redistribute it
152 and/or modify it under the same terms as Perl itself.
153
154 =cut
155
156 1;
157 ## $Date: 2007/06/05 14:18:40 $
158

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24