1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
|
4 |
use lib qw[/home/httpd/html/www/markup/html/whatpm/]; |
5 |
|
6 |
use Encode::Guess qw/euc-jp shiftjis 7bit-jis utf8/; |
7 |
use Whatpm::HTML; |
8 |
|
9 |
$| = 1; |
10 |
|
11 |
my $current_category = 0; |
12 |
our $target = shift; |
13 |
our $code = sub { |
14 |
my ($entity, $file_name, $category) = @_; |
15 |
if ($current_category != $category) { |
16 |
print "\n\n"; |
17 |
$current_category = $category; |
18 |
} |
19 |
eval { |
20 |
my $data = Encode::decode ('Guess', $entity->{body}); |
21 |
tokenize ($data); |
22 |
}; |
23 |
}; |
24 |
|
25 |
require 'foreach.pl'; |
26 |
|
27 |
sub tokenize ($) { |
28 |
my %data; |
29 |
|
30 |
my $s = \($_[0]); |
31 |
my $p = Whatpm::HTML->new; |
32 |
my $i = 0; |
33 |
$p->{set_next_input_character} = sub { |
34 |
my $self = shift; |
35 |
$self->{next_input_character} = -1 and return if $i >= length $$s; |
36 |
$self->{next_input_character} = ord substr $$s, $i++, 1; |
37 |
|
38 |
if ($self->{next_input_character} == 0x000D) { # CR |
39 |
if ($i >= length $$s) { |
40 |
# |
41 |
} else { |
42 |
my $next_char = ord substr $$s, $i++, 1; |
43 |
if ($next_char == 0x000A) { # LF |
44 |
# |
45 |
} else { |
46 |
push @{$self->{char}}, $next_char; |
47 |
} |
48 |
} |
49 |
$self->{next_input_character} = 0x000A; # LF # MUST |
50 |
} elsif ($self->{next_input_character} > 0x10FFFF) { |
51 |
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
52 |
} elsif ($self->{next_input_character} == 0x0000) { # NULL |
53 |
$self->{next_input_character} = 0xFFFD; # REPLACEMENT CHARACTER # MUST |
54 |
} |
55 |
}; |
56 |
|
57 |
$p->{parse_error} = sub { }; |
58 |
|
59 |
$p->_initialize_tokenizer; |
60 |
|
61 |
my $start_tag_name = {}; |
62 |
my $end_tag_name = {}; |
63 |
my $attr; |
64 |
my $value; |
65 |
my $value2; # case insensitive |
66 |
|
67 |
while (1) { |
68 |
my $token = $p->_get_next_token; |
69 |
last if $token->{type} eq 'end-of-file'; |
70 |
|
71 |
if ($token->{type} eq 'start tag') { |
72 |
if ({ |
73 |
title => 1, |
74 |
textarea => 1, |
75 |
}->{$token->{tag_name}}) { |
76 |
$p->{content_model_flag} = 'RCDATA'; |
77 |
} elsif ({ |
78 |
style => 1, |
79 |
script => 1, |
80 |
xmp => 1, |
81 |
noframes => 1, |
82 |
noembed => 1, |
83 |
noscript => 1, |
84 |
iframe => 1, |
85 |
}->{$token->{tag_name}}) { |
86 |
$p->{content_model_flag} = 'CDATA'; |
87 |
} elsif ($token->{tag_name} eq 'plaintext') { |
88 |
$p->{content_model_flag} = 'PLAINTEXT'; |
89 |
} |
90 |
$p->{last_emitted_start_tag_name} = $token->{tag_name}; |
91 |
} |
92 |
|
93 |
if ($token->{type} eq 'start tag') { |
94 |
if ({ |
95 |
div => 1, span => 1, |
96 |
td => 1, th => 1, |
97 |
br => 1, |
98 |
b => 1, i => 1, s => 1, strike => 1, u => 1, tt => 1, |
99 |
big => 1, small => 1, |
100 |
font => 1, basefont => 1, |
101 |
marquee => 1, blink => 1, |
102 |
hr => 1, |
103 |
center => 1, |
104 |
nobr => 1, |
105 |
layer => 1, ilayer => 1, |
106 |
spacer => 1, |
107 |
multicol => 1, |
108 |
}->{$token->{tag_name}}) { |
109 |
$data{presentational}++; |
110 |
} elsif ({ |
111 |
img => 1, object => 1, applet => 1, embed => 1, iframe => 1, |
112 |
frame => 1, script => 1, canvas => 1, video => 1, audio => 1, |
113 |
bgsound => 1, |
114 |
input => 1, select => 1, textarea => 1, output => 1, |
115 |
datalist => 1, datagrid => 1, |
116 |
area => 1, |
117 |
}->{$token->{tag_name}}) { |
118 |
$data{object}++; |
119 |
} elsif ({ |
120 |
h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1, |
121 |
li => 1, |
122 |
dt => 1, dd => 1, |
123 |
meta => 1, link => 1, |
124 |
acronym => 1, abbr => 1, |
125 |
cite => 1, ins => 1, del => 1, dfn => 1, |
126 |
strong => 1, em => 1, |
127 |
address => 1, |
128 |
caption => 1, thead => 1, tfoot => 1, col => 1, colgroup => 1, |
129 |
}->{$token->{tag_name}}) { |
130 |
$data{semantic}++; |
131 |
} |
132 |
} elsif ($token->{type} eq 'character') { |
133 |
$token->{data} =~ s/\s+//g; |
134 |
$data{text} += length $token->{data}; |
135 |
} |
136 |
} |
137 |
|
138 |
my $x = 1000 / ($data{text} or 1); |
139 |
print $data{$_} * $x, "\t" for qw/presentational semantic object/; |
140 |
print "\n"; |
141 |
} # tokenize |
142 |
|
143 |
=head1 AUTHOR |
144 |
|
145 |
Wakaba <w@suika.fam.cx>. |
146 |
|
147 |
=head1 LICENSE |
148 |
|
149 |
Copyright 2007 Wakaba <w@suika.fam.cx> |
150 |
|
151 |
This library is free software; you can redistribute it |
152 |
and/or modify it under the same terms as Perl itself. |
153 |
|
154 |
=cut |
155 |
|
156 |
1; |
157 |
## $Date: 2007/06/05 14:18:40 $ |
158 |
|