1 |
# |
2 |
# -*- Perl -*- |
3 |
# $Id: html.pl,v 1.3 2001/11/04 05:07:28 wakaba Exp $ |
4 |
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. |
5 |
# Copyright (C) 2000 Namazu Project All rights reserved. |
6 |
# This is free software with ABSOLUTELY NO WARRANTY. |
7 |
# |
8 |
# This program is free software; you can redistribute it and/or modify |
9 |
# it under the terms of the GNU General Public License as published by |
10 |
# the Free Software Foundation; either versions 2, or (at your option) |
11 |
# any later version. |
12 |
# |
13 |
# This program is distributed in the hope that it will be useful |
14 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
# GNU General Public License for more details. |
17 |
# |
18 |
# You should have received a copy of the GNU General Public License |
19 |
# along with this program; if not, write to the Free Software |
20 |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
21 |
# 02111-1307, USA |
22 |
# |
23 |
# This file must be encoded in EUC-JP encoding |
24 |
# |
25 |
|
26 |
package html; |
27 |
use strict; |
28 |
require 'gfilter.pl'; |
29 |
my %map_entity_jisx0213 = ( |
30 |
#nbsp => "a9a2", |
31 |
iexcl => "a9a3", curren => "a9a4", brvbar => "a9a5", |
32 |
copy => "a9a6", ordf => "a9a7", laquo => "a9a8", |
33 |
shy => "a9a9", reg => "a9aa", macr => "a9ab", |
34 |
sup2 => "a9ac", sup3 => "a9ad", middot => "a9ae", |
35 |
cedil => "a9af", sup1 => "a9b0", ordm => "a9b1", |
36 |
raquo => "a9b2", frac14 => "a9b3", frac12 => "a9b4", |
37 |
frac34 => "a9b5", iquest => "a9b6", Agrave => "a9b7", |
38 |
Aacute => "a9b8", Acirc => "a9b9", Atilde => "a9ba", |
39 |
Auml => "a9bb", Aring => "a9bc", AElig => "a9bd", |
40 |
Ccedil => "a9be", Egrave => "a9bf", Eacute => "a9c0", |
41 |
Ecirc => "a9c1", Euml => "a9c2", Igrave => "a9c3", |
42 |
Iacute => "a9c4", Icirc => "a9c5", Iuml => "a9c6", |
43 |
ETH => "a9c7", Ntilde => "a9c8", Ograve => "a9c9", |
44 |
Oacute => "a9ca", Ocirc => "a9cb", Otilde => "a9cc", |
45 |
Ouml => "a9cd", Oslash => "a9ce", Ugrave => "a9cf", |
46 |
Uacute => "a9d0", Ucirc => "a9d1", Uuml => "a9d2", |
47 |
Yacute => "a9d3", THORN => "a9d4", szlig => "a9d5", |
48 |
agrave => "a9d6", aacute => "a9d7", acirc => "a9d8", |
49 |
atilde => "a9d9", auml => "a9da", aring => "a9db", |
50 |
aelig => "a9dc", ccedil => "a9dd", egrave => "a9de", |
51 |
eacute => "a9df", ecirc => "a9e0", euml => "a9e1", |
52 |
igrave => "a9e2", iacute => "a9e3", icirc => "a9e4", |
53 |
iuml => "a9e5", eth => "a9e6", ntilde => "a9e7", |
54 |
ograve => "a9e8", oacute => "a9e9", ocirc => "a9ea", |
55 |
otilde => "a9eb", ouml => "a9ec", oslash => "a9ed", |
56 |
ugrave => "a9ee", uacute => "a9ef", ucirc => "a9f0", |
57 |
uuml => "a9f1", yacute => "a9f2", thorn => "a9f3", |
58 |
yuml => "a9f4", OElig => "abab", oelig => "abaa", |
59 |
Scaron => "aaa6", scaron => "aab2", ndash => "a3fc", |
60 |
euro => "a9a1", sigmaf => "a6d9", bull => "a3c0", |
61 |
alefsym => "a3dc", harr => "a2f1", empty => "a2c7", |
62 |
notin => "a2c6", cong => "a2ed", asymp => "a2ee", |
63 |
nsub => "a2c2", oplus => "a2d1", otimes => "a2d3", |
64 |
spades => "a6ba", clubs => "a6c0", hearts => "a6be", |
65 |
diams => "a6bc", |
66 |
); |
67 |
|
68 |
sub mediatype() { |
69 |
return ('text/html'); |
70 |
} |
71 |
|
72 |
sub status() { |
73 |
return 'yes'; |
74 |
} |
75 |
|
76 |
sub recursive() { |
77 |
return 0; |
78 |
} |
79 |
|
80 |
sub pre_codeconv() { |
81 |
return 1; |
82 |
} |
83 |
|
84 |
sub post_codeconv () { |
85 |
return 0; |
86 |
} |
87 |
|
88 |
sub add_magic ($) { |
89 |
my $magic = shift; |
90 |
$magic->addSpecials("text/html", "<!DOCTYPE HTML PUBLIC"); |
91 |
$magic->addSpecials("text/html", "<!DOCTYPE html PUBLIC"); |
92 |
$magic->addSpecials("text/html", "-//W3C//DTD XHTML"); |
93 |
$magic->addSpecials("text/html", "-//W3C//DTD HTML"); |
94 |
$magic->addSpecials("text/html", 'xmlns="http://www.w3.org/1999/xhtml"'); |
95 |
$magic->addSpecials("text/html", "<meta http-equiv"); |
96 |
$magic->addSpecials("text/html", "</HTML>"); |
97 |
$magic->addSpecials("text/html", "</Html>"); |
98 |
$magic->addSpecials("text/html", "</html>"); |
99 |
$magic->addFileExts('\\.html$' => 'text/html'); |
100 |
$magic->addFileExts('\\.html\\..+$' => 'text/html'); |
101 |
return; |
102 |
} |
103 |
|
104 |
sub filter ($$$$$) { |
105 |
my ($orig_cfile, $cont, $weighted_str, $headings, $fields) |
106 |
= @_; |
107 |
my $cfile = defined $orig_cfile ? $$orig_cfile : ''; |
108 |
|
109 |
util::vprint("Processing html file ...\n"); |
110 |
|
111 |
if ($var::Opt{'robotexclude'}) { |
112 |
my $err = isexcluded($cont); |
113 |
return $err if $err; |
114 |
} |
115 |
|
116 |
html_filter($cont, $weighted_str, $fields, $headings); |
117 |
|
118 |
gfilter::line_adjust_filter($cont); |
119 |
gfilter::line_adjust_filter($weighted_str); |
120 |
gfilter::white_space_adjust_filter($cont); |
121 |
gfilter::show_filter_debug_info($cont, $weighted_str, |
122 |
$fields, $headings); |
123 |
return undef; |
124 |
} |
125 |
|
126 |
# Check wheter or not the given URI is excluded. |
127 |
sub isexcluded ($) { |
128 |
my ($contref) = @_; |
129 |
my $err = undef; |
130 |
|
131 |
if ($$contref =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]* |
132 |
CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix) #" |
133 |
{ |
134 |
$err = _("is excluded because of <meta name=\"robots\" ...>"); |
135 |
} |
136 |
return $err; |
137 |
} |
138 |
|
139 |
|
140 |
sub html_filter ($$$$) { |
141 |
my ($contref, $weighted_str, $fields, $headings) = @_; |
142 |
|
143 |
html::escape_lt_gt($contref); |
144 |
$fields->{'title'} = html::get_title($contref, $weighted_str); |
145 |
html::get_author($contref, $fields); |
146 |
html::get_meta_tags($contref, $weighted_str, $fields); |
147 |
html::get_img_alt($contref); |
148 |
html::get_table_summary($contref); |
149 |
html::get_title_attr($contref); |
150 |
html::normalize_html_element($contref); |
151 |
html::erase_above_body($contref); |
152 |
html::weight_element($contref, $weighted_str, $headings); |
153 |
html::remove_html_elements($contref); |
154 |
# restore entities of each content. |
155 |
html::decode_entity($contref); |
156 |
html::decode_entity($weighted_str); |
157 |
html::decode_entity($headings); |
158 |
for my $key (keys %{$fields}) { |
159 |
html::decode_entity(\$fields->{$key}); |
160 |
} |
161 |
} |
162 |
|
163 |
# Convert independent < > s into entity references for escaping. |
164 |
# Substitute twice for safe. |
165 |
sub escape_lt_gt ($) { |
166 |
my ($contref) = @_; |
167 |
|
168 |
$$contref =~ s/\s<\s/ < /g; |
169 |
$$contref =~ s/\s>\s/ > /g; |
170 |
$$contref =~ s/\s<\s/ < /g; |
171 |
$$contref =~ s/\s>\s/ > /g; |
172 |
} |
173 |
|
174 |
sub get_author ($$) { |
175 |
my ($contref, $fields) = @_; |
176 |
|
177 |
# <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp"> |
178 |
|
179 |
if ($$contref =~ m!META\s+NAME\s*=\s*([\'\"]?)AUTHOR\1\s+[^>]* |
180 |
CONTENT\s*=\s*([\'\"]?)(.*?)\2[^>]*>!ix) { #" |
181 |
$fields->{'author'} = $3; |
182 |
} elsif ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\'])mailto:(.*?)\1\s*>!i) { #" |
183 |
$fields->{'author'} = $2; |
184 |
} elsif ($$contref =~ m!.*<ADDRESS[^>]*>([^<]*?)</ADDRESS>!i) { |
185 |
my $tmp = $1; |
186 |
# $tmp =~ s/\s//g; |
187 |
if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) { |
188 |
$fields->{'author'} = $1; |
189 |
} |
190 |
} |
191 |
} |
192 |
|
193 |
|
194 |
# Get title from <title>..</title> |
195 |
# It's okay to exits two or more <title>...</TITLE>. |
196 |
# First one will be retrieved. |
197 |
sub get_title ($$) { |
198 |
my ($contref, $weighted_str) = @_; |
199 |
my $title = ''; |
200 |
|
201 |
if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) { |
202 |
$title = $1; |
203 |
$title =~ s/\s+/ /g; |
204 |
$title =~ s/^\s+//; |
205 |
$title =~ s/\s+$//; |
206 |
my $weight = $conf::Weight{'html'}->{'title'}; |
207 |
$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n"; |
208 |
} else { |
209 |
$title = $conf::NO_TITLE; |
210 |
} |
211 |
|
212 |
return $title; |
213 |
} |
214 |
|
215 |
# get foo bar from <META NAME="keywords|description" CONTENT="foo bar"> |
216 |
sub get_meta_tags ($$$) { |
217 |
my ($contref, $weighted_str, $fields) = @_; |
218 |
|
219 |
# <meta name="keywords" content="foo bar baz"> |
220 |
|
221 |
my $weight = $conf::Weight{'metakey'}; |
222 |
$$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" |
223 |
if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?) #" |
224 |
keywords\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #" |
225 |
|
226 |
# <meta name="description" content="foo bar baz"> |
227 |
$$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" |
228 |
if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?)description #" |
229 |
\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #" |
230 |
|
231 |
if ($var::Opt{'meta'}) { |
232 |
my @keys = split '\|', $conf::META_TAGS; |
233 |
for my $key (@keys) { |
234 |
while ($$contref =~ /<meta\s+name\s*=\s*([\'\"]?)$key #" |
235 |
\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/gix) |
236 |
{ |
237 |
$fields->{$key} .= $3 . " "; |
238 |
} |
239 |
util::dprint("meta: $key: $fields->{$key}\n") |
240 |
if defined $fields->{$key}; |
241 |
} |
242 |
} |
243 |
} |
244 |
|
245 |
# Get foo from <IMG ... ALT="foo"> |
246 |
# It's not to handle HTML strictly. |
247 |
sub get_img_alt ($) { |
248 |
my ($contref) = @_; |
249 |
|
250 |
$$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #" |
251 |
} |
252 |
|
253 |
# Get foo from <TABLE ... SUMMARY="foo"> |
254 |
sub get_table_summary ($) { |
255 |
my ($contref) = @_; |
256 |
|
257 |
$$contref =~ s/<TABLE[^>]*\s+SUMMARY\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #" |
258 |
} |
259 |
|
260 |
# Get foo from <XXX ... TITLE="foo"> |
261 |
sub get_title_attr ($) { |
262 |
my ($contref) = @_; |
263 |
|
264 |
$$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #" |
265 |
} |
266 |
|
267 |
# Normalize elements like: <A HREF...> -> <A> |
268 |
sub normalize_html_element ($) { |
269 |
my ($contref) = @_; |
270 |
|
271 |
$$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g; |
272 |
} |
273 |
|
274 |
# Remove contents above <body>. |
275 |
sub erase_above_body ($) { |
276 |
my ($contref) = @_; |
277 |
|
278 |
$$contref =~ s/^.*<body>//is; |
279 |
} |
280 |
|
281 |
|
282 |
# Weight a score of a keyword in a given text using %conf::Weight hash. |
283 |
# This process make the text be surround by temporary tags |
284 |
# \x7fXX\x7f and \x7f/XX\x7f. XX represents score. |
285 |
# Sort keys of %conf::Weight for processing <a> first. |
286 |
# Because <a> has a tendency to be inside of other tags. |
287 |
# Thus, it does'not processing for nexted tags strictly. |
288 |
# Moreover, it does special processing for <h[1-6]> for summarization. |
289 |
sub weight_element ($$$ ) { |
290 |
my ($contref, $weighted_str, $headings) = @_; |
291 |
|
292 |
for my $element (sort keys(%{$conf::Weight{'html'}})) { |
293 |
my $tmp = ""; |
294 |
$$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies; |
295 |
$$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} |
296 |
&& $tmp; |
297 |
my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? |
298 |
$conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1; |
299 |
$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp; |
300 |
} |
301 |
} |
302 |
|
303 |
sub weight_element_sub ($$$) { |
304 |
my ($element, $text, $tmp) = @_; |
305 |
|
306 |
my $space = element_space($element); |
307 |
$text =~ s/<[^>]*>//g; |
308 |
$$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG; |
309 |
$element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? " " : "$space$text$space"; |
310 |
} |
311 |
|
312 |
|
313 |
# determine whether a given element should be delete or be substituted with space |
314 |
sub element_space ($) { |
315 |
$_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " "; |
316 |
} |
317 |
|
318 |
# remove all HTML elements. it's not perfect but almost works. |
319 |
sub remove_html_elements ($) { |
320 |
my ($contref) = @_; |
321 |
|
322 |
# remove all comments |
323 |
$$contref =~ s/<!--.*?-->//gs; |
324 |
|
325 |
# remove all elements |
326 |
$$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe; |
327 |
|
328 |
} |
329 |
|
330 |
# Decode a numberd entity. Exclude an invalid number. |
331 |
sub decode_numbered_entity ($) { |
332 |
my ($num) = @_; |
333 |
return "" |
334 |
if $num >= 0 && $num <= 8 || $num >= 11 && $num <= 31 || $num >=127; |
335 |
sprintf ("%c",$num); |
336 |
} |
337 |
|
338 |
sub decode_entity_jisx0213($) { |
339 |
my $name = shift; |
340 |
my $euc = $map_entity_jisx0213{$name}; |
341 |
if ($euc) { |
342 |
return pack "H4", $euc; |
343 |
} else { |
344 |
return '&'.$name.';'; |
345 |
} |
346 |
} |
347 |
|
348 |
# Decode an entity. Ignore characters of right half of ISO-8859-1. |
349 |
# Because it can't be handled in EUC encoding. |
350 |
# This function provides sequential entities like: " < > |
351 |
sub decode_entity ($) { |
352 |
my ($text) = @_; |
353 |
|
354 |
return unless defined($$text); |
355 |
|
356 |
$$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge; |
357 |
$$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei; |
358 |
$$text =~ s/"[;\s]/\"/g; #" |
359 |
$$text =~ s/&[;\s]/&/g; |
360 |
$$text =~ s/<[;\s]/</g; |
361 |
$$text =~ s/>[;\s]/>/g; |
362 |
$$text =~ s/ [;\s]/ /g; |
363 |
$$text =~ s/&([A-Za-z0-9]+)[;\s]/&decode_entity_jisx0213($1)/ge; |
364 |
} |
365 |
|
366 |
|
367 |
# encode entities: only '<', '>', and '&' |
368 |
sub encode_entity ($) { |
369 |
my ($tmp) = @_; |
370 |
|
371 |
$$tmp =~ s/&/&/g; # & should be processed first |
372 |
$$tmp =~ s/</</g; |
373 |
$$tmp =~ s/>/>/g; |
374 |
$$tmp; |
375 |
} |
376 |
|
377 |
1; |