/[suikacvs]/www/namazu/filter/html.pl
Suika

Contents of /www/namazu/filter/html.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Nov 30 07:56:45 2001 UTC (22 years, 5 months ago) by wakaba
Branch: MAIN, wakaba
CVS Tags: initial, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain

1 #
2 # -*- Perl -*-
3 # $Id: html.pl,v 1.3 2001/11/04 05:07:28 wakaba Exp $
4 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5 # Copyright (C) 2000 Namazu Project All rights reserved.
6 # This is free software with ABSOLUTELY NO WARRANTY.
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either versions 2, or (at your option)
11 # any later version.
12 #
13 # This program is distributed in the hope that it will be useful
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 # 02111-1307, USA
22 #
23 # This file must be encoded in EUC-JP encoding
24 #
25
26 package html;
27 use strict;
28 require 'gfilter.pl';
29 my %map_entity_jisx0213 = (
30 #nbsp => "a9a2",
31 iexcl => "a9a3", curren => "a9a4", brvbar => "a9a5",
32 copy => "a9a6", ordf => "a9a7", laquo => "a9a8",
33 shy => "a9a9", reg => "a9aa", macr => "a9ab",
34 sup2 => "a9ac", sup3 => "a9ad", middot => "a9ae",
35 cedil => "a9af", sup1 => "a9b0", ordm => "a9b1",
36 raquo => "a9b2", frac14 => "a9b3", frac12 => "a9b4",
37 frac34 => "a9b5", iquest => "a9b6", Agrave => "a9b7",
38 Aacute => "a9b8", Acirc => "a9b9", Atilde => "a9ba",
39 Auml => "a9bb", Aring => "a9bc", AElig => "a9bd",
40 Ccedil => "a9be", Egrave => "a9bf", Eacute => "a9c0",
41 Ecirc => "a9c1", Euml => "a9c2", Igrave => "a9c3",
42 Iacute => "a9c4", Icirc => "a9c5", Iuml => "a9c6",
43 ETH => "a9c7", Ntilde => "a9c8", Ograve => "a9c9",
44 Oacute => "a9ca", Ocirc => "a9cb", Otilde => "a9cc",
45 Ouml => "a9cd", Oslash => "a9ce", Ugrave => "a9cf",
46 Uacute => "a9d0", Ucirc => "a9d1", Uuml => "a9d2",
47 Yacute => "a9d3", THORN => "a9d4", szlig => "a9d5",
48 agrave => "a9d6", aacute => "a9d7", acirc => "a9d8",
49 atilde => "a9d9", auml => "a9da", aring => "a9db",
50 aelig => "a9dc", ccedil => "a9dd", egrave => "a9de",
51 eacute => "a9df", ecirc => "a9e0", euml => "a9e1",
52 igrave => "a9e2", iacute => "a9e3", icirc => "a9e4",
53 iuml => "a9e5", eth => "a9e6", ntilde => "a9e7",
54 ograve => "a9e8", oacute => "a9e9", ocirc => "a9ea",
55 otilde => "a9eb", ouml => "a9ec", oslash => "a9ed",
56 ugrave => "a9ee", uacute => "a9ef", ucirc => "a9f0",
57 uuml => "a9f1", yacute => "a9f2", thorn => "a9f3",
58 yuml => "a9f4", OElig => "abab", oelig => "abaa",
59 Scaron => "aaa6", scaron => "aab2", ndash => "a3fc",
60 euro => "a9a1", sigmaf => "a6d9", bull => "a3c0",
61 alefsym => "a3dc", harr => "a2f1", empty => "a2c7",
62 notin => "a2c6", cong => "a2ed", asymp => "a2ee",
63 nsub => "a2c2", oplus => "a2d1", otimes => "a2d3",
64 spades => "a6ba", clubs => "a6c0", hearts => "a6be",
65 diams => "a6bc",
66 );
67
68 sub mediatype() {
69 return ('text/html');
70 }
71
72 sub status() {
73 return 'yes';
74 }
75
76 sub recursive() {
77 return 0;
78 }
79
80 sub pre_codeconv() {
81 return 1;
82 }
83
84 sub post_codeconv () {
85 return 0;
86 }
87
88 sub add_magic ($) {
89 my $magic = shift;
90 $magic->addSpecials("text/html", "<!DOCTYPE HTML PUBLIC");
91 $magic->addSpecials("text/html", "<!DOCTYPE html PUBLIC");
92 $magic->addSpecials("text/html", "-//W3C//DTD XHTML");
93 $magic->addSpecials("text/html", "-//W3C//DTD HTML");
94 $magic->addSpecials("text/html", 'xmlns="http://www.w3.org/1999/xhtml"');
95 $magic->addSpecials("text/html", "<meta http-equiv");
96 $magic->addSpecials("text/html", "</HTML>");
97 $magic->addSpecials("text/html", "</Html>");
98 $magic->addSpecials("text/html", "</html>");
99 $magic->addFileExts('\\.html$' => 'text/html');
100 $magic->addFileExts('\\.html\\..+$' => 'text/html');
101 return;
102 }
103
104 sub filter ($$$$$) {
105 my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
106 = @_;
107 my $cfile = defined $orig_cfile ? $$orig_cfile : '';
108
109 util::vprint("Processing html file ...\n");
110
111 if ($var::Opt{'robotexclude'}) {
112 my $err = isexcluded($cont);
113 return $err if $err;
114 }
115
116 html_filter($cont, $weighted_str, $fields, $headings);
117
118 gfilter::line_adjust_filter($cont);
119 gfilter::line_adjust_filter($weighted_str);
120 gfilter::white_space_adjust_filter($cont);
121 gfilter::show_filter_debug_info($cont, $weighted_str,
122 $fields, $headings);
123 return undef;
124 }
125
126 # Check wheter or not the given URI is excluded.
127 sub isexcluded ($) {
128 my ($contref) = @_;
129 my $err = undef;
130
131 if ($$contref =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]*
132 CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix) #"
133 {
134 $err = _("is excluded because of <meta name=\"robots\" ...>");
135 }
136 return $err;
137 }
138
139
140 sub html_filter ($$$$) {
141 my ($contref, $weighted_str, $fields, $headings) = @_;
142
143 html::escape_lt_gt($contref);
144 $fields->{'title'} = html::get_title($contref, $weighted_str);
145 html::get_author($contref, $fields);
146 html::get_meta_tags($contref, $weighted_str, $fields);
147 html::get_img_alt($contref);
148 html::get_table_summary($contref);
149 html::get_title_attr($contref);
150 html::normalize_html_element($contref);
151 html::erase_above_body($contref);
152 html::weight_element($contref, $weighted_str, $headings);
153 html::remove_html_elements($contref);
154 # restore entities of each content.
155 html::decode_entity($contref);
156 html::decode_entity($weighted_str);
157 html::decode_entity($headings);
158 for my $key (keys %{$fields}) {
159 html::decode_entity(\$fields->{$key});
160 }
161 }
162
163 # Convert independent < > s into entity references for escaping.
164 # Substitute twice for safe.
165 sub escape_lt_gt ($) {
166 my ($contref) = @_;
167
168 $$contref =~ s/\s<\s/ &lt; /g;
169 $$contref =~ s/\s>\s/ &gt; /g;
170 $$contref =~ s/\s<\s/ &lt; /g;
171 $$contref =~ s/\s>\s/ &gt; /g;
172 }
173
174 sub get_author ($$) {
175 my ($contref, $fields) = @_;
176
177 # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">
178
179 if ($$contref =~ m!META\s+NAME\s*=\s*([\'\"]?)AUTHOR\1\s+[^>]*
180 CONTENT\s*=\s*([\'\"]?)(.*?)\2[^>]*>!ix) { #"
181 $fields->{'author'} = $3;
182 } elsif ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\'])mailto:(.*?)\1\s*>!i) { #"
183 $fields->{'author'} = $2;
184 } elsif ($$contref =~ m!.*<ADDRESS[^>]*>([^<]*?)</ADDRESS>!i) {
185 my $tmp = $1;
186 # $tmp =~ s/\s//g;
187 if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {
188 $fields->{'author'} = $1;
189 }
190 }
191 }
192
193
194 # Get title from <title>..</title>
195 # It's okay to exits two or more <title>...</TITLE>.
196 # First one will be retrieved.
197 sub get_title ($$) {
198 my ($contref, $weighted_str) = @_;
199 my $title = '';
200
201 if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) {
202 $title = $1;
203 $title =~ s/\s+/ /g;
204 $title =~ s/^\s+//;
205 $title =~ s/\s+$//;
206 my $weight = $conf::Weight{'html'}->{'title'};
207 $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
208 } else {
209 $title = $conf::NO_TITLE;
210 }
211
212 return $title;
213 }
214
215 # get foo bar from <META NAME="keywords|description" CONTENT="foo bar">
216 sub get_meta_tags ($$$) {
217 my ($contref, $weighted_str, $fields) = @_;
218
219 # <meta name="keywords" content="foo bar baz">
220
221 my $weight = $conf::Weight{'metakey'};
222 $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
223 if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?) #"
224 keywords\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"
225
226 # <meta name="description" content="foo bar baz">
227 $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
228 if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?)description #"
229 \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"
230
231 if ($var::Opt{'meta'}) {
232 my @keys = split '\|', $conf::META_TAGS;
233 for my $key (@keys) {
234 while ($$contref =~ /<meta\s+name\s*=\s*([\'\"]?)$key #"
235 \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/gix)
236 {
237 $fields->{$key} .= $3 . " ";
238 }
239 util::dprint("meta: $key: $fields->{$key}\n")
240 if defined $fields->{$key};
241 }
242 }
243 }
244
245 # Get foo from <IMG ... ALT="foo">
246 # It's not to handle HTML strictly.
247 sub get_img_alt ($) {
248 my ($contref) = @_;
249
250 $$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
251 }
252
253 # Get foo from <TABLE ... SUMMARY="foo">
254 sub get_table_summary ($) {
255 my ($contref) = @_;
256
257 $$contref =~ s/<TABLE[^>]*\s+SUMMARY\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
258 }
259
260 # Get foo from <XXX ... TITLE="foo">
261 sub get_title_attr ($) {
262 my ($contref) = @_;
263
264 $$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
265 }
266
267 # Normalize elements like: <A HREF...> -> <A>
268 sub normalize_html_element ($) {
269 my ($contref) = @_;
270
271 $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
272 }
273
274 # Remove contents above <body>.
275 sub erase_above_body ($) {
276 my ($contref) = @_;
277
278 $$contref =~ s/^.*<body>//is;
279 }
280
281
282 # Weight a score of a keyword in a given text using %conf::Weight hash.
283 # This process make the text be surround by temporary tags
284 # \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
285 # Sort keys of %conf::Weight for processing <a> first.
286 # Because <a> has a tendency to be inside of other tags.
287 # Thus, it does'not processing for nexted tags strictly.
288 # Moreover, it does special processing for <h[1-6]> for summarization.
289 sub weight_element ($$$ ) {
290 my ($contref, $weighted_str, $headings) = @_;
291
292 for my $element (sort keys(%{$conf::Weight{'html'}})) {
293 my $tmp = "";
294 $$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
295 $$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'}
296 && $tmp;
297 my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ?
298 $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
299 $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
300 }
301 }
302
303 sub weight_element_sub ($$$) {
304 my ($element, $text, $tmp) = @_;
305
306 my $space = element_space($element);
307 $text =~ s/<[^>]*>//g;
308 $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
309 $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? " " : "$space$text$space";
310 }
311
312
313 # determine whether a given element should be delete or be substituted with space
314 sub element_space ($) {
315 $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
316 }
317
318 # remove all HTML elements. it's not perfect but almost works.
319 sub remove_html_elements ($) {
320 my ($contref) = @_;
321
322 # remove all comments
323 $$contref =~ s/<!--.*?-->//gs;
324
325 # remove all elements
326 $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe;
327
328 }
329
330 # Decode a numberd entity. Exclude an invalid number.
331 sub decode_numbered_entity ($) {
332 my ($num) = @_;
333 return ""
334 if $num >= 0 && $num <= 8 || $num >= 11 && $num <= 31 || $num >=127;
335 sprintf ("%c",$num);
336 }
337
338 sub decode_entity_jisx0213($) {
339 my $name = shift;
340 my $euc = $map_entity_jisx0213{$name};
341 if ($euc) {
342 return pack "H4", $euc;
343 } else {
344 return '&'.$name.';';
345 }
346 }
347
348 # Decode an entity. Ignore characters of right half of ISO-8859-1.
349 # Because it can't be handled in EUC encoding.
350 # This function provides sequential entities like: &quot &lt &gt;
351 sub decode_entity ($) {
352 my ($text) = @_;
353
354 return unless defined($$text);
355
356 $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
357 $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
358 $$text =~ s/&quot[;\s]/\"/g; #"
359 $$text =~ s/&amp[;\s]/&/g;
360 $$text =~ s/&lt[;\s]/</g;
361 $$text =~ s/&gt[;\s]/>/g;
362 $$text =~ s/&nbsp[;\s]/ /g;
363 $$text =~ s/&([A-Za-z0-9]+)[;\s]/&decode_entity_jisx0213($1)/ge;
364 }
365
366
367 # encode entities: only '<', '>', and '&'
368 sub encode_entity ($) {
369 my ($tmp) = @_;
370
371 $$tmp =~ s/&/&amp;/g; # &amp; should be processed first
372 $$tmp =~ s/</&lt;/g;
373 $$tmp =~ s/>/&gt;/g;
374 $$tmp;
375 }
376
377 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24