1 |
# |
2 |
# -*- Perl -*- |
3 |
# $Id: hdml.pl,v 1.6 2001/02/07 10:23:10 knok Exp $ |
4 |
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved. |
5 |
# Copyright (C) 2000 Namazu Project All rights reserved. |
6 |
# This is free software with ABSOLUTELY NO WARRANTY. |
7 |
# |
8 |
# This program is free software; you can redistribute it and/or modify |
9 |
# it under the terms of the GNU General Public License as published by |
10 |
# the Free Software Foundation; either versions 2, or (at your option) |
11 |
# any later version. |
12 |
# |
13 |
# This program is distributed in the hope that it will be useful |
14 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
# GNU General Public License for more details. |
17 |
# |
18 |
# You should have received a copy of the GNU General Public License |
19 |
# along with this program; if not, write to the Free Software |
20 |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
21 |
# 02111-1307, USA |
22 |
# |
23 |
# This file must be encoded in EUC-JP encoding |
24 |
# |
25 |
|
26 |
package hdml; |
27 |
use strict; |
28 |
require 'gfilter.pl'; |
29 |
|
30 |
sub mediatype() { |
31 |
return ('text/x-hdml'); |
32 |
} |
33 |
|
34 |
sub status() { |
35 |
return 'yes'; |
36 |
} |
37 |
|
38 |
sub recursive() { |
39 |
return 0; |
40 |
} |
41 |
|
42 |
sub pre_codeconv() { |
43 |
return 1; |
44 |
} |
45 |
|
46 |
sub post_codeconv () { |
47 |
return 0; |
48 |
} |
49 |
|
50 |
sub add_magic ($) { |
51 |
my $magic = shift @_; |
52 |
$magic->addSpecials('text/x-hdml', '<[Hh][Dd][Mm][Ll][^>]*>'); |
53 |
return; |
54 |
} |
55 |
|
56 |
sub filter ($$$$$) { |
57 |
my ($orig_cfile, $cont, $weighted_str, $headings, $fields) |
58 |
= @_; |
59 |
my $cfile = defined $orig_cfile ? $$orig_cfile : ''; |
60 |
|
61 |
util::vprint("Processing hdml file ...\n"); |
62 |
|
63 |
hdml($cont, $weighted_str, $fields, $headings); |
64 |
|
65 |
gfilter::line_adjust_filter($cont); |
66 |
gfilter::line_adjust_filter($weighted_str); |
67 |
gfilter::white_space_adjust_filter($cont); |
68 |
gfilter::show_filter_debug_info($cont, $weighted_str, |
69 |
$fields, $headings); |
70 |
return undef; |
71 |
} |
72 |
|
73 |
sub hdml ($$$$) { |
74 |
my ($contref, $weighted_str, $fields, $headings) = @_; |
75 |
|
76 |
hdml::escape_lt_gt($contref); |
77 |
$fields->{'title'} = hdml::get_title($contref, $weighted_str); |
78 |
hdml::get_img_alt($contref); |
79 |
hdml::get_title_attr($contref); |
80 |
hdml::normalize_hdml_element($contref); |
81 |
hdml::weight_element($contref, $weighted_str, $headings); |
82 |
hdml::remove_hdml_elements($contref); |
83 |
# restore entities of each content. |
84 |
hdml::decode_entity($contref); |
85 |
hdml::decode_entity($weighted_str); |
86 |
hdml::decode_entity($headings); |
87 |
for my $key (keys %{$fields}) { |
88 |
hdml::decode_entity(\$fields->{$key}); |
89 |
} |
90 |
} |
91 |
|
92 |
# Convert independent < > s into entity references for escaping. |
93 |
# Substitute twice for safe. |
94 |
sub escape_lt_gt ($) { |
95 |
my ($contref) = @_; |
96 |
|
97 |
$$contref =~ s/\s<\s/ < /g; |
98 |
$$contref =~ s/\s>\s/ > /g; |
99 |
$$contref =~ s/\s<\s/ < /g; |
100 |
$$contref =~ s/\s>\s/ > /g; |
101 |
} |
102 |
|
103 |
sub get_title ($$) { |
104 |
my ($contref, $weighted_str) = @_; |
105 |
my $title = ''; |
106 |
|
107 |
if ($$contref =~ s!<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\'>]*)[\"\']?[^>]*>! $1 !i) { |
108 |
$title = $1; |
109 |
$title =~ s/\s+/ /g; |
110 |
$title =~ s/^\s+//; |
111 |
$title =~ s/\s+$//; |
112 |
my $weight = $conf::Weight{'html'}->{'title'}; |
113 |
$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n"; |
114 |
} else { |
115 |
$title = $conf::NO_TITLE; |
116 |
} |
117 |
|
118 |
return $title; |
119 |
} |
120 |
|
121 |
# Get foo from <IMG ... ALT="foo"> |
122 |
# It's not to handle HTML strictly. |
123 |
sub get_img_alt ($) { |
124 |
my ($contref) = @_; |
125 |
|
126 |
$$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #" |
127 |
} |
128 |
|
129 |
# Get foo from <XXX ... TITLE="foo"> |
130 |
sub get_title_attr ($) { |
131 |
my ($contref) = @_; |
132 |
|
133 |
$$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #" |
134 |
} |
135 |
|
136 |
# Normalize elements like: <A HREF...> -> <A> |
137 |
sub normalize_hdml_element ($) { |
138 |
my ($contref) = @_; |
139 |
|
140 |
$$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g; |
141 |
} |
142 |
|
143 |
|
144 |
# Weight a score of a keyword in a given text using %conf::Weight hash. |
145 |
# This process make the text be surround by temporary tags |
146 |
# \x7fXX\x7f and \x7f/XX\x7f. XX represents score. |
147 |
# Sort keys of %conf::Weight for processing <a> first. |
148 |
# Because <a> has a tendency to be inside of other tags. |
149 |
# Thus, it does'not processing for nexted tags strictly. |
150 |
# Moreover, it does special processing for <h[1-6]> for summarization. |
151 |
sub weight_element ($$$ ) { |
152 |
my ($contref, $weighted_str, $headings) = @_; |
153 |
|
154 |
for my $element (sort keys(%{$conf::Weight{'html'}})) { |
155 |
my $tmp = ""; |
156 |
$$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies; |
157 |
$$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} |
158 |
&& $tmp; |
159 |
my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ? |
160 |
$conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1; |
161 |
$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp; |
162 |
} |
163 |
} |
164 |
|
165 |
sub weight_element_sub ($$$) { |
166 |
my ($element, $text, $tmp) = @_; |
167 |
|
168 |
my $space = element_space($element); |
169 |
$text =~ s/<[^>]*>//g; |
170 |
$$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG; |
171 |
$element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ? " " : "$space$text$space"; |
172 |
} |
173 |
|
174 |
|
175 |
# determine whether a given element should be delete or be substituted with space |
176 |
sub element_space ($) { |
177 |
$_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " "; |
178 |
} |
179 |
|
180 |
# remove all HTML elements. it's not perfect but almost works. |
181 |
sub remove_hdml_elements ($) { |
182 |
my ($contref) = @_; |
183 |
|
184 |
# remove all comments |
185 |
$$contref =~ s/<!?--.*?-->//gs; |
186 |
|
187 |
# remove all elements |
188 |
$$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe; |
189 |
|
190 |
} |
191 |
|
192 |
# Decode a numberd entity. Exclude an invalid number. |
193 |
sub decode_numbered_entity ($) { |
194 |
my ($num) = @_; |
195 |
return "" |
196 |
if $num >= 0 && $num <= 8 || $num >= 11 && $num <= 31 || $num >=127; |
197 |
sprintf ("%c",$num); |
198 |
} |
199 |
|
200 |
|
201 |
# Decode an entity. Ignore characters of right half of ISO-8859-1. |
202 |
# Because it can't be handled in EUC encoding. |
203 |
# This function provides sequential entities like: " < > |
204 |
sub decode_entity ($) { |
205 |
my ($text) = @_; |
206 |
|
207 |
return unless defined($$text); |
208 |
|
209 |
$$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge; |
210 |
$$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei; |
211 |
$$text =~ s/"[;\s]/\"/g; #" |
212 |
$$text =~ s/&[;\s]/&/g; |
213 |
$$text =~ s/<[;\s]/</g; |
214 |
$$text =~ s/>[;\s]/>/g; |
215 |
$$text =~ s/ [;\s]/ /g; |
216 |
} |
217 |
|
218 |
|
219 |
# encode entities: only '<', '>', and '&' |
220 |
sub encode_entity ($) { |
221 |
my ($tmp) = @_; |
222 |
|
223 |
$$tmp =~ s/&/&/g; # & should be processed first |
224 |
$$tmp =~ s/</</g; |
225 |
$$tmp =~ s/>/>/g; |
226 |
$$tmp; |
227 |
} |
228 |
|
229 |
1; |