/[suikacvs]/www/namazu/filter/hdml.pl
Suika

Contents of /www/namazu/filter/hdml.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Nov 30 07:56:45 2001 UTC (22 years, 5 months ago) by wakaba
Branch: MAIN, wakaba
CVS Tags: initial, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain

1 #
2 # -*- Perl -*-
3 # $Id: hdml.pl,v 1.6 2001/02/07 10:23:10 knok Exp $
4 # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5 # Copyright (C) 2000 Namazu Project All rights reserved.
6 # This is free software with ABSOLUTELY NO WARRANTY.
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either versions 2, or (at your option)
11 # any later version.
12 #
13 # This program is distributed in the hope that it will be useful
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 # 02111-1307, USA
22 #
23 # This file must be encoded in EUC-JP encoding
24 #
25
26 package hdml;
27 use strict;
28 require 'gfilter.pl';
29
30 sub mediatype() {
31 return ('text/x-hdml');
32 }
33
34 sub status() {
35 return 'yes';
36 }
37
38 sub recursive() {
39 return 0;
40 }
41
42 sub pre_codeconv() {
43 return 1;
44 }
45
46 sub post_codeconv () {
47 return 0;
48 }
49
50 sub add_magic ($) {
51 my $magic = shift @_;
52 $magic->addSpecials('text/x-hdml', '<[Hh][Dd][Mm][Ll][^>]*>');
53 return;
54 }
55
56 sub filter ($$$$$) {
57 my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
58 = @_;
59 my $cfile = defined $orig_cfile ? $$orig_cfile : '';
60
61 util::vprint("Processing hdml file ...\n");
62
63 hdml($cont, $weighted_str, $fields, $headings);
64
65 gfilter::line_adjust_filter($cont);
66 gfilter::line_adjust_filter($weighted_str);
67 gfilter::white_space_adjust_filter($cont);
68 gfilter::show_filter_debug_info($cont, $weighted_str,
69 $fields, $headings);
70 return undef;
71 }
72
73 sub hdml ($$$$) {
74 my ($contref, $weighted_str, $fields, $headings) = @_;
75
76 hdml::escape_lt_gt($contref);
77 $fields->{'title'} = hdml::get_title($contref, $weighted_str);
78 hdml::get_img_alt($contref);
79 hdml::get_title_attr($contref);
80 hdml::normalize_hdml_element($contref);
81 hdml::weight_element($contref, $weighted_str, $headings);
82 hdml::remove_hdml_elements($contref);
83 # restore entities of each content.
84 hdml::decode_entity($contref);
85 hdml::decode_entity($weighted_str);
86 hdml::decode_entity($headings);
87 for my $key (keys %{$fields}) {
88 hdml::decode_entity(\$fields->{$key});
89 }
90 }
91
92 # Convert independent < > s into entity references for escaping.
93 # Substitute twice for safe.
94 sub escape_lt_gt ($) {
95 my ($contref) = @_;
96
97 $$contref =~ s/\s<\s/ &lt; /g;
98 $$contref =~ s/\s>\s/ &gt; /g;
99 $$contref =~ s/\s<\s/ &lt; /g;
100 $$contref =~ s/\s>\s/ &gt; /g;
101 }
102
103 sub get_title ($$) {
104 my ($contref, $weighted_str) = @_;
105 my $title = '';
106
107 if ($$contref =~ s!<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\'>]*)[\"\']?[^>]*>! $1 !i) {
108 $title = $1;
109 $title =~ s/\s+/ /g;
110 $title =~ s/^\s+//;
111 $title =~ s/\s+$//;
112 my $weight = $conf::Weight{'html'}->{'title'};
113 $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
114 } else {
115 $title = $conf::NO_TITLE;
116 }
117
118 return $title;
119 }
120
121 # Get foo from <IMG ... ALT="foo">
122 # It's not to handle HTML strictly.
123 sub get_img_alt ($) {
124 my ($contref) = @_;
125
126 $$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
127 }
128
129 # Get foo from <XXX ... TITLE="foo">
130 sub get_title_attr ($) {
131 my ($contref) = @_;
132
133 $$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
134 }
135
136 # Normalize elements like: <A HREF...> -> <A>
137 sub normalize_hdml_element ($) {
138 my ($contref) = @_;
139
140 $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
141 }
142
143
144 # Weight a score of a keyword in a given text using %conf::Weight hash.
145 # This process make the text be surround by temporary tags
146 # \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
147 # Sort keys of %conf::Weight for processing <a> first.
148 # Because <a> has a tendency to be inside of other tags.
149 # Thus, it does'not processing for nexted tags strictly.
150 # Moreover, it does special processing for <h[1-6]> for summarization.
151 sub weight_element ($$$ ) {
152 my ($contref, $weighted_str, $headings) = @_;
153
154 for my $element (sort keys(%{$conf::Weight{'html'}})) {
155 my $tmp = "";
156 $$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
157 $$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'}
158 && $tmp;
159 my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ?
160 $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
161 $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
162 }
163 }
164
165 sub weight_element_sub ($$$) {
166 my ($element, $text, $tmp) = @_;
167
168 my $space = element_space($element);
169 $text =~ s/<[^>]*>//g;
170 $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
171 $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ? " " : "$space$text$space";
172 }
173
174
175 # determine whether a given element should be delete or be substituted with space
176 sub element_space ($) {
177 $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
178 }
179
180 # remove all HTML elements. it's not perfect but almost works.
181 sub remove_hdml_elements ($) {
182 my ($contref) = @_;
183
184 # remove all comments
185 $$contref =~ s/<!?--.*?-->//gs;
186
187 # remove all elements
188 $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe;
189
190 }
191
192 # Decode a numberd entity. Exclude an invalid number.
193 sub decode_numbered_entity ($) {
194 my ($num) = @_;
195 return ""
196 if $num >= 0 && $num <= 8 || $num >= 11 && $num <= 31 || $num >=127;
197 sprintf ("%c",$num);
198 }
199
200
201 # Decode an entity. Ignore characters of right half of ISO-8859-1.
202 # Because it can't be handled in EUC encoding.
203 # This function provides sequential entities like: &quot &lt &gt;
204 sub decode_entity ($) {
205 my ($text) = @_;
206
207 return unless defined($$text);
208
209 $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
210 $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
211 $$text =~ s/&quot[;\s]/\"/g; #"
212 $$text =~ s/&amp[;\s]/&/g;
213 $$text =~ s/&lt[;\s]/</g;
214 $$text =~ s/&gt[;\s]/>/g;
215 $$text =~ s/&nbsp[;\s]/ /g;
216 }
217
218
219 # encode entities: only '<', '>', and '&'
220 sub encode_entity ($) {
221 my ($tmp) = @_;
222
223 $$tmp =~ s/&/&amp;/g; # &amp; should be processed first
224 $$tmp =~ s/</&lt;/g;
225 $$tmp =~ s/>/&gt;/g;
226 $$tmp;
227 }
228
229 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24