namazu/filter/hdml.pl

#
# -*- Perl -*-
# $Id: hdml.pl,v 1.6 2001/02/07 10:23:10 knok Exp $
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package hdml;
use strict;
require 'gfilter.pl';

sub mediatype() {
    return ('text/x-hdml');
}

sub status() {
    return 'yes';
}

sub recursive() {
    return 0;
}

sub pre_codeconv() {
    return 1;
}

sub post_codeconv () {
    return 0;
}

sub add_magic ($) {
    my $magic = shift @_;
    $magic->addSpecials('text/x-hdml', '<[Hh][Dd][Mm][Ll][^>]*>');
    return;
}

sub filter ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
      = @_;
    my $cfile = defined $orig_cfile ? $$orig_cfile : '';

    util::vprint("Processing hdml file ...\n");

    hdml($cont, $weighted_str, $fields, $headings);
    
    gfilter::line_adjust_filter($cont);
    gfilter::line_adjust_filter($weighted_str);
    gfilter::white_space_adjust_filter($cont);
    gfilter::show_filter_debug_info($cont, $weighted_str,
                           $fields, $headings);
    return undef;
}

sub hdml ($$$$) {
    my ($contref, $weighted_str, $fields, $headings) = @_;

    hdml::escape_lt_gt($contref);
    $fields->{'title'} = hdml::get_title($contref, $weighted_str);
    hdml::get_img_alt($contref);
    hdml::get_title_attr($contref);
    hdml::normalize_hdml_element($contref);
    hdml::weight_element($contref, $weighted_str, $headings);
    hdml::remove_hdml_elements($contref);
    # restore entities of each content.
    hdml::decode_entity($contref);
    hdml::decode_entity($weighted_str);
    hdml::decode_entity($headings);
    for my $key (keys %{$fields}) {
        hdml::decode_entity(\$fields->{$key});
    }
}

# Convert independent < > s into entity references for escaping.
# Substitute twice for safe.
sub escape_lt_gt ($) {
    my ($contref) = @_;

    $$contref =~ s/\s<\s/ &lt; /g;
    $$contref =~ s/\s>\s/ &gt; /g;
    $$contref =~ s/\s<\s/ &lt; /g;
    $$contref =~ s/\s>\s/ &gt; /g;
}

sub get_title ($$) {
    my ($contref, $weighted_str) = @_;
    my $title = '';
    
    if ($$contref =~ s!<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\'>]*)[\"\']?[^>]*>! $1 !i) {
        $title = $1;
        $title =~ s/\s+/ /g;
        $title =~ s/^\s+//;
        $title =~ s/\s+$//;
        my $weight = $conf::Weight{'html'}->{'title'};
        $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
    } else {
        $title = $conf::NO_TITLE;
    }

    return $title;
}

# Get foo from <IMG ... ALT="foo">
# It's not to handle HTML strictly.
sub get_img_alt ($) {
    my ($contref) = @_;

    $$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
}

# Get foo from <XXX ... TITLE="foo">
sub get_title_attr ($) {
    my ($contref) = @_;

    $$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
}

# Normalize elements like: <A HREF...> -> <A>
sub normalize_hdml_element ($) {
    my ($contref) = @_;

    $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
}


# Weight a score of a keyword in a given text using %conf::Weight hash.
# This process make the text be surround by temporary tags 
# \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
# Sort keys of %conf::Weight for processing <a> first.
# Because <a> has a tendency to be inside of other tags.
# Thus, it does'not processing for nexted tags strictly.
# Moreover, it does special processing for <h[1-6]> for summarization.
sub weight_element ($$$ ) {
    my ($contref, $weighted_str, $headings) = @_;

    for my $element (sort keys(%{$conf::Weight{'html'}})) {
        my $tmp = "";
        $$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
        $$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} 
            && $tmp;
        my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ? 
            $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
        $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
    }
}

sub weight_element_sub ($$$) {
    my ($element, $text, $tmp) = @_;

    my $space = element_space($element);
    $text =~ s/<[^>]*>//g;
    $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
    $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'}  ? " " : "$space$text$space";
}


# determine whether a given element should be delete or be substituted with space
sub element_space ($) {
    $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
}

# remove all HTML elements. it's not perfect but almost works.
sub remove_hdml_elements ($) {
    my ($contref) = @_;

    # remove all comments
    $$contref =~ s/<!?--.*?-->//gs;

    # remove all elements
    $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe;

}

# Decode a numberd entity. Exclude an invalid number.
sub decode_numbered_entity ($) {
    my ($num) = @_;
    return ""
        if $num >= 0 && $num <= 8 ||  $num >= 11 && $num <= 31 || $num >=127;
    sprintf ("%c",$num);
}


# Decode an entity. Ignore characters of right half of ISO-8859-1.
# Because it can't be handled in EUC encoding.
# This function provides sequential entities like: &quot &lt &gt;
sub decode_entity ($) {
    my ($text) = @_;

    return unless defined($$text);

    $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
    $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
    $$text =~ s/&quot[;\s]/\"/g; #"
    $$text =~ s/&amp[;\s]/&/g;
    $$text =~ s/&lt[;\s]/</g;
    $$text =~ s/&gt[;\s]/>/g;
    $$text =~ s/&nbsp[;\s]/ /g;
}


# encode entities: only '<', '>', and '&'
sub encode_entity ($) {
    my ($tmp) = @_;

    $$tmp =~ s/&/&amp;/g;    # &amp; should be processed first
    $$tmp =~ s/</&lt;/g;
    $$tmp =~ s/>/&gt;/g;
    $$tmp;
}

1;
1	#
2	# -- Perl --
3	# $Id: hdml.pl,v 1.6 2001/02/07 10:23:10 knok Exp $
4	# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5	# Copyright (C) 2000 Namazu Project All rights reserved.
6	# This is free software with ABSOLUTELY NO WARRANTY.
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either versions 2, or (at your option)
11	# any later version.
12	#
13	# This program is distributed in the hope that it will be useful
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21	# 02111-1307, USA
22	#
23	# This file must be encoded in EUC-JP encoding
24	#
25
26	package hdml;
27	use strict;
28	require 'gfilter.pl';
29
30	sub mediatype() {
31	return ('text/x-hdml');
32	}
33
34	sub status() {
35	return 'yes';
36	}
37
38	sub recursive() {
39	return 0;
40	}
41
42	sub pre_codeconv() {
43	return 1;
44	}
45
46	sub post_codeconv () {
47	return 0;
48	}
49
50	sub add_magic ($) {
51	my $magic = shift @_;
52	$magic->addSpecials('text/x-hdml', '<[Hh][Dd][Mm][Ll][^>]*>');
53	return;
54	}
55
56	sub filter ($$$$$) {
57	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
58	= @_;
59	my $cfile = defined $orig_cfile ? $$orig_cfile : '';
60
61	util::vprint("Processing hdml file ...\n");
62
63	hdml($cont, $weighted_str, $fields, $headings);
64
65	gfilter::line_adjust_filter($cont);
66	gfilter::line_adjust_filter($weighted_str);
67	gfilter::white_space_adjust_filter($cont);
68	gfilter::show_filter_debug_info($cont, $weighted_str,
69	$fields, $headings);
70	return undef;
71	}
72
73	sub hdml ($$$$) {
74	my ($contref, $weighted_str, $fields, $headings) = @_;
75
76	hdml::escape_lt_gt($contref);
77	$fields->{'title'} = hdml::get_title($contref, $weighted_str);
78	hdml::get_img_alt($contref);
79	hdml::get_title_attr($contref);
80	hdml::normalize_hdml_element($contref);
81	hdml::weight_element($contref, $weighted_str, $headings);
82	hdml::remove_hdml_elements($contref);
83	# restore entities of each content.
84	hdml::decode_entity($contref);
85	hdml::decode_entity($weighted_str);
86	hdml::decode_entity($headings);
87	for my $key (keys %{$fields}) {
88	hdml::decode_entity(\$fields->{$key});
89	}
90	}
91
92	# Convert independent < > s into entity references for escaping.
93	# Substitute twice for safe.
94	sub escape_lt_gt ($) {
95	my ($contref) = @_;
96
97	$$contref =~ s/\s<\s/ < /g;
98	$$contref =~ s/\s>\s/ > /g;
99	$$contref =~ s/\s<\s/ < /g;
100	$$contref =~ s/\s>\s/ > /g;
101	}
102
103	sub get_title ($$) {
104	my ($contref, $weighted_str) = @_;
105	my $title = '';
106
107	if ($$contref =~ s!<[A-Z]+[^>]\s+TITLE\s=\s[\"\']?([^\"\'>])[\"\']?[^>]*>! $1 !i) {
108	$title = $1;
109	$title =~ s/\s+/ /g;
110	$title =~ s/^\s+//;
111	$title =~ s/\s+$//;
112	my $weight = $conf::Weight{'html'}->{'title'};
113	$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
114	} else {
115	$title = $conf::NO_TITLE;
116	}
117
118	return $title;
119	}
120
121	# Get foo from <IMG ... ALT="foo">
122	# It's not to handle HTML strictly.
123	sub get_img_alt ($) {
124	my ($contref) = @_;
125
126	$$contref =~ s/<IMG[^>]\s+ALT\s=\s[\"\']?([^\"\'])[\"\']?[^>]*>/ $1 /gi; #"
127	}
128
129	# Get foo from <XXX ... TITLE="foo">
130	sub get_title_attr ($) {
131	my ($contref) = @_;
132
133	$$contref =~ s/<[A-Z]+[^>]\s+TITLE\s=\s[\"\']?([^\"\'])[\"\']?[^>]*>/ $1 /gi; #"
134	}
135
136	# Normalize elements like: <A HREF...> -> <A>
137	sub normalize_hdml_element ($) {
138	my ($contref) = @_;
139
140	$$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
141	}
142
143
144	# Weight a score of a keyword in a given text using %conf::Weight hash.
145	# This process make the text be surround by temporary tags
146	# \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
147	# Sort keys of %conf::Weight for processing <a> first.
148	# Because <a> has a tendency to be inside of other tags.
149	# Thus, it does'not processing for nexted tags strictly.
150	# Moreover, it does special processing for <h[1-6]> for summarization.
151	sub weight_element ($$$ ) {
152	my ($contref, $weighted_str, $headings) = @_;
153
154	for my $element (sort keys(%{$conf::Weight{'html'}})) {
155	my $tmp = "";
156	$$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
157	$$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'}
158	&& $tmp;
159	my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ?
160	$conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
161	$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
162	}
163	}
164
165	sub weight_element_sub ($$$) {
166	my ($element, $text, $tmp) = @_;
167
168	my $space = element_space($element);
169	$text =~ s/<[^>]*>//g;
170	$$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
171	$element =~ /^H[1-6]$/i && ! $var::Opt{'NoHeadAbst'} ? " " : "$space$text$space";
172	}
173
174
175	# determine whether a given element should be delete or be substituted with space
176	sub element_space ($) {
177	$_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
178	}
179
180	# remove all HTML elements. it's not perfect but almost works.
181	sub remove_hdml_elements ($) {
182	my ($contref) = @_;
183
184	# remove all comments
185	$$contref =~ s/<!?--.*?-->//gs;
186
187	# remove all elements
188	$$contref =~ s!</?([A-Z]\w)(?:\s+[A-Z]\w(?:\s=\s(?:(["']).?\2\|[\w\-.]+))?)\s*>!element_space($1)!gsixe;
189
190	}
191
192	# Decode a numberd entity. Exclude an invalid number.
193	sub decode_numbered_entity ($) {
194	my ($num) = @_;
195	return ""
196	if $num >= 0 && $num <= 8 \|\| $num >= 11 && $num <= 31 \|\| $num >=127;
197	sprintf ("%c",$num);
198	}
199
200
201	# Decode an entity. Ignore characters of right half of ISO-8859-1.
202	# Because it can't be handled in EUC encoding.
203	# This function provides sequential entities like: &quot &lt >
204	sub decode_entity ($) {
205	my ($text) = @_;
206
207	return unless defined($$text);
208
209	$$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
210	$$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
211	$$text =~ s/&quot[;\s]/\"/g; #"
212	$$text =~ s/&amp[;\s]/&/g;
213	$$text =~ s/&lt[;\s]/</g;
214	$$text =~ s/&gt[;\s]/>/g;
215	$$text =~ s/&nbsp[;\s]/ /g;
216	}
217
218
219	# encode entities: only '<', '>', and '&'
220	sub encode_entity ($) {
221	my ($tmp) = @_;
222
223	$$tmp =~ s/&/&/g; # & should be processed first
224	$$tmp =~ s/</</g;
225	$$tmp =~ s/>/>/g;
226	$$tmp;
227	}
228
229	1;