namazu/filter/html.pl

#
# -*- Perl -*-
# $Id: html.pl,v 1.3 2001/11/04 05:07:28 wakaba Exp $
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000 Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package html;
use strict;
require 'gfilter.pl';
my %map_entity_jisx0213 = (
    #nbsp => "a9a2",
    iexcl => "a9a3",    curren => "a9a4",       brvbar => "a9a5",
    copy => "a9a6",     ordf => "a9a7", laquo => "a9a8",
    shy => "a9a9",      reg => "a9aa",  macr => "a9ab",
    sup2 => "a9ac",     sup3 => "a9ad", middot => "a9ae",
    cedil => "a9af",    sup1 => "a9b0", ordm => "a9b1",
    raquo => "a9b2",    frac14 => "a9b3",       frac12 => "a9b4",
    frac34 => "a9b5",   iquest => "a9b6",       Agrave => "a9b7",
    Aacute => "a9b8",   Acirc => "a9b9",        Atilde => "a9ba",
    Auml => "a9bb",     Aring => "a9bc",        AElig => "a9bd",
    Ccedil => "a9be",   Egrave => "a9bf",       Eacute => "a9c0",
    Ecirc => "a9c1",    Euml => "a9c2", Igrave => "a9c3",
    Iacute => "a9c4",   Icirc => "a9c5",        Iuml => "a9c6",
    ETH => "a9c7",      Ntilde => "a9c8",       Ograve => "a9c9",
    Oacute => "a9ca",   Ocirc => "a9cb",        Otilde => "a9cc",
    Ouml => "a9cd",     Oslash => "a9ce",       Ugrave => "a9cf",
    Uacute => "a9d0",   Ucirc => "a9d1",        Uuml => "a9d2",
    Yacute => "a9d3",   THORN => "a9d4",        szlig => "a9d5",
    agrave => "a9d6",   aacute => "a9d7",       acirc => "a9d8",
    atilde => "a9d9",   auml => "a9da", aring => "a9db",
    aelig => "a9dc",    ccedil => "a9dd",       egrave => "a9de",
    eacute => "a9df",   ecirc => "a9e0",        euml => "a9e1",
    igrave => "a9e2",   iacute => "a9e3",       icirc => "a9e4",
    iuml => "a9e5",     eth => "a9e6",  ntilde => "a9e7",
    ograve => "a9e8",   oacute => "a9e9",       ocirc => "a9ea",
    otilde => "a9eb",   ouml => "a9ec", oslash => "a9ed",
    ugrave => "a9ee",   uacute => "a9ef",       ucirc => "a9f0",
    uuml => "a9f1",     yacute => "a9f2",       thorn => "a9f3",
    yuml => "a9f4",     OElig => "abab",        oelig => "abaa",
    Scaron => "aaa6",   scaron => "aab2",       ndash => "a3fc",
    euro => "a9a1",     sigmaf => "a6d9",       bull => "a3c0",
    alefsym => "a3dc",  harr => "a2f1", empty => "a2c7",
    notin => "a2c6",    cong => "a2ed", asymp => "a2ee",
    nsub => "a2c2",     oplus => "a2d1",        otimes => "a2d3",
    spades => "a6ba",   clubs => "a6c0",        hearts => "a6be",
    diams => "a6bc",
);

sub mediatype() {
    return ('text/html');
}

sub status() {
    return 'yes';
}

sub recursive() {
    return 0;
}

sub pre_codeconv() {
    return 1;
}

sub post_codeconv () {
    return 0;
}

sub add_magic ($) {
    my $magic = shift;
    $magic->addSpecials("text/html", "<!DOCTYPE HTML PUBLIC");
    $magic->addSpecials("text/html", "<!DOCTYPE html PUBLIC");
    $magic->addSpecials("text/html", "-//W3C//DTD XHTML");
    $magic->addSpecials("text/html", "-//W3C//DTD HTML");
    $magic->addSpecials("text/html", 'xmlns="http://www.w3.org/1999/xhtml"');
    $magic->addSpecials("text/html", "<meta http-equiv");
    $magic->addSpecials("text/html", "</HTML>");
    $magic->addSpecials("text/html", "</Html>");
    $magic->addSpecials("text/html", "</html>");
    $magic->addFileExts('\\.html$' => 'text/html');
    $magic->addFileExts('\\.html\\..+$' => 'text/html');
    return;
}

sub filter ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
      = @_;
    my $cfile = defined $orig_cfile ? $$orig_cfile : '';

    util::vprint("Processing html file ...\n");

    if ($var::Opt{'robotexclude'}) {
        my $err = isexcluded($cont);
        return $err if $err;
    }

    html_filter($cont, $weighted_str, $fields, $headings);
    
    gfilter::line_adjust_filter($cont);
    gfilter::line_adjust_filter($weighted_str);
    gfilter::white_space_adjust_filter($cont);
    gfilter::show_filter_debug_info($cont, $weighted_str,
                           $fields, $headings);
    return undef;
}

# Check wheter or not the given URI is excluded.
sub isexcluded ($) {
    my ($contref) = @_;
    my $err = undef;

    if ($$contref =~ /META\s+NAME\s*=\s*([\'\"]?)ROBOTS\1\s+[^>]*
        CONTENT\s*=\s*([\'\"]?).*?(NOINDEX|NONE).*?\2[^>]*>/ix)  #"
    {
        $err = _("is excluded because of <meta name=\"robots\" ...>");
    }
    return $err;
}


sub html_filter ($$$$) {
    my ($contref, $weighted_str, $fields, $headings) = @_;

    html::escape_lt_gt($contref);
    $fields->{'title'} = html::get_title($contref, $weighted_str);
    html::get_author($contref, $fields);
    html::get_meta_tags($contref, $weighted_str, $fields);
    html::get_img_alt($contref);
    html::get_table_summary($contref);
    html::get_title_attr($contref);
    html::normalize_html_element($contref);
    html::erase_above_body($contref);
    html::weight_element($contref, $weighted_str, $headings);
    html::remove_html_elements($contref);
    # restore entities of each content.
    html::decode_entity($contref);
    html::decode_entity($weighted_str);
    html::decode_entity($headings);
    for my $key (keys %{$fields}) {
        html::decode_entity(\$fields->{$key});
    }
}

# Convert independent < > s into entity references for escaping.
# Substitute twice for safe.
sub escape_lt_gt ($) {
    my ($contref) = @_;

    $$contref =~ s/\s<\s/ &lt; /g;
    $$contref =~ s/\s>\s/ &gt; /g;
    $$contref =~ s/\s<\s/ &lt; /g;
    $$contref =~ s/\s>\s/ &gt; /g;
}

sub get_author ($$) {
    my ($contref, $fields) = @_;

    # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">

    if ($$contref =~ m!META\s+NAME\s*=\s*([\'\"]?)AUTHOR\1\s+[^>]*
        CONTENT\s*=\s*([\'\"]?)(.*?)\2[^>]*>!ix) { #"
        $fields->{'author'} = $3;
    } elsif ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\'])mailto:(.*?)\1\s*>!i) { #"
            $fields->{'author'} = $2;
    } elsif ($$contref =~ m!.*<ADDRESS[^>]*>([^<]*?)</ADDRESS>!i) {
        my $tmp = $1;
#       $tmp =~ s/\s//g;
        if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {
            $fields->{'author'} = $1;
        }
    }
}


# Get title from <title>..</title>
# It's okay to exits two or more <title>...</TITLE>. 
# First one will be retrieved.
sub get_title ($$) {
    my ($contref, $weighted_str) = @_;
    my $title = '';
    
    if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) {
        $title = $1;
        $title =~ s/\s+/ /g;
        $title =~ s/^\s+//;
        $title =~ s/\s+$//;
        my $weight = $conf::Weight{'html'}->{'title'};
        $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
    } else {
        $title = $conf::NO_TITLE;
    }

    return $title;
}

# get foo bar from <META NAME="keywords|description" CONTENT="foo bar"> 
sub get_meta_tags ($$$) {
    my ($contref, $weighted_str, $fields) = @_;
    
    # <meta name="keywords" content="foo bar baz">

    my $weight = $conf::Weight{'metakey'};
    $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" 
        if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?) #"
            keywords\1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"

    # <meta name="description" content="foo bar baz">
    $$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n" 
        if $$contref =~ /<meta\s+name\s*=\s*([\'\"]?)description #"
            \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/ix; #"

    if ($var::Opt{'meta'}) {
        my @keys = split '\|', $conf::META_TAGS;
        for my $key (@keys) {
            while ($$contref =~ /<meta\s+name\s*=\s*([\'\"]?)$key #"
               \1\s+[^>]*content\s*=\s*([\'\"]?)([^>]*?)\2[^>]*>/gix) 
            {
                $fields->{$key} .= $3 . " ";
            }
            util::dprint("meta: $key: $fields->{$key}\n") 
                if defined $fields->{$key};
        }
    }
}

# Get foo from <IMG ... ALT="foo">
# It's not to handle HTML strictly.
sub get_img_alt ($) {
    my ($contref) = @_;

    $$contref =~ s/<IMG[^>]*\s+ALT\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
}

# Get foo from <TABLE ... SUMMARY="foo">
sub get_table_summary ($) {
    my ($contref) = @_;

    $$contref =~ s/<TABLE[^>]*\s+SUMMARY\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
}

# Get foo from <XXX ... TITLE="foo">
sub get_title_attr ($) {
    my ($contref) = @_;

    $$contref =~ s/<[A-Z]+[^>]*\s+TITLE\s*=\s*[\"\']?([^\"\']*)[\"\']?[^>]*>/ $1 /gi; #"
}

# Normalize elements like: <A HREF...> -> <A>
sub normalize_html_element ($) {
    my ($contref) = @_;

    $$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
}

# Remove contents above <body>.
sub erase_above_body ($) {
    my ($contref) = @_;

    $$contref =~ s/^.*<body>//is;
}


# Weight a score of a keyword in a given text using %conf::Weight hash.
# This process make the text be surround by temporary tags 
# \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
# Sort keys of %conf::Weight for processing <a> first.
# Because <a> has a tendency to be inside of other tags.
# Thus, it does'not processing for nexted tags strictly.
# Moreover, it does special processing for <h[1-6]> for summarization.
sub weight_element ($$$ ) {
    my ($contref, $weighted_str, $headings) = @_;

    for my $element (sort keys(%{$conf::Weight{'html'}})) {
        my $tmp = "";
        $$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
        $$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} 
            && $tmp;
        my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? 
            $conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
        $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
    }
}

sub weight_element_sub ($$$) {
    my ($element, $text, $tmp) = @_;

    my $space = element_space($element);
    $text =~ s/<[^>]*>//g;
    $$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
    $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'}  ? " " : "$space$text$space";
}


# determine whether a given element should be delete or be substituted with space
sub element_space ($) {
    $_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
}

# remove all HTML elements. it's not perfect but almost works.
sub remove_html_elements ($) {
    my ($contref) = @_;

    # remove all comments
    $$contref =~ s/<!--.*?-->//gs;

    # remove all elements
    $$contref =~ s!</?([A-Z]\w*)(?:\s+[A-Z]\w*(?:\s*=\s*(?:(["']).*?\2|[\w\-.]+))?)*\s*>!element_space($1)!gsixe;

}

# Decode a numberd entity. Exclude an invalid number.
sub decode_numbered_entity ($) {
    my ($num) = @_;
    return ""
        if $num >= 0 && $num <= 8 ||  $num >= 11 && $num <= 31 || $num >=127;
    sprintf ("%c",$num);
}

sub decode_entity_jisx0213($) {
    my $name = shift;
    my $euc = $map_entity_jisx0213{$name};
    if ($euc) {
      return pack "H4", $euc;
    } else {
      return '&'.$name.';';
    }
}

# Decode an entity. Ignore characters of right half of ISO-8859-1.
# Because it can't be handled in EUC encoding.
# This function provides sequential entities like: &quot &lt &gt;
sub decode_entity ($) {
    my ($text) = @_;

    return unless defined($$text);

    $$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
    $$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
    $$text =~ s/&quot[;\s]/\"/g; #"
    $$text =~ s/&amp[;\s]/&/g;
    $$text =~ s/&lt[;\s]/</g;
    $$text =~ s/&gt[;\s]/>/g;
    $$text =~ s/&nbsp[;\s]/ /g;
    $$text =~ s/&([A-Za-z0-9]+)[;\s]/&decode_entity_jisx0213($1)/ge;
}


# encode entities: only '<', '>', and '&'
sub encode_entity ($) {
    my ($tmp) = @_;

    $$tmp =~ s/&/&amp;/g;    # &amp; should be processed first
    $$tmp =~ s/</&lt;/g;
    $$tmp =~ s/>/&gt;/g;
    $$tmp;
}

1;
1	#
2	# -- Perl --
3	# $Id: html.pl,v 1.3 2001/11/04 05:07:28 wakaba Exp $
4	# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
5	# Copyright (C) 2000 Namazu Project All rights reserved.
6	# This is free software with ABSOLUTELY NO WARRANTY.
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either versions 2, or (at your option)
11	# any later version.
12	#
13	# This program is distributed in the hope that it will be useful
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21	# 02111-1307, USA
22	#
23	# This file must be encoded in EUC-JP encoding
24	#
25
26	package html;
27	use strict;
28	require 'gfilter.pl';
29	my %map_entity_jisx0213 = (
30	#nbsp => "a9a2",
31	iexcl => "a9a3", curren => "a9a4", brvbar => "a9a5",
32	copy => "a9a6", ordf => "a9a7", laquo => "a9a8",
33	shy => "a9a9", reg => "a9aa", macr => "a9ab",
34	sup2 => "a9ac", sup3 => "a9ad", middot => "a9ae",
35	cedil => "a9af", sup1 => "a9b0", ordm => "a9b1",
36	raquo => "a9b2", frac14 => "a9b3", frac12 => "a9b4",
37	frac34 => "a9b5", iquest => "a9b6", Agrave => "a9b7",
38	Aacute => "a9b8", Acirc => "a9b9", Atilde => "a9ba",
39	Auml => "a9bb", Aring => "a9bc", AElig => "a9bd",
40	Ccedil => "a9be", Egrave => "a9bf", Eacute => "a9c0",
41	Ecirc => "a9c1", Euml => "a9c2", Igrave => "a9c3",
42	Iacute => "a9c4", Icirc => "a9c5", Iuml => "a9c6",
43	ETH => "a9c7", Ntilde => "a9c8", Ograve => "a9c9",
44	Oacute => "a9ca", Ocirc => "a9cb", Otilde => "a9cc",
45	Ouml => "a9cd", Oslash => "a9ce", Ugrave => "a9cf",
46	Uacute => "a9d0", Ucirc => "a9d1", Uuml => "a9d2",
47	Yacute => "a9d3", THORN => "a9d4", szlig => "a9d5",
48	agrave => "a9d6", aacute => "a9d7", acirc => "a9d8",
49	atilde => "a9d9", auml => "a9da", aring => "a9db",
50	aelig => "a9dc", ccedil => "a9dd", egrave => "a9de",
51	eacute => "a9df", ecirc => "a9e0", euml => "a9e1",
52	igrave => "a9e2", iacute => "a9e3", icirc => "a9e4",
53	iuml => "a9e5", eth => "a9e6", ntilde => "a9e7",
54	ograve => "a9e8", oacute => "a9e9", ocirc => "a9ea",
55	otilde => "a9eb", ouml => "a9ec", oslash => "a9ed",
56	ugrave => "a9ee", uacute => "a9ef", ucirc => "a9f0",
57	uuml => "a9f1", yacute => "a9f2", thorn => "a9f3",
58	yuml => "a9f4", OElig => "abab", oelig => "abaa",
59	Scaron => "aaa6", scaron => "aab2", ndash => "a3fc",
60	euro => "a9a1", sigmaf => "a6d9", bull => "a3c0",
61	alefsym => "a3dc", harr => "a2f1", empty => "a2c7",
62	notin => "a2c6", cong => "a2ed", asymp => "a2ee",
63	nsub => "a2c2", oplus => "a2d1", otimes => "a2d3",
64	spades => "a6ba", clubs => "a6c0", hearts => "a6be",
65	diams => "a6bc",
66	);
67
68	sub mediatype() {
69	return ('text/html');
70	}
71
72	sub status() {
73	return 'yes';
74	}
75
76	sub recursive() {
77	return 0;
78	}
79
80	sub pre_codeconv() {
81	return 1;
82	}
83
84	sub post_codeconv () {
85	return 0;
86	}
87
88	sub add_magic ($) {
89	my $magic = shift;
90	$magic->addSpecials("text/html", "<!DOCTYPE HTML PUBLIC");
91	$magic->addSpecials("text/html", "<!DOCTYPE html PUBLIC");
92	$magic->addSpecials("text/html", "-//W3C//DTD XHTML");
93	$magic->addSpecials("text/html", "-//W3C//DTD HTML");
94	$magic->addSpecials("text/html", 'xmlns="http://www.w3.org/1999/xhtml"');
95	$magic->addSpecials("text/html", "<meta http-equiv");
96	$magic->addSpecials("text/html", "</HTML>");
97	$magic->addSpecials("text/html", "</Html>");
98	$magic->addSpecials("text/html", "</html>");
99	$magic->addFileExts('\\.html$' => 'text/html');
100	$magic->addFileExts('\\.html\\..+$' => 'text/html');
101	return;
102	}
103
104	sub filter ($$$$$) {
105	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
106	= @_;
107	my $cfile = defined $orig_cfile ? $$orig_cfile : '';
108
109	util::vprint("Processing html file ...\n");
110
111	if ($var::Opt{'robotexclude'}) {
112	my $err = isexcluded($cont);
113	return $err if $err;
114	}
115
116	html_filter($cont, $weighted_str, $fields, $headings);
117
118	gfilter::line_adjust_filter($cont);
119	gfilter::line_adjust_filter($weighted_str);
120	gfilter::white_space_adjust_filter($cont);
121	gfilter::show_filter_debug_info($cont, $weighted_str,
122	$fields, $headings);
123	return undef;
124	}
125
126	# Check wheter or not the given URI is excluded.
127	sub isexcluded ($) {
128	my ($contref) = @_;
129	my $err = undef;
130
131	if ($$contref =~ /META\s+NAME\s=\s([\'\"]?)ROBOTS\1\s+[^>]*
132	CONTENT\s=\s([\'\"]?).?(NOINDEX\|NONE).?\2[^>]*>/ix) #"
133	{
134	$err = _("is excluded because of <meta name=\"robots\" ...>");
135	}
136	return $err;
137	}
138
139
140	sub html_filter ($$$$) {
141	my ($contref, $weighted_str, $fields, $headings) = @_;
142
143	html::escape_lt_gt($contref);
144	$fields->{'title'} = html::get_title($contref, $weighted_str);
145	html::get_author($contref, $fields);
146	html::get_meta_tags($contref, $weighted_str, $fields);
147	html::get_img_alt($contref);
148	html::get_table_summary($contref);
149	html::get_title_attr($contref);
150	html::normalize_html_element($contref);
151	html::erase_above_body($contref);
152	html::weight_element($contref, $weighted_str, $headings);
153	html::remove_html_elements($contref);
154	# restore entities of each content.
155	html::decode_entity($contref);
156	html::decode_entity($weighted_str);
157	html::decode_entity($headings);
158	for my $key (keys %{$fields}) {
159	html::decode_entity(\$fields->{$key});
160	}
161	}
162
163	# Convert independent < > s into entity references for escaping.
164	# Substitute twice for safe.
165	sub escape_lt_gt ($) {
166	my ($contref) = @_;
167
168	$$contref =~ s/\s<\s/ < /g;
169	$$contref =~ s/\s>\s/ > /g;
170	$$contref =~ s/\s<\s/ < /g;
171	$$contref =~ s/\s>\s/ > /g;
172	}
173
174	sub get_author ($$) {
175	my ($contref, $fields) = @_;
176
177	# <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">
178
179	if ($$contref =~ m!META\s+NAME\s=\s([\'\"]?)AUTHOR\1\s+[^>]*
180	CONTENT\s=\s([\'\"]?)(.?)\2[^>]>!ix) { #"
181	$fields->{'author'} = $3;
182	} elsif ($$contref =~ m!<LINK\s[^>]?HREF=([\"\'])mailto:(.?)\1\s*>!i) { #"
183	$fields->{'author'} = $2;
184	} elsif ($$contref =~ m!.<ADDRESS[^>]>([^<]*?)</ADDRESS>!i) {
185	my $tmp = $1;
186	# $tmp =~ s/\s//g;
187	if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {
188	$fields->{'author'} = $1;
189	}
190	}
191	}
192
193
194	# Get title from <title>..</title>
195	# It's okay to exits two or more <title>...</TITLE>.
196	# First one will be retrieved.
197	sub get_title ($$) {
198	my ($contref, $weighted_str) = @_;
199	my $title = '';
200
201	if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) {
202	$title = $1;
203	$title =~ s/\s+/ /g;
204	$title =~ s/^\s+//;
205	$title =~ s/\s+$//;
206	my $weight = $conf::Weight{'html'}->{'title'};
207	$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
208	} else {
209	$title = $conf::NO_TITLE;
210	}
211
212	return $title;
213	}
214
215	# get foo bar from <META NAME="keywords\|description" CONTENT="foo bar">
216	sub get_meta_tags ($$$) {
217	my ($contref, $weighted_str, $fields) = @_;
218
219	# <meta name="keywords" content="foo bar baz">
220
221	my $weight = $conf::Weight{'metakey'};
222	$$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
223	if $$contref =~ /<meta\s+name\s=\s([\'\"]?) #"
224	keywords\1\s+[^>]content\s=\s([\'\"]?)([^>]?)\2[^>]*>/ix; #"
225
226	# <meta name="description" content="foo bar baz">
227	$$weighted_str .= "\x7f$weight\x7f$3\x7f/$weight\x7f\n"
228	if $$contref =~ /<meta\s+name\s=\s([\'\"]?)description #"
229	\1\s+[^>]content\s=\s([\'\"]?)([^>]?)\2[^>]*>/ix; #"
230
231	if ($var::Opt{'meta'}) {
232	my @keys = split '\\|', $conf::META_TAGS;
233	for my $key (@keys) {
234	while ($$contref =~ /<meta\s+name\s=\s([\'\"]?)$key #"
235	\1\s+[^>]content\s=\s([\'\"]?)([^>]?)\2[^>]*>/gix)
236	{
237	$fields->{$key} .= $3 . " ";
238	}
239	util::dprint("meta: $key: $fields->{$key}\n")
240	if defined $fields->{$key};
241	}
242	}
243	}
244
245	# Get foo from <IMG ... ALT="foo">
246	# It's not to handle HTML strictly.
247	sub get_img_alt ($) {
248	my ($contref) = @_;
249
250	$$contref =~ s/<IMG[^>]\s+ALT\s=\s[\"\']?([^\"\'])[\"\']?[^>]*>/ $1 /gi; #"
251	}
252
253	# Get foo from <TABLE ... SUMMARY="foo">
254	sub get_table_summary ($) {
255	my ($contref) = @_;
256
257	$$contref =~ s/<TABLE[^>]\s+SUMMARY\s=\s[\"\']?([^\"\'])[\"\']?[^>]*>/ $1 /gi; #"
258	}
259
260	# Get foo from <XXX ... TITLE="foo">
261	sub get_title_attr ($) {
262	my ($contref) = @_;
263
264	$$contref =~ s/<[A-Z]+[^>]\s+TITLE\s=\s[\"\']?([^\"\'])[\"\']?[^>]*>/ $1 /gi; #"
265	}
266
267	# Normalize elements like: <A HREF...> -> <A>
268	sub normalize_html_element ($) {
269	my ($contref) = @_;
270
271	$$contref =~ s/<([!\w]+)\s+[^>]*>/<$1>/g;
272	}
273
274	# Remove contents above <body>.
275	sub erase_above_body ($) {
276	my ($contref) = @_;
277
278	$$contref =~ s/^.*<body>//is;
279	}
280
281
282	# Weight a score of a keyword in a given text using %conf::Weight hash.
283	# This process make the text be surround by temporary tags
284	# \x7fXX\x7f and \x7f/XX\x7f. XX represents score.
285	# Sort keys of %conf::Weight for processing <a> first.
286	# Because <a> has a tendency to be inside of other tags.
287	# Thus, it does'not processing for nexted tags strictly.
288	# Moreover, it does special processing for <h[1-6]> for summarization.
289	sub weight_element ($$$ ) {
290	my ($contref, $weighted_str, $headings) = @_;
291
292	for my $element (sort keys(%{$conf::Weight{'html'}})) {
293	my $tmp = "";
294	$$contref =~ s!<($element)>(.*?)</$element>!weight_element_sub($1, $2, \$tmp)!gies;
295	$$headings .= $tmp if $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'}
296	&& $tmp;
297	my $weight = $element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ?
298	$conf::Weight{'html'}->{$element} : $conf::Weight{'html'}->{$element} - 1;
299	$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n" if $tmp;
300	}
301	}
302
303	sub weight_element_sub ($$$) {
304	my ($element, $text, $tmp) = @_;
305
306	my $space = element_space($element);
307	$text =~ s/<[^>]*>//g;
308	$$tmp .= "$text " if (length($text)) < $conf::INVALID_LENG;
309	$element =~ /^H[1-6]$/i && ! $var::Opt{'noheadabst'} ? " " : "$space$text$space";
310	}
311
312
313	# determine whether a given element should be delete or be substituted with space
314	sub element_space ($) {
315	$_[0] =~ /^($conf::NON_SEPARATION_ELEMENTS)$/io ? "" : " ";
316	}
317
318	# remove all HTML elements. it's not perfect but almost works.
319	sub remove_html_elements ($) {
320	my ($contref) = @_;
321
322	# remove all comments
323	$$contref =~ s/<!--.*?-->//gs;
324
325	# remove all elements
326	$$contref =~ s!</?([A-Z]\w)(?:\s+[A-Z]\w(?:\s=\s(?:(["']).?\2\|[\w\-.]+))?)\s*>!element_space($1)!gsixe;
327
328	}
329
330	# Decode a numberd entity. Exclude an invalid number.
331	sub decode_numbered_entity ($) {
332	my ($num) = @_;
333	return ""
334	if $num >= 0 && $num <= 8 \|\| $num >= 11 && $num <= 31 \|\| $num >=127;
335	sprintf ("%c",$num);
336	}
337
338	sub decode_entity_jisx0213($) {
339	my $name = shift;
340	my $euc = $map_entity_jisx0213{$name};
341	if ($euc) {
342	return pack "H4", $euc;
343	} else {
344	return '&'.$name.';';
345	}
346	}
347
348	# Decode an entity. Ignore characters of right half of ISO-8859-1.
349	# Because it can't be handled in EUC encoding.
350	# This function provides sequential entities like: &quot &lt >
351	sub decode_entity ($) {
352	my ($text) = @_;
353
354	return unless defined($$text);
355
356	$$text =~ s/&#(\d{2,3})[;\s]/decode_numbered_entity($1)/ge;
357	$$text =~ s/&#x([\da-f]+)[;\s]/decode_numbered_entity(hex($1))/gei;
358	$$text =~ s/&quot[;\s]/\"/g; #"
359	$$text =~ s/&amp[;\s]/&/g;
360	$$text =~ s/&lt[;\s]/</g;
361	$$text =~ s/&gt[;\s]/>/g;
362	$$text =~ s/&nbsp[;\s]/ /g;
363	$$text =~ s/&([A-Za-z0-9]+)[;\s]/&decode_entity_jisx0213($1)/ge;
364	}
365
366
367	# encode entities: only '<', '>', and '&'
368	sub encode_entity ($) {
369	my ($tmp) = @_;
370
371	$$tmp =~ s/&/&/g; # & should be processed first
372	$$tmp =~ s/</</g;
373	$$tmp =~ s/>/>/g;
374	$$tmp;
375	}
376
377	1;