namazu/filter/gfilter.pl

#
# -*- Perl -*-
# $Id: gfilter.pl,v 1.13.8.1 2001/07/09 09:55:39 takesako Exp $
# Copyright (C) 1999 Satoru Takabayashi ,
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package gfilter;

# Show debug information for filters
sub show_filter_debug_info ($$$$) {
    my ($contref, $weighted_str, $fields, $headings) = @_;
    util::dprint("-- title --\n$fields->{'title'}\n") 
        if defined $fields->{'title'};
    util::dprint("-- content --\n$$contref\n");
    util::dprint("-- weighted_str: --\n$$weighted_str\n");
    util::dprint("-- headings --\n$$headings\n");
}

# Adjust white spaces
sub white_space_adjust_filter ($) {
    my ($text) = @_;

    $$text =~ s/[ \t]+/ /g;
    $$text =~ s/\r\n/\n/g;
    $$text =~ s/\r/\n/g;
    $$text =~ s/\n+/\n/g;
    $$text =~ s/^ +//gm;
    $$text =~ s/ +$//gm;
    $$text =~ s/ +/ /g;
    # Control characters be into space
    $$text =~ tr/\x00-\x09\x0b-\x1f/  /;
}

# get a title from a file name.
sub filename_to_title ($$) {
    my ($cfile, $weighted_str) = @_;

    # for MSWin32's filename using Shift_JIS [1998-09-24]
    if (($mknmz::SYSTEM eq "MSWin32") || ($mknmz::SYSTEM eq "os2")) {
        $cfile = codeconv::shiftjis_to_eucjp($cfile);
        $cfile = codeconv::eucjp_han2zen_kana($cfile);
    }
    
    $cfile =~ m!^.*/([^/]*)$!;
    my $filename = $1;

    # get keywords from a file name.
    # modified [1998-09-18] 
    my $tmp = $filename;
    $tmp =~ s|/\\_\.-| |g;

    my $weight = $conf::Weight{'html'}->{'title'};
    $$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n";

    my $title = $filename;
    return $title
}

# Remove SPACE/TAB at the beginning or ending of the line.
# And remove '>|#:' at the begenning of the line.
# Join hyphenation for English text.
# Remove LF if the line is ended with a Japanese character and 
# length of the line is 40 or more longer.
#
# Original of this code was contributed by <furukawa@tcp-ip.or.jp>. 
# [1997-09-15]
#
sub line_adjust_filter ($) {
    my ($text) = @_;
    return undef unless defined($$text);

    my @tmp = split(/\n/, $$text);
    for my $line (@tmp) {
        $line .= "\n";
        $line =~ s/^[ \>\|\#\:]+//;
        $line =~ s/ +$//;
        $line =~ s/\n// if (($line =~ /[\xa1-\xfe]\n*$/) &&
                            (length($line) >=40));
        $line =~ s/(��|��)$/$1\n/;
        $line =~ s/([a-z])-\n/$1/;  # for hyphenation.
    }
    $$text = join('', @tmp);
}

# not implimented yet.
sub analize_rcs_stamp()
{
}

1;

1	#
2	# -- Perl --
3	# $Id: gfilter.pl,v 1.13.8.1 2001/07/09 09:55:39 takesako Exp $
4	# Copyright (C) 1999 Satoru Takabayashi ,
5	# This is free software with ABSOLUTELY NO WARRANTY.
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either versions 2, or (at your option)
10	# any later version.
11	#
12	# This program is distributed in the hope that it will be useful
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20	# 02111-1307, USA
21	#
22	# This file must be encoded in EUC-JP encoding
23	#
24
25	package gfilter;
26
27	# Show debug information for filters
28	sub show_filter_debug_info ($$$$) {
29	my ($contref, $weighted_str, $fields, $headings) = @_;
30	util::dprint("-- title --\n$fields->{'title'}\n")
31	if defined $fields->{'title'};
32	util::dprint("-- content --\n$$contref\n");
33	util::dprint("-- weighted_str: --\n$$weighted_str\n");
34	util::dprint("-- headings --\n$$headings\n");
35	}
36
37	# Adjust white spaces
38	sub white_space_adjust_filter ($) {
39	my ($text) = @_;
40
41	$$text =~ s/[ \t]+/ /g;
42	$$text =~ s/\r\n/\n/g;
43	$$text =~ s/\r/\n/g;
44	$$text =~ s/\n+/\n/g;
45	$$text =~ s/^ +//gm;
46	$$text =~ s/ +$//gm;
47	$$text =~ s/ +/ /g;
48	# Control characters be into space
49	$$text =~ tr/\x00-\x09\x0b-\x1f/ /;
50	}
51
52	# get a title from a file name.
53	sub filename_to_title ($$) {
54	my ($cfile, $weighted_str) = @_;
55
56	# for MSWin32's filename using Shift_JIS [1998-09-24]
57	if (($mknmz::SYSTEM eq "MSWin32") \|\| ($mknmz::SYSTEM eq "os2")) {
58	$cfile = codeconv::shiftjis_to_eucjp($cfile);
59	$cfile = codeconv::eucjp_han2zen_kana($cfile);
60	}
61
62	$cfile =~ m!^./([^/])$!;
63	my $filename = $1;
64
65	# get keywords from a file name.
66	# modified [1998-09-18]
67	my $tmp = $filename;
68	$tmp =~ s\|/\\_\.-\| \|g;
69
70	my $weight = $conf::Weight{'html'}->{'title'};
71	$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n";
72
73	my $title = $filename;
74	return $title
75	}
76
77	# Remove SPACE/TAB at the beginning or ending of the line.
78	# And remove '>\|#:' at the begenning of the line.
79	# Join hyphenation for English text.
80	# Remove LF if the line is ended with a Japanese character and
81	# length of the line is 40 or more longer.
82	#
83	# Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
84	# [1997-09-15]
85	#
86	sub line_adjust_filter ($) {
87	my ($text) = @_;
88	return undef unless defined($$text);
89
90	my @tmp = split(/\n/, $$text);
91	for my $line (@tmp) {
92	$line .= "\n";
93	$line =~ s/^[ \>\\|\#\:]+//;
94	$line =~ s/ +$//;
95	$line =~ s/\n// if (($line =~ /[\xa1-\xfe]\n*$/) &&
96	(length($line) >=40));
97	$line =~ s/(��\|��)$/$1\n/;
98	$line =~ s/([a-z])-\n/$1/; # for hyphenation.
99	}
100	$$text = join('', @tmp);
101	}
102
103	# not implimented yet.
104	sub analize_rcs_stamp()
105	{
106	}
107
108	1;
109