1 |
# |
2 |
# -*- Perl -*- |
3 |
# $Id: gfilter.pl,v 1.13.8.1 2001/07/09 09:55:39 takesako Exp $ |
4 |
# Copyright (C) 1999 Satoru Takabayashi , |
5 |
# This is free software with ABSOLUTELY NO WARRANTY. |
6 |
# |
7 |
# This program is free software; you can redistribute it and/or modify |
8 |
# it under the terms of the GNU General Public License as published by |
9 |
# the Free Software Foundation; either versions 2, or (at your option) |
10 |
# any later version. |
11 |
# |
12 |
# This program is distributed in the hope that it will be useful |
13 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
# GNU General Public License for more details. |
16 |
# |
17 |
# You should have received a copy of the GNU General Public License |
18 |
# along with this program; if not, write to the Free Software |
19 |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
20 |
# 02111-1307, USA |
21 |
# |
22 |
# This file must be encoded in EUC-JP encoding |
23 |
# |
24 |
|
25 |
package gfilter; |
26 |
|
27 |
# Show debug information for filters |
28 |
sub show_filter_debug_info ($$$$) { |
29 |
my ($contref, $weighted_str, $fields, $headings) = @_; |
30 |
util::dprint("-- title --\n$fields->{'title'}\n") |
31 |
if defined $fields->{'title'}; |
32 |
util::dprint("-- content --\n$$contref\n"); |
33 |
util::dprint("-- weighted_str: --\n$$weighted_str\n"); |
34 |
util::dprint("-- headings --\n$$headings\n"); |
35 |
} |
36 |
|
37 |
# Adjust white spaces |
38 |
sub white_space_adjust_filter ($) { |
39 |
my ($text) = @_; |
40 |
|
41 |
$$text =~ s/[ \t]+/ /g; |
42 |
$$text =~ s/\r\n/\n/g; |
43 |
$$text =~ s/\r/\n/g; |
44 |
$$text =~ s/\n+/\n/g; |
45 |
$$text =~ s/^ +//gm; |
46 |
$$text =~ s/ +$//gm; |
47 |
$$text =~ s/ +/ /g; |
48 |
# Control characters be into space |
49 |
$$text =~ tr/\x00-\x09\x0b-\x1f/ /; |
50 |
} |
51 |
|
52 |
# get a title from a file name. |
53 |
sub filename_to_title ($$) { |
54 |
my ($cfile, $weighted_str) = @_; |
55 |
|
56 |
# for MSWin32's filename using Shift_JIS [1998-09-24] |
57 |
if (($mknmz::SYSTEM eq "MSWin32") || ($mknmz::SYSTEM eq "os2")) { |
58 |
$cfile = codeconv::shiftjis_to_eucjp($cfile); |
59 |
$cfile = codeconv::eucjp_han2zen_kana($cfile); |
60 |
} |
61 |
|
62 |
$cfile =~ m!^.*/([^/]*)$!; |
63 |
my $filename = $1; |
64 |
|
65 |
# get keywords from a file name. |
66 |
# modified [1998-09-18] |
67 |
my $tmp = $filename; |
68 |
$tmp =~ s|/\\_\.-| |g; |
69 |
|
70 |
my $weight = $conf::Weight{'html'}->{'title'}; |
71 |
$$weighted_str .= "\x7f$weight\x7f$tmp\x7f/$weight\x7f\n"; |
72 |
|
73 |
my $title = $filename; |
74 |
return $title |
75 |
} |
76 |
|
77 |
# Remove SPACE/TAB at the beginning or ending of the line. |
78 |
# And remove '>|#:' at the begenning of the line. |
79 |
# Join hyphenation for English text. |
80 |
# Remove LF if the line is ended with a Japanese character and |
81 |
# length of the line is 40 or more longer. |
82 |
# |
83 |
# Original of this code was contributed by <furukawa@tcp-ip.or.jp>. |
84 |
# [1997-09-15] |
85 |
# |
86 |
sub line_adjust_filter ($) { |
87 |
my ($text) = @_; |
88 |
return undef unless defined($$text); |
89 |
|
90 |
my @tmp = split(/\n/, $$text); |
91 |
for my $line (@tmp) { |
92 |
$line .= "\n"; |
93 |
$line =~ s/^[ \>\|\#\:]+//; |
94 |
$line =~ s/ +$//; |
95 |
$line =~ s/\n// if (($line =~ /[\xa1-\xfe]\n*$/) && |
96 |
(length($line) >=40)); |
97 |
$line =~ s/(¡£|¡¢)$/$1\n/; |
98 |
$line =~ s/([a-z])-\n/$1/; # for hyphenation. |
99 |
} |
100 |
$$text = join('', @tmp); |
101 |
} |
102 |
|
103 |
# not implimented yet. |
104 |
sub analize_rcs_stamp() |
105 |
{ |
106 |
} |
107 |
|
108 |
1; |
109 |
|