namazu/filter/mailnews.pl

#
# -*- Perl -*-
# $Id: mailnews.pl,v 1.2 2001/11/04 01:17:42 wakaba Exp $
# Copyright (C) 1997-2000 Satoru Takabayashi ,
#               1999 NOKUBI Takatsugu All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package mailnews;
use strict;
require 'util.pl';
require 'gfilter.pl';

# Return the media types this filter is registered for: RFC 822 mail
# messages and netnews articles.
sub mediatype() {
    return (
        'message/rfc822',
        'message/news',
    );
}

# Return the availability flag of this filter; it is always the
# string 'yes' (presumably meaning the filter has no external
# requirements -- confirm against the filter loader).
sub status() {
    my $availability = 'yes';
    return $availability;
}

# Return the "recursive" flag of this filter, which is always 0
# (presumably: the output is not fed to another filter -- confirm
# against the filter loader).
sub recursive() {
    my $flag = 0;
    return $flag;
}

# Return the "pre_codeconv" flag of this filter, which is always 1
# (presumably: character code conversion runs before filter() is
# called -- confirm against the filter loader).
sub pre_codeconv() {
    my $flag = 1;
    return $flag;
}

# Return the "post_codeconv" flag of this filter, which is always 0
# (presumably: no character code conversion is needed after
# filter() -- confirm against the filter loader).
sub post_codeconv () {
    my $flag = 0;
    return $flag;
}

# Register the file name rule of this filter with the given magic
# object: names ending in ".822" are mapped to 'message/rfc822'.
#
#   $magic - object providing an addFileExts(pattern => mediatype) method
#
# Returns nothing.
sub add_magic ($) {
    my $magic = $_[0];

    my %ext_rule = ('\\.822$' => 'message/rfc822');
    $magic->addFileExts(%ext_rule);

    return;
}

# Entry point of the mail/news filter.
#
#   $orig_cfile   - reference to the original file name, or undef
#   $cont         - reference to the message text; rewritten in place
#   $weighted_str - reference to the weighted keyword string; appended to
#   $headings     - only passed on to the debug output helper
#   $fields       - hash reference that receives the header fields
#
# The message is run through the uuencode/BinHex remover, the header and
# MIME handler and the citation remover, then both text buffers are
# normalised with the generic gfilter helpers.
# Always returns undef.
sub filter ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_;

    # The file name is dereferenced as in the original code although
    # nothing below uses it.
    my $cfile = '';
    $cfile = $$orig_cfile if defined $orig_cfile;

    util::vprint("Processing mail/news file ...\n");

    # Mail/news specific processing.
    uuencode_filter($cont);
    mailnews_filter($cont, $weighted_str, $fields);
    mailnews_citation_filter($cont, $weighted_str);

    # Generic clean-up: first line adjustment on both buffers, then
    # white space adjustment on both buffers.
    for my $textref ($cont, $weighted_str) {
        gfilter::line_adjust_filter($textref);
    }
    for my $textref ($cont, $weighted_str) {
        gfilter::white_space_adjust_filter($textref);
    }

    gfilter::show_filter_debug_info($cont, $weighted_str, $fields, $headings);

    return undef;
}

# Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
#
# Strip the message header from $$contref and record what it contains.
#
#   $contref      - reference to the message text; on return it holds the
#                   body only (for multipart messages: the concatenated
#                   text/plain parts)
#   $weighted_str - reference to the weighted keyword string; the subject
#                   and the headers matching $conf::REMAIN_HEADER are
#                   appended, wrapped in \x7f<weight>\x7f markers
#   $fields       - hash reference; receives 'title' (the raw subject) and
#                   every other header under its lower-cased name
#
# Nothing is changed (apart from removing leading white space) when the
# text does not start with something that looks like a header line.
# Returns nothing.
sub mailnews_filter ($$$) {
    my ($contref, $weighted_str, $fields) = @_;

    my $boundary = "";

    $$contref =~ s/^\s+//;
    # Don't handle if first line doesn't seem like a mail/news header.
    return unless $$contref =~ /(^\S+:|^from )/i;

    my @tmp = split(/\n/, $$contref);

    # A message/partial body starts with the header of the enclosed
    # message, so the header parser is run again for it.  Every pass
    # consumes at least one line, so the loop terminates.
  HEADER_BLOCK:
    while (1) {
        my $partial = 0;

        while (@tmp) {
            my $line = shift @tmp;
            last if ($line =~ /^$/);  # if an empty line, header is over
            # Connect the two lines if next line has leading spaces
            while (defined($tmp[0]) && $tmp[0] =~ /^\s+/) {
                # if connection is Japanese character, remove spaces
                # from Furukawa-san's idea [1998-09-22]
                my $nextline = shift @tmp;
                $line =~ s/([\xa1-\xfe])\s+$/$1/;
                $nextline =~ s/^\s+([\xa1-\xfe])/$1/;
                $line .= $nextline;
            }

            # Handle fields.
            if ($line =~ s/^subject:\s*//i) {
                $fields->{'title'} = $line;
                # Skip [foobar-ML:000] for a typical mailing list subject.
                # Practically skip first [...] for simple implementation.
                $line =~ s/^\[.*?\]\s*//;

                # Skip 'Re:'
                $line =~ s/\bre:\s*//gi;

                my $weight = $conf::Weight{'html'}->{'title'};
                $$weighted_str .= "\x7f$weight\x7f$line\x7f/$weight\x7f\n";
            } elsif ($line =~ s/^content-type:\s*//i) {
                # Accept a quoted boundary (stopping at its closing quote,
                # so later parameters are not swallowed) as well as an
                # unquoted token.
                if ($line =~ /multipart.*boundary=(?:"([^"]*)"|([^\s;"]+))/i) {
                    $boundary = defined $1 ? $1 : $2;
                    util::dprint("((boundary: $boundary))\n");
                } elsif ($line =~ m!message/partial;\s*(.*)!i) {
                    # The Message/Partial subtype routine [1998-10-12]
                    # contributed by Hiroshi Kato <tumibito@mm.rd.nttdata.co.jp>
                    $partial = $1;
                    util::dprint("((partial: $partial))\n");
                }
            } elsif ($line =~ /^(\S+):\s*(.*)/i) {
                my $name = $1;
                my $value = $2;
                $fields->{lc($name)} = $value;
                if ($name =~ /^($conf::REMAIN_HEADER)$/io) {
                    # keep some fields specified REMAIN_HEADER for search keyword
                    my $weight = $conf::Weight{'headers'};
                    $$weighted_str .=
                        "\x7f$weight\x7f$value\x7f/$weight\x7f\n";
                }
            }
        }

        last HEADER_BLOCK unless $partial;

        # MHonARC makes several empty lines between header and body,
        # so remove them.  Only peek at the next line: the first
        # non-blank line is the first header of the enclosed message
        # and must stay in @tmp for the next pass.
        shift @tmp while @tmp && $tmp[0] =~ /^\s*$/;
    }
    $$contref = join("\n", @tmp);

    # Handle MIME multipart message.
    if ($boundary ne '') {
        $$contref =~ s/This is multipart message.\n//i;

        # MIME multipart processing,
        # modified by Furukawa-san's patch on [1998/08/27]
        # Split at every "--boundary" / "--boundary--" delimiter and keep
        # only the bodies of the parts declared as text/plain.
        my @parts = split(/--\Q$boundary\E(?:--)?\n?/, $$contref);
        my $body = '';
        for my $part (@parts) {
            if ($part =~ s/^(.*?\n\n)//s) {
                my $head = $1;
                $body .= $part if $head =~ m!^content-type:.*text/plain!mi;
            }
        }
        $$contref = $body;
    }

    return;
}

# Keep mail/news citation marks and cited text out of the indexed body,
# together with a greeting at the beginning of the message and
# meaningless attribution lines such as "foo wrote:".
# Thanks to Akira Yamada for great idea.
#
#   $contref      - reference to the message body; rewritten in place
#   $weighted_str - reference to the weighted keyword string; everything
#                   removed from the body is appended to it in one chunk
#                   wrapped in \x7f1\x7f ... \x7f/1\x7f markers
#
# NOTE(review): the file header says this file must be encoded in EUC-JP.
# The Japanese literals in the two large patterns below are multi-byte
# EUC-JP sequences; if they look garbled in your editor, do not re-save
# the file in another encoding -- TODO confirm the bytes are intact.
sub mailnews_citation_filter ($$) {
    my ($contref, $weighted_str) = @_;

    # Collects every piece of text that is taken out of the body.
    my $omake = "";
    $$contref =~ s/^\s+//;
    my @tmp = split(/\n/, $$contref);
    $$contref = "";

    # Greeting at the beginning (first one or two lines)
    for (my $i = 0; $i < 2 && defined($tmp[$i]); $i++) {
        if ($tmp[$i] =~ /(^\s*((([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(��|��|\.|��|,|��|\@|��|��)\s*){0,2}\s*(([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(�Ǥ�|�ȿ����ޤ�|�Ȥ⤦���ޤ�|�Ȥ����ޤ�)(.{0,2})?\s*$)/) {
            # The <<<< >>>> markers make the debug output searchable,
            # e.g. with: perl -n00e 'print if /^<<<</'
            util::dprint("\n\n<<<<$tmp[$i]>>>>\n\n");
            $omake .= $tmp[$i] . "\n";
            # Blank the line instead of deleting it, so the line count
            # of @tmp is unchanged.
            $tmp[$i] = "";
        }
    }

    # Isolate citation parts.
    for my $line (@tmp) {
        # Don't do that if there is an HTML tag at first, i.e. a '<'
        # that appears before any '>'.
        # A citation prefix is either up to ten non-space characters
        # followed by '>', or a run of '>', '|', ':' or '#' with optional
        # surrounding white space; the prefix may repeat.  The
        # substitution strips the prefix, so only the cited text itself
        # is added to $omake.
        if ($line !~ /^[^>]*</ &&
            $line =~ s/^((\S{1,10}>)|(\s*[\>\|\:\#]+\s*))+//) {
            $omake .= $line . "\n";
            $$contref .= "\n";  # Insert LF in place of the cited line.
            next;
        }
        $$contref .= $line. "\n";
    }
        
    # Process text as chunks of paragraphs (separated by empty lines).
    # Isolate meaningless message such as "foo wrote:".
    @tmp = split(/\n\n+/, $$contref);
    $$contref = "";
    # Number of paragraphs kept so far; removed paragraphs are not counted.
    my $i = 0;
    for my $line (@tmp) {
        # Complete exclusion is impossible. I think it's good enough.
        # Only examined while fewer than five paragraphs have been kept,
        # and only for paragraphs containing at most five newlines.
        # Hmm, this regex looks very hairy.  Its alternatives are:
        #   1. a line starting with a header-like keyword,
        #   2. a line ending with an attribution word ("wrote", "said", ...),
        #   3. a line containing "In ... article" or "In ... message",
        #   4. something shaped like <local@domain>.
        # NOTE(review): in alternative 4, ([\w\-.]\.)+ matches a single
        # character followed by a dot, so an ordinary <user@example.com>
        # does not seem to match; ([\w\-]+\.)+ may have been intended --
        # TODO confirm before changing.
        if ($i < 5 && ($line =~ tr/\n/\n/) <= 5 && $line =~ /(^\s*(Date:|Subject:|Message-ID:|From:|��̾|���п�|����))|(^.+(�ֻ��Ǥ�|reply\s*�Ǥ�|۩��|���勞|�񤭤ޤ���|�����ޤ���|�ä�|wrote|said|writes|says)(.{0,2})?\s*$)|(^.*In .*(article|message))|(<\S+\@([\w\-.]\.)+\w+>)/im) {
            util::dprint("\n\n<<<<$line>>>>\n\n");
            $omake .= $line . "\n";
            $line = "";
            next;
        }
        $$contref .= $line. "\n\n";
        $i++;
    }
    # Hand the removed text over wrapped in markers with the value 1
    # (presumably the lowest weight, by analogy with the weighted header
    # strings built elsewhere in this file -- TODO confirm).
    $$weighted_str .= "\x7f1\x7f$omake\x7f/1\x7f\n";
}

# Remove uuencoded and BinHex data from a message.
# Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
#
#   $content - reference to the message text; rewritten in place.  Every
#              kept line is terminated with "\n".
#
# BinHex: everything from the BinHex banner line to the end of the text
# is dropped.
#
# uuencode (references: SunOS 4.1.4 "man 5 uuencode", FreeBSD 2.2
# uuencode.c): the "begin" line, the encoded lines and the "end" line
# are dropped.  To avoid accidental matches the line format is checked.
#
#  * Netnews postings are often split over several files, so data
#    without a "begin" line is handled as well.
#  * Full lines come in two fashions, 62 or 63 characters long
#    (including the newline); both are accepted.
#  * Strictly, the test would be
#        int((ord($line) - ord(' ') + 2) / 3) == (length($line) - 2) / 4
#    which is simplified here to
#        4 * int(ord($line) / 3) == length($line) + $uunumb
#  * SunOS's uuencode uses SPACE in the encoding, but allowing SPACE
#    freely risks misrecognition.  As a compromise, lines made only of
#    the characters 0x20-0x60 are accepted when
#      1. they are inside a begin - end pair, or
#      2. the previous line was recognised as uuencoded and this line
#         has the same length character.
sub uuencode_filter ($) {
    my ($content) = @_;
    my @lines = split(/\n/, $$content);
    $$content = "";

    # State: 0 = outside encoded data, 1 = inside begin - end,
    # otherwise the length character code of the previous encoded line.
    my $uuin = 0;

  LINE:
    for my $text (@lines) {
        my $line = $text . "\n";

        # Skip BinHex texts.
        # All remaining lines will be skipped.
        last LINE
            if $line =~ /^\(This file must be converted with BinHex/; #)

        if ($line =~ /^begin [0-7]{3,4} \S+$/) {
            $uuin = 1;
            next LINE;
        }

        if ($line =~ /^end$/) {
            # An "end" that closes encoded data is dropped; a stray
            # "end" is ordinary text and falls through.
            if ($uuin) {
                $uuin = 0;
                next LINE;
            }
        }
        else {
            # Restrict ord value in range of 32-95.
            my $uuord = ord($line);
            $uuord = 32 if $uuord == 96;

            # If uunumb = 38 were used for every line, a normal line of
            # length 63 could be ruined accidentally.
            my $len    = length($line);
            my $uunumb = $len == 63 ? 37 : 38;

            my $well_formed = 32 <= $uuord
                && $uuord < 96
                && $len <= 63
                && 4 * int($uuord / 3) == $len + $uunumb;

            if ($well_formed) {
                if ($uuin == 1 || $uuin == $uuord) {
                    next LINE if $line =~ /^[\x20-\x60]+$/;
                }
                elsif ($line =~ /^M[\x21-\x60]+$/) {
                    # Be strict for data which doesn't begin with "begin".
                    $uuin = $uuord;
                    next LINE;
                }
            }
        }

        # Ordinary text: leave the encoded state and keep the line.
        $uuin = 0;
        $$content .= $line;
    }
}


1;
1	#
2	# -- Perl --
3	# $Id: mailnews.pl,v 1.2 2001/11/04 01:17:42 wakaba Exp $
4	# Copyright (C) 1997-2000 Satoru Takabayashi ,
5	# 1999 NOKUBI Takatsugu All rights reserved.
6	# This is free software with ABSOLUTELY NO WARRANTY.
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either versions 2, or (at your option)
11	# any later version.
12	#
13	# This program is distributed in the hope that it will be useful
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21	# 02111-1307, USA
22	#
23	# This file must be encoded in EUC-JP encoding
24	#
25
26	package mailnews;
27	use strict;
28	require 'util.pl';
29	require 'gfilter.pl';
30
31	sub mediatype() {
32	return ('message/rfc822', 'message/news');
33	}
34
35	sub status() {
36	return 'yes';
37	}
38
39	sub recursive() {
40	return 0;
41	}
42
43	sub pre_codeconv() {
44	return 1;
45	}
46
47	sub post_codeconv () {
48	return 0;
49	}
50
51	sub add_magic ($) {
52	my ($magic) = @_;
53	$magic->addFileExts('\\.822$' => 'message/rfc822');
54	return;
55	}
56
57	sub filter ($$$$$) {
58	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
59	= @_;
60	my $cfile = defined $orig_cfile ? $$orig_cfile : '';
61
62	util::vprint("Processing mail/news file ...\n");
63
64	uuencode_filter($cont);
65	mailnews_filter($cont, $weighted_str, $fields);
66	mailnews_citation_filter($cont, $weighted_str);
67
68	gfilter::line_adjust_filter($cont);
69	gfilter::line_adjust_filter($weighted_str);
70	gfilter::white_space_adjust_filter($cont);
71	gfilter::white_space_adjust_filter($weighted_str);
72	gfilter::show_filter_debug_info($cont, $weighted_str,
73	$fields, $headings);
74	return undef;
75	}
76
77	# Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
78	sub mailnews_filter ($$$) {
79	my ($contref, $weighted_str, $fields) = @_;
80
81	my $boundary = "";
82	my $line = "";
83	my $partial = 0;
84
85	$$contref =~ s/^\s+//;
86	# Don't handle if first like doesn't seem like a mail/news header.
87	return unless $$contref =~ /(^\S+:\|^from )/i;
88
89	my @tmp = split(/\n/, $$contref);
90	HEADER_PROCESSING:
91	while (@tmp) {
92	$line = shift @tmp;
93	last if ($line =~ /^$/); # if an empty line, header is over
94	# Connect the two lines if next line has leading spaces
95	while (defined($tmp[0]) && $tmp[0] =~ /^\s+/) {
96	# if connection is Japanese character, remove spaces
97	# from Furukawa-san's idea [1998-09-22]
98	my $nextline = shift @tmp;
99	$line =~ s/([\xa1-\xfe])\s+$/$1/;
100	$nextline =~ s/^\s+([\xa1-\xfe])/$1/;
101	$line .= $nextline;
102	}
103
104	# Handle fields.
105	if ($line =~ s/^subject:\s*//i){
106	$fields->{'title'} = $line;
107	# Skip [foobar-ML:000] for a typical mailing list subject.
108	# Practically skip first [...] for simple implementation.
109	$line =~ s/^\[.?\]\s//;
110
111	# Skip 'Re:'
112	$line =~ s/\bre:\s*//gi;
113
114	my $weight = $conf::Weight{'html'}->{'title'};
115	$$weighted_str .= "\x7f$weight\x7f$line\x7f/$weight\x7f\n";
116	} elsif ($line =~ s/^content-type:\s*//i) {
117	if ($line =~ /multipart.boundary="(.)"/i){
118	$boundary = $1;
119	util::dprint("((boundary: $boundary))\n");
120	} elsif ($line =~ m!message/partial;\s(.)!i) {
121	# The Message/Partial subtype routine [1998-10-12]
122	# contributed by Hiroshi Kato <tumibito@mm.rd.nttdata.co.jp>
123	$partial = $1;
124	util::dprint("((partial: $partial))\n");
125	}
126	} elsif ($line =~ /^(\S+):\s(.)/i) {
127	my $name = $1;
128	my $value = $2;
129	$fields->{lc($name)} = $value;
130	if ($name =~ /^($conf::REMAIN_HEADER)$/io) {
131	# keep some fields specified REMAIN_HEADER for search keyword
132	my $weight = $conf::Weight{'headers'};
133	$$weighted_str .=
134	"\x7f$weight\x7f$value\x7f/$weight\x7f\n";
135	}
136	}
137	}
138	if ($partial) {
139	# MHonARC makes several empty lines between header and body,
140	# so remove them.
141	while(@tmp) {
142	last if (! $line =~ /^\s*$/);
143	$line = shift @tmp;
144	}
145	undef $partial;
146	goto HEADER_PROCESSING;
147	}
148	$$contref = join("\n", @tmp);
149
150	# Handle MIME multipart message.
151	if ($boundary) {
152	$boundary =~ s/(\W)/\\$1/g;
153	$$contref =~ s/This is multipart message.\n//i;
154
155
156	# MIME multipart processing,
157	# modified by Furukawa-san's patch on [1998/08/27]
158	$$contref =~ s/--$boundary(--)?\n?/\xff/g;
159	my (@parts) = split(/\xff/, $$contref);
160	$$contref = '';
161	for $_ (@parts){
162	if (s/^(.*?\n\n)//s){
163	my ($head) = $1;
164	$$contref .= $_ if $head =~ m!^content-type:.*text/plain!mi;
165	}
166	}
167	}
168	}
169
170	# Make mail/news citation marks not to be indexed.
171	# And a greeting message at the beginning.
172	# And a meaningless message such as "foo wrote:".
173	# Thanks to Akira Yamada for great idea.
174	sub mailnews_citation_filter ($$) {
175	my ($contref, $weighted_str) = @_;
176
177	my $omake = "";
178	$$contref =~ s/^\s+//;
179	my @tmp = split(/\n/, $$contref);
180	$$contref = "";
181
182	# Greeting at the beginning (first one or two lines)
183	for (my $i = 0; $i < 2 && defined($tmp[$i]); $i++) {
184	if ($tmp[$i] =~ /(^\s((([\xa1-\xfe][\xa1-\xfe]){1,8}\|([\x21-\x7e]{1,16}))\s(��\|��\|\.\|��\|,\|��\|\@\|��\|��)\s){0,2}\s(([\xa1-\xfe][\xa1-\xfe]){1,8}\|([\x21-\x7e]{1,16}))\s(�Ǥ�\|�ȿ��ޤ�\|�Ȥ⤦��ޤ�\|�Ȥ��ޤ�)(.{0,2})?\s$)/) {
185	# for searching debug info by perl -n00e 'print if /^<<<</'
186	util::dprint("\n\n<<<<$tmp[$i]>>>>\n\n");
187	$omake .= $tmp[$i] . "\n";
188	$tmp[$i] = "";
189	}
190	}
191
192	# Isolate citation parts.
193	for my $line (@tmp) {
194	# Don't do that if there is an HTML tag at first.
195	if ($line !~ /^[^>]*</ &&
196	$line =~ s/^((\S{1,10}>)\|(\s[\>\\|\:\#]+\s))+//) {
197	$omake .= $line . "\n";
198	$$contref .= "\n"; # Insert LF.
199	next;
200	}
201	$$contref .= $line. "\n";
202	}
203
204	# Process text as chunks of paragraphs.
205	# Isolate meaningless message such as "foo wrote:".
206	@tmp = split(/\n\n+/, $$contref);
207	$$contref = "";
208	my $i = 0;
209	for my $line (@tmp) {
210	# Complete excluding is impossible. I tnink it's good enough.
211	# Process only first five paragrahs.
212	# And don't handle the paragrah which has five or longer lines.
213	# Hmm, this regex looks very hairly.
214	if ($i < 5 && ($line =~ tr/\n/\n/) <= 5 && $line =~ /(^\s(Date:\|Subject:\|Message-ID:\|From:\|��̾\|��п�\|��))\|(^.+(�ֻ��Ǥ�\|reply\s�Ǥ�\|۩��\|��勞\|�񤭤ޤ��\|��ޤ��\|�ä�\|wrote\|said\|writes\|says)(.{0,2})?\s$)\|(^.In .*(article\|message))\|(<\S+\@([\w\-.]\.)+\w+>)/im) {
215	util::dprint("\n\n<<<<$line>>>>\n\n");
216	$omake .= $line . "\n";
217	$line = "";
218	next;
219	}
220	$$contref .= $line. "\n\n";
221	$i++;
222	}
223	$$weighted_str .= "\x7f1\x7f$omake\x7f/1\x7f\n";
224	}
225
226	# Skip uuencode and BinHex texts.
227	# Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
228	sub uuencode_filter ($) {
229	my ($content) = @_;
230	my @tmp = split(/\n/, $$content);
231	$$content = "";
232
233	my $uuin = 0;
234	while (@tmp) {
235	my $line = shift @tmp;
236	$line .= "\n";
237
238	# Skip BinHex texts.
239	# All lines will be skipped.
240	last if $line =~ /^\(This file must be converted with BinHex/; #)
241
242	# Skip uuencode texts.
243	# References : SunOS 4.1.4: man 5 uuencode
244	# FreeBSD 2.2: uuencode.c
245	# For avoiding accidental matching, check a format.
246	#
247	# There are many netnews messages which is separated into several
248	# files. This kind of files has usually no "begin" line.
249	# This function handle them as well.
250	#
251	# There are two fashion for line length 62 and 63.
252	# This function handle both.
253	#
254	# In the case of following the specification strictly,
255	# int((ord($line) - ord(' ') + 2) / 3)
256	# != (length($line) - 2) / 4
257	# but it can be transformed into a simple equation.
258	# 4 * int(ord($line) / 3) != length($line) + $uunumb;
259
260	# Hey, SunOS's uuencode use SPACE for encoding.
261	# But allowing SPACE is dangerous for misrecognizing.
262	# For compromise, only the following case are acceptable.
263	# 1. inside of begin - end
264	# 2. previous line is recognized as uuencoded line
265	# and ord is identical with previous one.
266
267	# a line consists of only characters of 0x20-0x60 is recognized
268	# as uuencoded line. v1.1.2.3 (bug fix)
269
270	$uuin = 1, next if $line =~ /^begin [0-7]{3,4} \S+$/;
271	if ($line =~ /^end$/){
272	$uuin = 0,next if $uuin;
273	} else {
274	# Restrict ord value in range of 32-95.
275	my $uuord = ord($line);
276	$uuord = 32 if $uuord == 96;
277
278	# if the line of uunumb = 38 is over this loop,
279	# a normal line of 63 length can be ruined accidentaly.
280	my $uunumb = (length($line)==63)? 37: 38;
281
282	if ((32 <= $uuord && $uuord < 96) &&
283	length($line) <= 63 &&
284	(4 * int($uuord / 3) == length($line) + $uunumb)){
285
286	if ($uuin == 1 \|\| $uuin == $uuord){
287	next if $line =~ /^[\x20-\x60]+$/;
288	} else {
289	# Be strict for files which doesn't begin with "begin".
290	$uuin = $uuord, next if $line =~ /^M[\x21-\x60]+$/;
291	}
292	}
293	}
294	$uuin = 0;
295	$$content .= $line;
296	}
297	}
298
299
300	1;