1 |
# |
2 |
# -*- Perl -*- |
3 |
# $Id: mailnews.pl,v 1.2 2001/11/04 01:17:42 wakaba Exp $ |
4 |
# Copyright (C) 1997-2000 Satoru Takabayashi , |
5 |
# 1999 NOKUBI Takatsugu All rights reserved. |
6 |
# This is free software with ABSOLUTELY NO WARRANTY. |
7 |
# |
8 |
# This program is free software; you can redistribute it and/or modify |
9 |
# it under the terms of the GNU General Public License as published by |
10 |
# the Free Software Foundation; either versions 2, or (at your option) |
11 |
# any later version. |
12 |
# |
13 |
# This program is distributed in the hope that it will be useful |
14 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
# GNU General Public License for more details. |
17 |
# |
18 |
# You should have received a copy of the GNU General Public License |
19 |
# along with this program; if not, write to the Free Software |
20 |
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
21 |
# 02111-1307, USA |
22 |
# |
23 |
# This file must be encoded in EUC-JP encoding |
24 |
# |
25 |
|
26 |
package mailnews; |
27 |
use strict; |
28 |
require 'util.pl'; |
29 |
require 'gfilter.pl'; |
30 |
|
31 |
sub mediatype() { |
32 |
return ('message/rfc822', 'message/news'); |
33 |
} |
34 |
|
35 |
sub status() { |
36 |
return 'yes'; |
37 |
} |
38 |
|
39 |
sub recursive() { |
40 |
return 0; |
41 |
} |
42 |
|
43 |
sub pre_codeconv() { |
44 |
return 1; |
45 |
} |
46 |
|
47 |
sub post_codeconv () { |
48 |
return 0; |
49 |
} |
50 |
|
51 |
sub add_magic ($) { |
52 |
my ($magic) = @_; |
53 |
$magic->addFileExts('\\.822$' => 'message/rfc822'); |
54 |
return; |
55 |
} |
56 |
|
57 |
sub filter ($$$$$) { |
58 |
my ($orig_cfile, $cont, $weighted_str, $headings, $fields) |
59 |
= @_; |
60 |
my $cfile = defined $orig_cfile ? $$orig_cfile : ''; |
61 |
|
62 |
util::vprint("Processing mail/news file ...\n"); |
63 |
|
64 |
uuencode_filter($cont); |
65 |
mailnews_filter($cont, $weighted_str, $fields); |
66 |
mailnews_citation_filter($cont, $weighted_str); |
67 |
|
68 |
gfilter::line_adjust_filter($cont); |
69 |
gfilter::line_adjust_filter($weighted_str); |
70 |
gfilter::white_space_adjust_filter($cont); |
71 |
gfilter::white_space_adjust_filter($weighted_str); |
72 |
gfilter::show_filter_debug_info($cont, $weighted_str, |
73 |
$fields, $headings); |
74 |
return undef; |
75 |
} |
76 |
|
77 |
# Original of this code was contributed by <furukawa@tcp-ip.or.jp>. |
78 |
sub mailnews_filter ($$$) { |
79 |
my ($contref, $weighted_str, $fields) = @_; |
80 |
|
81 |
my $boundary = ""; |
82 |
my $line = ""; |
83 |
my $partial = 0; |
84 |
|
85 |
$$contref =~ s/^\s+//; |
86 |
# Don't handle if first like doesn't seem like a mail/news header. |
87 |
return unless $$contref =~ /(^\S+:|^from )/i; |
88 |
|
89 |
my @tmp = split(/\n/, $$contref); |
90 |
HEADER_PROCESSING: |
91 |
while (@tmp) { |
92 |
$line = shift @tmp; |
93 |
last if ($line =~ /^$/); # if an empty line, header is over |
94 |
# Connect the two lines if next line has leading spaces |
95 |
while (defined($tmp[0]) && $tmp[0] =~ /^\s+/) { |
96 |
# if connection is Japanese character, remove spaces |
97 |
# from Furukawa-san's idea [1998-09-22] |
98 |
my $nextline = shift @tmp; |
99 |
$line =~ s/([\xa1-\xfe])\s+$/$1/; |
100 |
$nextline =~ s/^\s+([\xa1-\xfe])/$1/; |
101 |
$line .= $nextline; |
102 |
} |
103 |
|
104 |
# Handle fields. |
105 |
if ($line =~ s/^subject:\s*//i){ |
106 |
$fields->{'title'} = $line; |
107 |
# Skip [foobar-ML:000] for a typical mailing list subject. |
108 |
# Practically skip first [...] for simple implementation. |
109 |
$line =~ s/^\[.*?\]\s*//; |
110 |
|
111 |
# Skip 'Re:' |
112 |
$line =~ s/\bre:\s*//gi; |
113 |
|
114 |
my $weight = $conf::Weight{'html'}->{'title'}; |
115 |
$$weighted_str .= "\x7f$weight\x7f$line\x7f/$weight\x7f\n"; |
116 |
} elsif ($line =~ s/^content-type:\s*//i) { |
117 |
if ($line =~ /multipart.*boundary="(.*)"/i){ |
118 |
$boundary = $1; |
119 |
util::dprint("((boundary: $boundary))\n"); |
120 |
} elsif ($line =~ m!message/partial;\s*(.*)!i) { |
121 |
# The Message/Partial subtype routine [1998-10-12] |
122 |
# contributed by Hiroshi Kato <tumibito@mm.rd.nttdata.co.jp> |
123 |
$partial = $1; |
124 |
util::dprint("((partial: $partial))\n"); |
125 |
} |
126 |
} elsif ($line =~ /^(\S+):\s*(.*)/i) { |
127 |
my $name = $1; |
128 |
my $value = $2; |
129 |
$fields->{lc($name)} = $value; |
130 |
if ($name =~ /^($conf::REMAIN_HEADER)$/io) { |
131 |
# keep some fields specified REMAIN_HEADER for search keyword |
132 |
my $weight = $conf::Weight{'headers'}; |
133 |
$$weighted_str .= |
134 |
"\x7f$weight\x7f$value\x7f/$weight\x7f\n"; |
135 |
} |
136 |
} |
137 |
} |
138 |
if ($partial) { |
139 |
# MHonARC makes several empty lines between header and body, |
140 |
# so remove them. |
141 |
while(@tmp) { |
142 |
last if (! $line =~ /^\s*$/); |
143 |
$line = shift @tmp; |
144 |
} |
145 |
undef $partial; |
146 |
goto HEADER_PROCESSING; |
147 |
} |
148 |
$$contref = join("\n", @tmp); |
149 |
|
150 |
# Handle MIME multipart message. |
151 |
if ($boundary) { |
152 |
$boundary =~ s/(\W)/\\$1/g; |
153 |
$$contref =~ s/This is multipart message.\n//i; |
154 |
|
155 |
|
156 |
# MIME multipart processing, |
157 |
# modified by Furukawa-san's patch on [1998/08/27] |
158 |
$$contref =~ s/--$boundary(--)?\n?/\xff/g; |
159 |
my (@parts) = split(/\xff/, $$contref); |
160 |
$$contref = ''; |
161 |
for $_ (@parts){ |
162 |
if (s/^(.*?\n\n)//s){ |
163 |
my ($head) = $1; |
164 |
$$contref .= $_ if $head =~ m!^content-type:.*text/plain!mi; |
165 |
} |
166 |
} |
167 |
} |
168 |
} |
169 |
|
170 |
# Make mail/news citation marks not to be indexed. |
171 |
# And a greeting message at the beginning. |
172 |
# And a meaningless message such as "foo wrote:". |
173 |
# Thanks to Akira Yamada for great idea. |
174 |
sub mailnews_citation_filter ($$) { |
175 |
my ($contref, $weighted_str) = @_; |
176 |
|
177 |
my $omake = ""; |
178 |
$$contref =~ s/^\s+//; |
179 |
my @tmp = split(/\n/, $$contref); |
180 |
$$contref = ""; |
181 |
|
182 |
# Greeting at the beginning (first one or two lines) |
183 |
for (my $i = 0; $i < 2 && defined($tmp[$i]); $i++) { |
184 |
if ($tmp[$i] =~ /(^\s*((([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(。|.|\.|,|,|、|\@|@|の)\s*){0,2}\s*(([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(です|と申します|ともうします|といいます)(.{0,2})?\s*$)/) { |
185 |
# for searching debug info by perl -n00e 'print if /^<<<</' |
186 |
util::dprint("\n\n<<<<$tmp[$i]>>>>\n\n"); |
187 |
$omake .= $tmp[$i] . "\n"; |
188 |
$tmp[$i] = ""; |
189 |
} |
190 |
} |
191 |
|
192 |
# Isolate citation parts. |
193 |
for my $line (@tmp) { |
194 |
# Don't do that if there is an HTML tag at first. |
195 |
if ($line !~ /^[^>]*</ && |
196 |
$line =~ s/^((\S{1,10}>)|(\s*[\>\|\:\#]+\s*))+//) { |
197 |
$omake .= $line . "\n"; |
198 |
$$contref .= "\n"; # Insert LF. |
199 |
next; |
200 |
} |
201 |
$$contref .= $line. "\n"; |
202 |
} |
203 |
|
204 |
# Process text as chunks of paragraphs. |
205 |
# Isolate meaningless message such as "foo wrote:". |
206 |
@tmp = split(/\n\n+/, $$contref); |
207 |
$$contref = ""; |
208 |
my $i = 0; |
209 |
for my $line (@tmp) { |
210 |
# Complete excluding is impossible. I tnink it's good enough. |
211 |
# Process only first five paragrahs. |
212 |
# And don't handle the paragrah which has five or longer lines. |
213 |
# Hmm, this regex looks very hairly. |
214 |
if ($i < 5 && ($line =~ tr/\n/\n/) <= 5 && $line =~ /(^\s*(Date:|Subject:|Message-ID:|From:|件名|差出人|日時))|(^.+(返事です|reply\s*です|曰く|いわく|書きました|言いました|話で|wrote|said|writes|says)(.{0,2})?\s*$)|(^.*In .*(article|message))|(<\S+\@([\w\-.]\.)+\w+>)/im) { |
215 |
util::dprint("\n\n<<<<$line>>>>\n\n"); |
216 |
$omake .= $line . "\n"; |
217 |
$line = ""; |
218 |
next; |
219 |
} |
220 |
$$contref .= $line. "\n\n"; |
221 |
$i++; |
222 |
} |
223 |
$$weighted_str .= "\x7f1\x7f$omake\x7f/1\x7f\n"; |
224 |
} |
225 |
|
226 |
# Skip uuencode and BinHex texts. |
227 |
# Original of this code was contributed by <furukawa@tcp-ip.or.jp>. |
228 |
sub uuencode_filter ($) { |
229 |
my ($content) = @_; |
230 |
my @tmp = split(/\n/, $$content); |
231 |
$$content = ""; |
232 |
|
233 |
my $uuin = 0; |
234 |
while (@tmp) { |
235 |
my $line = shift @tmp; |
236 |
$line .= "\n"; |
237 |
|
238 |
# Skip BinHex texts. |
239 |
# All lines will be skipped. |
240 |
last if $line =~ /^\(This file must be converted with BinHex/; #) |
241 |
|
242 |
# Skip uuencode texts. |
243 |
# References : SunOS 4.1.4: man 5 uuencode |
244 |
# FreeBSD 2.2: uuencode.c |
245 |
# For avoiding accidental matching, check a format. |
246 |
# |
247 |
# There are many netnews messages which is separated into several |
248 |
# files. This kind of files has usually no "begin" line. |
249 |
# This function handle them as well. |
250 |
# |
251 |
# There are two fashion for line length 62 and 63. |
252 |
# This function handle both. |
253 |
# |
254 |
# In the case of following the specification strictly, |
255 |
# int((ord($line) - ord(' ') + 2) / 3) |
256 |
# != (length($line) - 2) / 4 |
257 |
# but it can be transformed into a simple equation. |
258 |
# 4 * int(ord($line) / 3) != length($line) + $uunumb; |
259 |
|
260 |
# Hey, SunOS's uuencode use SPACE for encoding. |
261 |
# But allowing SPACE is dangerous for misrecognizing. |
262 |
# For compromise, only the following case are acceptable. |
263 |
# 1. inside of begin - end |
264 |
# 2. previous line is recognized as uuencoded line |
265 |
# and ord is identical with previous one. |
266 |
|
267 |
# a line consists of only characters of 0x20-0x60 is recognized |
268 |
# as uuencoded line. v1.1.2.3 (bug fix) |
269 |
|
270 |
$uuin = 1, next if $line =~ /^begin [0-7]{3,4} \S+$/; |
271 |
if ($line =~ /^end$/){ |
272 |
$uuin = 0,next if $uuin; |
273 |
} else { |
274 |
# Restrict ord value in range of 32-95. |
275 |
my $uuord = ord($line); |
276 |
$uuord = 32 if $uuord == 96; |
277 |
|
278 |
# if the line of uunumb = 38 is over this loop, |
279 |
# a normal line of 63 length can be ruined accidentaly. |
280 |
my $uunumb = (length($line)==63)? 37: 38; |
281 |
|
282 |
if ((32 <= $uuord && $uuord < 96) && |
283 |
length($line) <= 63 && |
284 |
(4 * int($uuord / 3) == length($line) + $uunumb)){ |
285 |
|
286 |
if ($uuin == 1 || $uuin == $uuord){ |
287 |
next if $line =~ /^[\x20-\x60]+$/; |
288 |
} else { |
289 |
# Be strict for files which doesn't begin with "begin". |
290 |
$uuin = $uuord, next if $line =~ /^M[\x21-\x60]+$/; |
291 |
} |
292 |
} |
293 |
} |
294 |
$uuin = 0; |
295 |
$$content .= $line; |
296 |
} |
297 |
} |
298 |
|
299 |
|
300 |
1; |