/[suikacvs]/www/namazu/filter/mailnews.pl
Suika

Contents of /www/namazu/filter/mailnews.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1.1.1 - (show annotations) (download) (vendor branch)
Fri Nov 30 07:56:45 2001 UTC (22 years, 5 months ago) by wakaba
Branch: MAIN, wakaba
CVS Tags: initial, HEAD
Changes since 1.1: +0 -0 lines
File MIME type: text/plain

1 #
2 # -*- Perl -*-
3 # $Id: mailnews.pl,v 1.2 2001/11/04 01:17:42 wakaba Exp $
4 # Copyright (C) 1997-2000 Satoru Takabayashi ,
5 # 1999 NOKUBI Takatsugu All rights reserved.
6 # This is free software with ABSOLUTELY NO WARRANTY.
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either versions 2, or (at your option)
11 # any later version.
12 #
13 # This program is distributed in the hope that it will be useful
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 # 02111-1307, USA
22 #
23 # This file must be encoded in EUC-JP encoding
24 #
25
26 package mailnews;
27 use strict;
28 require 'util.pl';
29 require 'gfilter.pl';
30
31 sub mediatype() {
32 return ('message/rfc822', 'message/news');
33 }
34
35 sub status() {
36 return 'yes';
37 }
38
39 sub recursive() {
40 return 0;
41 }
42
43 sub pre_codeconv() {
44 return 1;
45 }
46
47 sub post_codeconv () {
48 return 0;
49 }
50
51 sub add_magic ($) {
52 my ($magic) = @_;
53 $magic->addFileExts('\\.822$' => 'message/rfc822');
54 return;
55 }
56
57 sub filter ($$$$$) {
58 my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
59 = @_;
60 my $cfile = defined $orig_cfile ? $$orig_cfile : '';
61
62 util::vprint("Processing mail/news file ...\n");
63
64 uuencode_filter($cont);
65 mailnews_filter($cont, $weighted_str, $fields);
66 mailnews_citation_filter($cont, $weighted_str);
67
68 gfilter::line_adjust_filter($cont);
69 gfilter::line_adjust_filter($weighted_str);
70 gfilter::white_space_adjust_filter($cont);
71 gfilter::white_space_adjust_filter($weighted_str);
72 gfilter::show_filter_debug_info($cont, $weighted_str,
73 $fields, $headings);
74 return undef;
75 }
76
77 # Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
78 sub mailnews_filter ($$$) {
79 my ($contref, $weighted_str, $fields) = @_;
80
81 my $boundary = "";
82 my $line = "";
83 my $partial = 0;
84
85 $$contref =~ s/^\s+//;
86 # Don't handle if first like doesn't seem like a mail/news header.
87 return unless $$contref =~ /(^\S+:|^from )/i;
88
89 my @tmp = split(/\n/, $$contref);
90 HEADER_PROCESSING:
91 while (@tmp) {
92 $line = shift @tmp;
93 last if ($line =~ /^$/); # if an empty line, header is over
94 # Connect the two lines if next line has leading spaces
95 while (defined($tmp[0]) && $tmp[0] =~ /^\s+/) {
96 # if connection is Japanese character, remove spaces
97 # from Furukawa-san's idea [1998-09-22]
98 my $nextline = shift @tmp;
99 $line =~ s/([\xa1-\xfe])\s+$/$1/;
100 $nextline =~ s/^\s+([\xa1-\xfe])/$1/;
101 $line .= $nextline;
102 }
103
104 # Handle fields.
105 if ($line =~ s/^subject:\s*//i){
106 $fields->{'title'} = $line;
107 # Skip [foobar-ML:000] for a typical mailing list subject.
108 # Practically skip first [...] for simple implementation.
109 $line =~ s/^\[.*?\]\s*//;
110
111 # Skip 'Re:'
112 $line =~ s/\bre:\s*//gi;
113
114 my $weight = $conf::Weight{'html'}->{'title'};
115 $$weighted_str .= "\x7f$weight\x7f$line\x7f/$weight\x7f\n";
116 } elsif ($line =~ s/^content-type:\s*//i) {
117 if ($line =~ /multipart.*boundary="(.*)"/i){
118 $boundary = $1;
119 util::dprint("((boundary: $boundary))\n");
120 } elsif ($line =~ m!message/partial;\s*(.*)!i) {
121 # The Message/Partial subtype routine [1998-10-12]
122 # contributed by Hiroshi Kato <tumibito@mm.rd.nttdata.co.jp>
123 $partial = $1;
124 util::dprint("((partial: $partial))\n");
125 }
126 } elsif ($line =~ /^(\S+):\s*(.*)/i) {
127 my $name = $1;
128 my $value = $2;
129 $fields->{lc($name)} = $value;
130 if ($name =~ /^($conf::REMAIN_HEADER)$/io) {
131 # keep some fields specified REMAIN_HEADER for search keyword
132 my $weight = $conf::Weight{'headers'};
133 $$weighted_str .=
134 "\x7f$weight\x7f$value\x7f/$weight\x7f\n";
135 }
136 }
137 }
138 if ($partial) {
139 # MHonARC makes several empty lines between header and body,
140 # so remove them.
141 while(@tmp) {
142 last if (! $line =~ /^\s*$/);
143 $line = shift @tmp;
144 }
145 undef $partial;
146 goto HEADER_PROCESSING;
147 }
148 $$contref = join("\n", @tmp);
149
150 # Handle MIME multipart message.
151 if ($boundary) {
152 $boundary =~ s/(\W)/\\$1/g;
153 $$contref =~ s/This is multipart message.\n//i;
154
155
156 # MIME multipart processing,
157 # modified by Furukawa-san's patch on [1998/08/27]
158 $$contref =~ s/--$boundary(--)?\n?/\xff/g;
159 my (@parts) = split(/\xff/, $$contref);
160 $$contref = '';
161 for $_ (@parts){
162 if (s/^(.*?\n\n)//s){
163 my ($head) = $1;
164 $$contref .= $_ if $head =~ m!^content-type:.*text/plain!mi;
165 }
166 }
167 }
168 }
169
170 # Make mail/news citation marks not to be indexed.
171 # And a greeting message at the beginning.
172 # And a meaningless message such as "foo wrote:".
173 # Thanks to Akira Yamada for great idea.
174 sub mailnews_citation_filter ($$) {
175 my ($contref, $weighted_str) = @_;
176
177 my $omake = "";
178 $$contref =~ s/^\s+//;
179 my @tmp = split(/\n/, $$contref);
180 $$contref = "";
181
182 # Greeting at the beginning (first one or two lines)
183 for (my $i = 0; $i < 2 && defined($tmp[$i]); $i++) {
184 if ($tmp[$i] =~ /(^\s*((([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(。|.|\.|,|,|、|\@|@|の)\s*){0,2}\s*(([\xa1-\xfe][\xa1-\xfe]){1,8}|([\x21-\x7e]{1,16}))\s*(です|と申します|ともうします|といいます)(.{0,2})?\s*$)/) {
185 # for searching debug info by perl -n00e 'print if /^<<<</'
186 util::dprint("\n\n<<<<$tmp[$i]>>>>\n\n");
187 $omake .= $tmp[$i] . "\n";
188 $tmp[$i] = "";
189 }
190 }
191
192 # Isolate citation parts.
193 for my $line (@tmp) {
194 # Don't do that if there is an HTML tag at first.
195 if ($line !~ /^[^>]*</ &&
196 $line =~ s/^((\S{1,10}>)|(\s*[\>\|\:\#]+\s*))+//) {
197 $omake .= $line . "\n";
198 $$contref .= "\n"; # Insert LF.
199 next;
200 }
201 $$contref .= $line. "\n";
202 }
203
204 # Process text as chunks of paragraphs.
205 # Isolate meaningless message such as "foo wrote:".
206 @tmp = split(/\n\n+/, $$contref);
207 $$contref = "";
208 my $i = 0;
209 for my $line (@tmp) {
210 # Complete excluding is impossible. I tnink it's good enough.
211 # Process only first five paragrahs.
212 # And don't handle the paragrah which has five or longer lines.
213 # Hmm, this regex looks very hairly.
214 if ($i < 5 && ($line =~ tr/\n/\n/) <= 5 && $line =~ /(^\s*(Date:|Subject:|Message-ID:|From:|件名|差出人|日時))|(^.+(返事です|reply\s*です|曰く|いわく|書きました|言いました|話で|wrote|said|writes|says)(.{0,2})?\s*$)|(^.*In .*(article|message))|(<\S+\@([\w\-.]\.)+\w+>)/im) {
215 util::dprint("\n\n<<<<$line>>>>\n\n");
216 $omake .= $line . "\n";
217 $line = "";
218 next;
219 }
220 $$contref .= $line. "\n\n";
221 $i++;
222 }
223 $$weighted_str .= "\x7f1\x7f$omake\x7f/1\x7f\n";
224 }
225
226 # Skip uuencode and BinHex texts.
227 # Original of this code was contributed by <furukawa@tcp-ip.or.jp>.
228 sub uuencode_filter ($) {
229 my ($content) = @_;
230 my @tmp = split(/\n/, $$content);
231 $$content = "";
232
233 my $uuin = 0;
234 while (@tmp) {
235 my $line = shift @tmp;
236 $line .= "\n";
237
238 # Skip BinHex texts.
239 # All lines will be skipped.
240 last if $line =~ /^\(This file must be converted with BinHex/; #)
241
242 # Skip uuencode texts.
243 # References : SunOS 4.1.4: man 5 uuencode
244 # FreeBSD 2.2: uuencode.c
245 # For avoiding accidental matching, check a format.
246 #
247 # There are many netnews messages which is separated into several
248 # files. This kind of files has usually no "begin" line.
249 # This function handle them as well.
250 #
251 # There are two fashion for line length 62 and 63.
252 # This function handle both.
253 #
254 # In the case of following the specification strictly,
255 # int((ord($line) - ord(' ') + 2) / 3)
256 # != (length($line) - 2) / 4
257 # but it can be transformed into a simple equation.
258 # 4 * int(ord($line) / 3) != length($line) + $uunumb;
259
260 # Hey, SunOS's uuencode use SPACE for encoding.
261 # But allowing SPACE is dangerous for misrecognizing.
262 # For compromise, only the following case are acceptable.
263 # 1. inside of begin - end
264 # 2. previous line is recognized as uuencoded line
265 # and ord is identical with previous one.
266
267 # a line consists of only characters of 0x20-0x60 is recognized
268 # as uuencoded line. v1.1.2.3 (bug fix)
269
270 $uuin = 1, next if $line =~ /^begin [0-7]{3,4} \S+$/;
271 if ($line =~ /^end$/){
272 $uuin = 0,next if $uuin;
273 } else {
274 # Restrict ord value in range of 32-95.
275 my $uuord = ord($line);
276 $uuord = 32 if $uuord == 96;
277
278 # if the line of uunumb = 38 is over this loop,
279 # a normal line of 63 length can be ruined accidentaly.
280 my $uunumb = (length($line)==63)? 37: 38;
281
282 if ((32 <= $uuord && $uuord < 96) &&
283 length($line) <= 63 &&
284 (4 * int($uuord / 3) == length($line) + $uunumb)){
285
286 if ($uuin == 1 || $uuin == $uuord){
287 next if $line =~ /^[\x20-\x60]+$/;
288 } else {
289 # Be strict for files which doesn't begin with "begin".
290 $uuin = $uuord, next if $line =~ /^M[\x21-\x60]+$/;
291 }
292 }
293 }
294 $uuin = 0;
295 $$content .= $line;
296 }
297 }
298
299
300 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24