namazu/filter/msword.pl

#
# -*- Perl -*-
# $Id: msword.pl,v 1.28.4.1 2001/07/13 01:14:26 knok Exp $
# Copyright (C) 1997-2000 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000 Satoru Takabayashi Namazu Project All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding
#

package msword;
use strict;
use File::Copy;
require 'util.pl';
require 'gfilter.pl';
require 'html.pl';

# Paths of the external helper programs. All three start undefined and are
# filled in by status().

# Word converter: the result of looking up 'wvHtml', or of looking up
# 'doccat' when wvHtml was not found.
my $wordconvpath  = undef;
# 'lv' command; looked up only when the language is Japanese, where
# filter_wv() runs it with "-Iu8 -Oej" on the wvHtml output.
my $utfconvpath   = undef;
# 'wvVersion' command; looked up only when the language is Japanese, where
# filter_wv() uses it to read the Word format version of the document.
my $wvversionpath = undef;

# Report the media type this filter handles.
# Returns the single string 'application/msword'.
sub mediatype() {
    return 'application/msword';
}

# Decide whether this filter can be used and remember the helper paths.
#
# Looks up 'wvHtml' first.  When it is missing, 'doccat' is tried instead.
# When wvHtml is present and the language is Japanese, 'lv' and 'wvVersion'
# must both be found as well.
#
# Side effects: sets $wordconvpath, and in the Japanese wvHtml case also
# $utfconvpath and $wvversionpath.
#
# Returns the string 'yes' or 'no'.
# NOTE(review): relies on util::checkcmd returning undef for a command that
# is not available -- confirm in util.pl.
sub status() {
    $wordconvpath = util::checkcmd('wvHtml');

    # No wvHtml: doccat is the only remaining option.
    unless (defined $wordconvpath) {
        $wordconvpath = util::checkcmd('doccat');
        return 'no' unless defined $wordconvpath;
        return 'yes';
    }

    # wvHtml alone is enough for every language except Japanese.
    return 'yes' unless util::islang("ja");

    # Japanese needs both extra helpers; always look up both of them.
    $utfconvpath   = util::checkcmd('lv');
    $wvversionpath = util::checkcmd('wvVersion');

    my $have_helpers = defined($utfconvpath) && defined($wvversionpath);
    return 'yes' if $have_helpers;
    return 'no';
}

# Filter-interface flag; always 0 for this filter.
# NOTE(review): presumably tells the caller that the filter output does not
# need to be fed through the filters again -- confirm against the caller.
sub recursive() {
    my $flag = 0;
    return $flag;
}

# Filter-interface flag; always 0 for this filter.
# NOTE(review): presumably means no character-code conversion is wanted
# before filter() runs -- confirm against the caller.
sub pre_codeconv() {
    my $flag = 0;
    return $flag;
}

# Filter-interface flag; always 0 for this filter.
# NOTE(review): presumably means no character-code conversion is wanted
# after filter() runs -- confirm against the caller.
sub post_codeconv () {
    my $flag = 0;
    return $flag;
}

# Filter-interface hook taking one argument; this filter does nothing with
# it and returns nothing (undef in scalar context, empty list in list
# context).
sub add_magic ($) {
    # Intentionally empty: the argument is ignored.
    return;
}

# Entry point of the filter: hand the document to filter_wv() when the
# 'wvHtml' command can be found, otherwise to filter_doccat().
#
# All five arguments are passed through unchanged.
# Returns whatever the chosen backend returns (in the visible backends:
# undef on success or an error message string).
sub filter ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_;

    # The availability of wvHtml is probed again on every call.
    my $backend = util::checkcmd('wvHtml') ? \&filter_wv : \&filter_doccat;

    # Call in scalar context, exactly one result is handed back.
    my $err = $backend->($orig_cfile, $cont, $weighted_str, $headings, $fields);
    return $err;
}

# Convert an MS Word document to indexable text with wvHtml.
#
# Arguments:
#   $orig_cfile   - reference to the original file name, or undef
#   $cont         - reference to the document content; overwritten in place
#                   with the converted text
#   $weighted_str - handed to html::html_filter and the gfilter helpers
#   $headings     - handed to html::html_filter and the debug output
#   $fields       - hash reference; 'title' is derived from the file name
#                   when it is still empty after HTML filtering
#
# Uses the module-level $wordconvpath, $utfconvpath and $wvversionpath, so
# status() must have been called beforehand.
#
# Returns undef on success.  For Japanese, returns an "Unsupported format"
# message when wvVersion does not report the word8 format; the temporary
# files are removed before that early return as well.
sub filter_wv ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
      = @_;
    my $cfile = defined $orig_cfile ? $$orig_cfile : '';

    my $tmpfile  = util::tmpnam('NMZ.word');
    my $tmpfile2 = util::tmpnam('NMZ.word2');

    util::vprint("Processing ms-word file ... (using  '$wordconvpath')\n");

    {
        # The handle is confined to this block, presumably so that it is
        # closed (and flushed) before the external commands read the file.
        my $fh = util::efopen("> $tmpfile");
        print $fh $$cont;
    }

    if (!util::islang("ja")) {
        system("$wordconvpath $tmpfile $tmpfile2");
    } else {
        my $version = "unknown";
        my $supported = undef;
        my $fh_cmd = util::efopen("$wvversionpath $tmpfile |");
        # Use a lexical line variable so the caller's $_ is left alone.
        while (my $line = <$fh_cmd>) {
            if ($line =~ /^Version: (word\d+),/i) {
                $version = $1;
                #
                # Only word8 format is supported for Japanese.
                #
                if ($version =~ /^word8$/) {
                    $supported = 1;
                }
            }
        }
        # Finish with wvVersion before its input file may be removed.
        close($fh_cmd);

        unless ($supported) {
            # Do not leave the temporary files behind on the error path.
            unlink($tmpfile);
            unlink($tmpfile2);
            return _("Unsupported format: ") .  $version;
        }

        system("$wordconvpath $tmpfile $tmpfile2");
        # Re-encode the wvHtml output with lv, reusing $tmpfile as the
        # destination, then move the result back to $tmpfile2.
        system("$utfconvpath -Iu8 -Oej $tmpfile2 > $tmpfile");
        unlink($tmpfile2);
        rename($tmpfile, $tmpfile2);
    }

    {
        my $fh = util::efopen("< $tmpfile2");
        $$cont = util::readfile($fh);

        # Exclude wvHtml's footer because it has no good index terms.
        $$cont =~ s/<!--Section Ends-->.*$//s;
    }

    # In the Japanese path $tmpfile was already renamed away; unlink then
    # simply finds nothing to remove.
    unlink($tmpfile);
    unlink($tmpfile2);

    html::html_filter($cont, $weighted_str, $fields, $headings);

    gfilter::line_adjust_filter($cont);
    gfilter::line_adjust_filter($weighted_str);
    gfilter::white_space_adjust_filter($cont);
    $fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str)
      unless $fields->{'title'};
    gfilter::show_filter_debug_info($cont, $weighted_str,
                           $fields, $headings);
    return undef;
}

# Convert an MS Word document to indexable text with doccat.
#
# Arguments:
#   $orig_cfile   - reference to the original file name, or undef
#   $cont         - reference whose target is overwritten with the doccat
#                   output
#   $weighted_str - handed to the gfilter helpers
#   $headings     - handed to the debug output
#   $fields       - hash reference; 'title' is derived from the file name
#                   when it is empty
#
# The original file (not $$cont) is copied to a temporary file and doccat
# is run on that copy via the module-level $wordconvpath.
#
# Always returns undef.
sub filter_doccat ($$$$$) {
    my ($orig_cfile, $cont, $weighted_str, $headings, $fields) = @_;

    my $cfile = '';
    $cfile = $$orig_cfile if defined $orig_cfile;

    # $text_out receives the doccat output, $doc_copy is its input.
    my $text_out = util::tmpnam('NMZ.word');
    my $doc_copy = util::tmpnam('NMZ.word2');

    copy("$cfile", "$doc_copy");
    system("$wordconvpath -o e $doc_copy > $text_out");

    {
        # Keep the read handle local to this block.
        my $in = util::efopen("< $text_out");
        $$cont = util::readfile($in);
    }

    unlink($text_out);
    unlink($doc_copy);

    gfilter::line_adjust_filter($cont);
    gfilter::line_adjust_filter($weighted_str);
    gfilter::white_space_adjust_filter($cont);

    # Fall back to a title derived from the file name.
    if (!$fields->{'title'}) {
        $fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str);
    }

    gfilter::show_filter_debug_info($cont, $weighted_str, $fields, $headings);
    return undef;
}

1;
1	#
2	# -- Perl --
3	# $Id: msword.pl,v 1.28.4.1 2001/07/13 01:14:26 knok Exp $
4	# Copyright (C) 1997-2000 Satoru Takabayashi All rights reserved.
5	# Copyright (C) 2000 Satoru Takabayashi Namazu Project All rights reserved.
6	# This is free software with ABSOLUTELY NO WARRANTY.
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either versions 2, or (at your option)
11	# any later version.
12	#
13	# This program is distributed in the hope that it will be useful
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21	# 02111-1307, USA
22	#
23	# This file must be encoded in EUC-JP encoding
24	#
25
26	package msword;
27	use strict;
28	use File::Copy;
29	require 'util.pl';
30	require 'gfilter.pl';
31	require 'html.pl';
32
33	my $wordconvpath = undef;
34	my $utfconvpath = undef;
35	my $wvversionpath = undef;
36
37	sub mediatype() {
38	return ('application/msword');
39	}
40
41	sub status() {
42	$wordconvpath = util::checkcmd('wvHtml');
43	if (defined $wordconvpath) {
44	if (!util::islang("ja")) {
45	return 'yes';
46	} else {
47	$utfconvpath = util::checkcmd('lv');
48	$wvversionpath = util::checkcmd('wvVersion');
49	if ((defined $utfconvpath) && (defined $wvversionpath)) {
50	return 'yes';
51	} else {
52	return 'no';
53	}
54	}
55	} else {
56	$wordconvpath = util::checkcmd('doccat');
57	return 'yes' if defined $wordconvpath;
58	return 'no';
59	}
60	}
61
62	sub recursive() {
63	return 0;
64	}
65
66	sub pre_codeconv() {
67	return 0;
68	}
69
70	sub post_codeconv () {
71	return 0;
72	}
73
74	sub add_magic ($) {
75	return;
76	}
77
78	sub filter ($$$$$) {
79	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
80	= @_;
81	my $err = undef;
82
83	if (util::checkcmd('wvHtml')) {
84	$err = filter_wv($orig_cfile, $cont, $weighted_str, $headings, $fields);
85	} else {
86	$err = filter_doccat($orig_cfile, $cont, $weighted_str, $headings, $fields);
87	}
88	return $err;
89	}
90
91	sub filter_wv ($$$$$) {
92	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
93	= @_;
94	my $cfile = defined $orig_cfile ? $$orig_cfile : '';
95
96	my $tmpfile = util::tmpnam('NMZ.word');
97	my $tmpfile2 = util::tmpnam('NMZ.word2');
98
99
100	if (util::islang("ja")) {
101	}
102
103	util::vprint("Processing ms-word file ... (using '$wordconvpath')\n");
104
105	{
106	my $fh = util::efopen("> $tmpfile");
107	print $fh $$cont;
108	}
109
110	if (!util::islang("ja")) {
111	system("$wordconvpath $tmpfile $tmpfile2");
112	} else {
113	my $version = "unknown";
114	my $supported = undef;
115	my $fh_cmd = util::efopen("$wvversionpath $tmpfile \|");
116	while (<$fh_cmd>) {
117	if (/^Version: (word\d+),/i) {
118	$version = $1;
119	#
120	# Only word8 format is supported for Japanese.
121	#
122	if ($version =~ /^word8$/) {
123	$supported = 1;
124	}
125	}
126	}
127	return _("Unsupported format: ") . $version unless $supported;
128	system("$wordconvpath $tmpfile $tmpfile2");
129	system("$utfconvpath -Iu8 -Oej $tmpfile2 > $tmpfile");
130	unlink($tmpfile2);
131	rename($tmpfile, $tmpfile2);
132	}
133
134	{
135	my $fh = util::efopen("< $tmpfile2");
136	$$cont = util::readfile($fh);
137
138	# Exclude wvHtml's footer becaues it has no good index terms.
139	$$cont =~ s/<!--Section Ends-->.*$//s;
140	}
141
142	unlink($tmpfile);
143	unlink($tmpfile2);
144
145	html::html_filter($cont, $weighted_str, $fields, $headings);
146
147	gfilter::line_adjust_filter($cont);
148	gfilter::line_adjust_filter($weighted_str);
149	gfilter::white_space_adjust_filter($cont);
150	$fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str)
151	unless $fields->{'title'};
152	gfilter::show_filter_debug_info($cont, $weighted_str,
153	$fields, $headings);
154	return undef;
155	}
156
157	sub filter_doccat ($$$$$) {
158	my ($orig_cfile, $cont, $weighted_str, $headings, $fields)
159	= @_;
160	my $cfile = defined $orig_cfile ? $$orig_cfile : '';
161
162	my $tmpfile = util::tmpnam('NMZ.word');
163	my $tmpfile2 = util::tmpnam('NMZ.word2');
164	copy("$cfile", "$tmpfile2");
165
166	system("$wordconvpath -o e $tmpfile2 > $tmpfile");
167
168	{
169	my $fh = util::efopen("< $tmpfile");
170	$$cont = util::readfile($fh);
171	}
172
173	unlink($tmpfile);
174	unlink($tmpfile2);
175
176	gfilter::line_adjust_filter($cont);
177	gfilter::line_adjust_filter($weighted_str);
178	gfilter::white_space_adjust_filter($cont);
179	$fields->{'title'} = gfilter::filename_to_title($cfile, $weighted_str)
180	unless $fields->{'title'};
181	gfilter::show_filter_debug_info($cont, $weighted_str,
182	$fields, $headings);
183	return undef;
184	}
185
186	1;