1 |
|
2 |
=head1 NAME |
3 |
|
4 |
Message::Field::Token --- Message-pm: Lexical tokens used in Internet message formats |
5 |
|
6 |
=head1 DESCRIPTION |
7 |
|
8 |
This module provides functions for handling lexical tokens of Internet message formats, |
9 |
such as quoted-string, comment, etc. |
10 |
|
11 |
This module is part of Message::* Perl Modules. |
12 |
|
13 |
=cut |
14 |
|
15 |
package Message::Field::Token; |
16 |
use strict; |
17 |
our $VERSION=do{my @r=(q$Revision: 1.12 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; |
18 |
|
19 |
=head1 $phrase = Message::Field::Token::en_phrase ($string, %option) |
20 |
|
21 |
Return given string (in internal code) as a phrase (= atom / quoted-string / encoded-word). |
22 |
|
23 |
Options: |
24 |
|
25 |
=over 4 |
26 |
|
27 |
=item -charset => charset (default: 'us-ascii') |
28 |
|
29 |
Charset name (in lower cases) to be used to encode the output string. |
30 |
|
31 |
=item -ebcdic_safe |
32 |
|
33 |
=item -encode_encoded_word_like |
34 |
|
35 |
=item -quoted_pair |
36 |
|
37 |
=item -token_maxlength |
38 |
|
39 |
=item -unsafe_char => 1*CHAR / '' (default) |
40 |
|
41 |
See options of C<en_quoted_string>. |
42 |
|
43 |
=item -source_charset => charset (default = *internal) |
44 |
|
45 |
Charset name (in lower case) of given string. |
46 |
|
47 |
=back |
48 |
|
49 |
=cut |
50 |
|
51 |
sub en_phrase ($;%) { |
52 |
my $string = shift; |
53 |
my %option = @_; |
54 |
my @string = split /(?<=[^\x09\x20])(?=[\x09\x20])/, $string; |
55 |
my @str = ([0, '']); |
56 |
for my $i (0..$#string) { |
57 |
if (Message::MIME::Charset::is_representable_in ($option{-charset}, $string[$i], \%option) && $string[$i] !~ /[\x00\x0A\x0D]/) { |
58 |
if ($str[$#str]->[0]) { ## Preceding token should be encoded |
59 |
push @str, [0, $string[$i]]; |
60 |
} else { ## Preceding token need not be encoded |
61 |
$str[$#str]->[1] .= $string[$i]; |
62 |
} |
63 |
} else { ## Should be encoded |
64 |
if ($str[$#str]->[0]) { ## Preceding token should be encoded |
65 |
$str[$#str]->[1] .= $string[$i]; |
66 |
} else { ## Preceding token need not be encoded |
67 |
push @str, [1, $string[$i]]; |
68 |
} |
69 |
} |
70 |
} |
71 |
my $phrase = ''; |
72 |
for my $i (0..$#str) { |
73 |
my $ws = ''; $ws = $1 if $i != 0 && $str[$i]->[1] =~ s/^([\x09\x20])//; |
74 |
if ($str[$i]->[0]) { |
75 |
$str[$i]->[1] = Message::MIME::EncodedWord::encode ($str[$i]->[1], %option, -context => 'phrase', -encode_char => '\x00-\x7F'); |
76 |
} else { |
77 |
$str[$i]->[1] = _to_quoted_string ($str[$i]->[1], \%option); |
78 |
} |
79 |
$phrase .= $ws . $str[$i]->[1]; |
80 |
} |
81 |
$phrase; |
82 |
} |
83 |
|
84 |
=head1 $quoted_string = Message::Field::Token::en_quoted_string ($string, %option) |
85 |
|
86 |
Quote string as a quoted-string if necessary. |
87 |
|
88 |
Options: |
89 |
|
90 |
=over 4 |
91 |
|
92 |
=item -charset => charset (default: 'us-ascii') |
93 |
|
94 |
Charset name (in lower cases) to be used to encode the output string. |
95 |
|
96 |
=item -context => 'quoted_string' (default) / 'atom' / 'token' / 'http_token' / 'attr_char' / 'http_attr_char' |
97 |
|
98 |
Context in which given string is embeded. This value is used to check if |
99 |
string should be quoted as a quoted-string. 'quoted_string', the default value, |
100 |
makes string ALWAYS quoted. 'atom' makes it quoted when string contains |
101 |
one or more characters not included in the atext (see RFC 2822). |
102 |
Likewise, 'token' is for token of MIME, 'http_token' is for token of HTTP, |
103 |
'attr_char' is attribute-char of MIME (RFC 2231) and 'http_attr_char' |
104 |
is 'http_token' AND 'attr_char' (ie. safe for both HTTP and RFC 2231). |
105 |
|
106 |
=item -ebcdic_safe |
107 |
|
108 |
This option is only meaningful when C<-use_quoted_string> is true. |
109 |
See L<Message::MIME::EncodedWord>::decode. |
110 |
|
111 |
=item -encode_encoded_word_like |
112 |
|
113 |
This option is only meaningful when C<-use_quoted_string> is true. |
114 |
See L<Message::MIME::EncodedWord>::decode. |
115 |
|
116 |
=item -quoted_pair => 1*CHAR / qr|(:: pattern ::)| (default: qr/([\x0D\\"]|(?<==)\?)/) |
117 |
|
118 |
A character list or a Regexp pattern for characters to be quoted as |
119 |
the quoted-pairs. When '', no character is quoted. |
120 |
Quoting is performed to characters NOT encoded as encoded-words. |
121 |
|
122 |
=item -source_charset => charset (default = *internal) |
123 |
|
124 |
Charset name (in lower case) of given string. |
125 |
|
126 |
=item -token_maxlength |
127 |
|
128 |
This option is only meaningful when C<-use_quoted_string> is true. |
129 |
See L<Message::MIME::EncodedWord>::decode. |
130 |
|
131 |
=item -unsafe_char => 1*CHAR / '' (default) |
132 |
|
133 |
The list of characters that when one of them is included in the string |
134 |
it should be quoted, in addition to special characters determined by C<-context> option. |
135 |
|
136 |
=item -use_encoded_word => 0 (default)/1 |
137 |
|
138 |
If true, characters in qcontent which unable to be represented in C<-charset> charset |
139 |
are encoded in encoded-words. |
140 |
|
141 |
=back |
142 |
|
143 |
=cut |
144 |
|
145 |
sub en_quoted_string ($;%) { |
146 |
my ($string, %option) = @_; |
147 |
if ($option{-use_encoded_word}) { |
148 |
require Message::MIME::EncodedWord; |
149 |
$option{-quoted_pair} ||= qr/([\x0D\\"]|(?<==)\?)/; |
150 |
$string = Message::MIME::EncodedWord::encode ($string, %option, -context => 'quoted_string', -preserve_wsp => 0); |
151 |
$option{-quoted_pair} = qr/(?!)/; |
152 |
} |
153 |
$string = _to_quoted_string ($string, \%option); |
154 |
my ($s, %r) = Message::MIME::Charset::encode ($option{-charset}, $string); |
155 |
$r{success} ? $s : $string; |
156 |
} |
157 |
|
158 |
sub _to_quoted_string ($$) { |
159 |
my ($string, $option) = @_; |
160 |
## -- What characters should be quoted? |
161 |
my $achar = { |
162 |
atom => qq(0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#\$%&'*+/=?^_`{|}~-), |
163 |
token => qq(0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#\$%&'*+^_`{|}~.-), |
164 |
http_token => qq(0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#\$%&'*+^_`|~.-), |
165 |
attr_char => qq(0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#\$&+^_`{|}~.-), |
166 |
http_attr_char => qq(0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#\$&+^_`|~.-), |
167 |
quoted_string => '', |
168 |
}->{$option->{-context} || 'quoted_string'}; |
169 |
my $echar = $option->{-unsafe_char}; $echar =~ s/([\/\\])/\\$1/g; |
170 |
eval qq{\$achar =~ tr/$echar//d}; |
171 |
$achar =~ s/([\\\[\]])/\\$1/g; |
172 |
$option->{-quoted_pair} ||= qr/([\x0D"\\]|(?<==)\?)/; |
173 |
if (!ref $option->{-quoted_pair}) { |
174 |
$option->{-quoted_pair} = quotemeta $option->{-quoted_pair}; |
175 |
$option->{-quoted_pair} = qr/([$option->{-quoted_pair}])/; |
176 |
} |
177 |
## -- Quote |
178 |
if (length ($achar) == 0 || $string =~ /[^$achar]/) { |
179 |
$string =~ s/$option->{-quoted_pair}/\\$1/g if $option->{-quoted_pair}; |
180 |
$string = '"' . $string . '"'; |
181 |
} |
182 |
$string; |
183 |
} |
184 |
|
185 |
=head1 $string = Message::Field::Token::de_phrase ($phrase, %option) |
186 |
|
187 |
Parses given phrase or a quoted-string and return it as a string (in internal code). |
188 |
|
189 |
Options: |
190 |
|
191 |
=over 4 |
192 |
|
193 |
=item -charset => charset (default: 'us-ascii') |
194 |
|
195 |
Charset name of input. The charset MUST be a superset of ASCII. |
196 |
|
197 |
=item -use_encoded_word => 0/1 (default) |
198 |
|
199 |
Decodes encoded-words (out of quoted-strings). |
200 |
|
201 |
=item -use_quoted_encoded_word => 0 (default)/1 |
202 |
|
203 |
Decodes encoded-words in quoted-strings. |
204 |
|
205 |
=back |
206 |
|
207 |
=cut |
208 |
|
209 |
sub de_phrase ($;%) { |
210 |
my ($string, %option) = @_; |
211 |
$option{-use_encoded_word} = 1 unless defined $option{-use_encoded_word}; |
212 |
require Message::MIME::EncodedWord if $option{-use_encoded_word}; |
213 |
require Message::MIME::Charset; |
214 |
$string =~ s("((?:\\.|[^"])*)"|([^"]+)){ ## Note: broken <"> does not match with this. |
215 |
my ($qcontent, $atom) = ($1, $2); |
216 |
if (defined $qcontent) { |
217 |
if ($option{-use_quoted_encoded_word}) { |
218 |
$qcontent = Message::MIME::EncodedWord::decode($qcontent, -process_non_encoded_word => sub { |
219 |
$_[0] =~ s/\\(.)/$1/g; |
220 |
my ($s, %s) = Message::MIME::Charset::decode ($option{-charset} || 'us-ascii', $_[0]); |
221 |
$s{success} ? ($s, 0) : ($_[0], 0); |
222 |
}); |
223 |
} else { |
224 |
$qcontent =~ s/\\(.)/$1/g; |
225 |
my ($s, %s) = Message::MIME::Charset::decode ($option{-charset} || 'us-ascii', $qcontent); |
226 |
$qcontent = $s if $s{success}; |
227 |
} |
228 |
$qcontent; |
229 |
} else { ## 1*(atom / encoded-word / FWS) |
230 |
if ($option{-use_encoded_word}) { |
231 |
$atom = Message::MIME::EncodedWord::decode ($atom, -process_non_encoded_word => sub { |
232 |
my ($s, %s) = Message::MIME::Charset::decode ($option{-charset} || 'us-ascii', $_[0]); |
233 |
$s{success} ? ($s, 0) : ($_[0], 0); |
234 |
}); |
235 |
} |
236 |
$atom; |
237 |
} |
238 |
}ges; |
239 |
$string; |
240 |
} |
241 |
|
242 |
=head1 $string = Message::Field::Token::de_quoted_string ($quoted_string, %option) |
243 |
|
244 |
An alias to C<de_phrase>. |
245 |
|
246 |
=cut |
247 |
|
248 |
*de_quoted_string = \&de_phrase; |
249 |
|
250 |
=head1 LICENSE |
251 |
|
252 |
Copyright 2002 Wakaba <w@suika.fam.cx> |
253 |
|
254 |
This program is free software; you can redistribute it and/or modify |
255 |
it under the terms of the GNU General Public License as published by |
256 |
the Free Software Foundation; either version 2 of the License, or |
257 |
(at your option) any later version. |
258 |
|
259 |
This program is distributed in the hope that it will be useful, |
260 |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
261 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
262 |
GNU General Public License for more details. |
263 |
|
264 |
You should have received a copy of the GNU General Public License |
265 |
along with this program; see the file COPYING. If not, write to |
266 |
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
267 |
Boston, MA 02111-1307, USA. |
268 |
|
269 |
=cut |
270 |
|
271 |
1; # $Date: $ |