1 |
#!/usr/bin/perl
|
2 |
|
3 |
=head1 NAME
|
4 |
|
5 |
winapi2tbl --- Mapping table generator (from Win32 API)
|
6 |
|
7 |
=head1 USAGE
|
8 |
|
9 |
$ perl winapi2tbl.pl 932 > winapi-cp932.tbl
|
10 |
|
11 |
=cut
|
12 |
|
13 |
use strict;
|
14 |
require Win32::API;
|
15 |
|
16 |
my $CodePage = shift || 932;
|
17 |
my %S_range = (
|
18 |
932 => [0x40..0x7E,0x80..0xFC],
|
19 |
936 => [0x40..0x7E,0x80..0xFE],
|
20 |
949 => [0x41..0x5A,0x61..0x7A,0x81..0xFE],
|
21 |
950 => [0x40..0x7E,0xA1..0xFE],
|
22 |
1361 => [0x31..0xFE],
|
23 |
20000 => [0x21..0x7E,0xA1..0xFE],
|
24 |
20932 => [0xA1..0xFE],
|
25 |
20936 => [0xA1..0xFE],
|
26 |
20949 => [0xA1..0xFE],
|
27 |
);
|
28 |
|
29 |
{
|
30 |
my @name = split /\n/, require 'unicore/Name.pl';
|
31 |
my %name;
|
32 |
for (@name) {
|
33 |
if (/^(....) ([^\t]+)/) {
|
34 |
$name{hex $1} = $2;
|
35 |
}
|
36 |
}
|
37 |
sub NAME ($) {
|
38 |
$_[0] < 0x0020 ? '<control>' :
|
39 |
$_[0] < 0x007F ? $name{$_[0]} :
|
40 |
$_[0] < 0x00A0 ? '<control>' :
|
41 |
$name{$_[0]} ? $name{$_[0]} :
|
42 |
$_[0] < 0x00A0 ? '<control>' :
|
43 |
$_[0] < 0x3400 ? '' :
|
44 |
$_[0] < 0xA000 ? '<cjk>' :
|
45 |
$_[0] < 0xE000 ? '<hangul>' :
|
46 |
$_[0] < 0xF900 ? '<private>' :
|
47 |
'';
|
48 |
}
|
49 |
}
|
50 |
|
51 |
my @hdr;
|
52 |
print STDERR "Checking system version...\n";
|
53 |
if (eval q{require Message::Field::UA}) {
|
54 |
my $ua = new Message::Field::UA;
|
55 |
$ua->add_our_name;
|
56 |
$ua->replace_system_version ('ie', -prepend => 0);
|
57 |
push @hdr, "#; Environment: $ua";
|
58 |
print STDERR $hdr[$#hdr],"\n";
|
59 |
}
|
60 |
|
61 |
print STDERR "MultiByte -> WideChar...\n";
|
62 |
#int MultiByteToWideChar(
|
63 |
# UINT CodePage, // code page
|
64 |
# DWORD dwFlags, // mapping flag
|
65 |
# LPCSTR lpMultiByteStr, // multibyte string (source)
|
66 |
# int cchMultiByte, // length of multibyte string
|
67 |
# LPWSTR lpWideCharStr, // wide string (result)
|
68 |
# int cchWideChar // length of wide string
|
69 |
#);
|
70 |
## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_multibytetowidechar.htm>
|
71 |
|
72 |
my $M2W = new Win32::API (kernel32 => "MultiByteToWideChar", INPIPI => 'I');
|
73 |
my %M2U;
|
74 |
for my $F (0x00..0xFF) { ## one byte
|
75 |
my $mb = pack 'C', $F;
|
76 |
my $wc = "\x00" x 10;
|
77 |
my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
|
78 |
$wc = substr ($wc, 0, $len * 2);
|
79 |
if (length ($wc) >= 2 && $wc ne "\x00\x00") {
|
80 |
$wc =~ s/(.)(.)/$2$1/gs;
|
81 |
$wc =~ s/(.)/sprintf '%02X', ord $1/ges;
|
82 |
$M2U{ $F } = hex $wc;
|
83 |
}
|
84 |
}
|
85 |
for my $F (0x80..0xFF) { ## two bytes
|
86 |
for my $S (@{$S_range{$CodePage} || [0x01..0xFF] }) {
|
87 |
my $mb = pack 'CC', $F, $S;
|
88 |
my $wc = "\x00" x 10;
|
89 |
my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
|
90 |
$wc = substr ($wc, 0, $len * 2);
|
91 |
if (length ($wc) >= 2) {
|
92 |
$wc =~ s/(.)(.)/$2$1/gs;
|
93 |
$wc =~ s/(.)/sprintf '%02X', ord $1/ges;
|
94 |
$M2U{ 0x100 * $F + $S } = hex ($wc) unless 0x10000 * $M2U{$F} + $M2U{$S} == hex ($wc);
|
95 |
}
|
96 |
}}
|
97 |
$M2U{0x00} = 0x0000;
|
98 |
|
99 |
print STDERR "WideChar -> MultiByte...\n";
|
100 |
#int WideCharToMultiByte(
|
101 |
# UINT CodePage, // code page
|
102 |
# DWORD dwFlags, // mapping flag
|
103 |
# LPCWSTR lpWideCharStr, // wide string (source)
|
104 |
# int cchWideChar, // length of wide string
|
105 |
# LPSTR lpMultiByteStr, // multibyte string (result)
|
106 |
# int cchMultiByte, // length of multibyte
|
107 |
# LPCSTR lpDefaultChar, // substition char of unmappable one
|
108 |
# LPBOOL lpUsedDefaultChar // substition char is used?
|
109 |
#);
|
110 |
## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_widechartomultibyte.htm>
|
111 |
|
112 |
my $W2M = new Win32::API (kernel32 => "WideCharToMultiByte", INPIPIPP => 'I');
|
113 |
my %U2M;
|
114 |
for my $R (0x00..0xFF) {
|
115 |
for my $C (0x00..0xFF) {
|
116 |
my $wc = pack 'CC', $C, $R; ## UTF-16LE
|
117 |
my $mb = "\x00" x 10;
|
118 |
$W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
|
119 |
$mb =~ s/\x00.*$//;
|
120 |
if (length $mb) {
|
121 |
$mb =~ s/(.)/sprintf '%02X', ord $1/ges;
|
122 |
$U2M{ 0x100 * $R + $C } = hex $mb;
|
123 |
}
|
124 |
}}
|
125 |
$U2M{0x0000} = 0x00;
|
126 |
|
127 |
for my $R1 (0xD8..0xDB) { for my $C1 (0x00..0xFF) {
|
128 |
for my $R2 (0xDC..0xDF) { for my $C2 (0x00..0xFF) {
|
129 |
my $wc = pack 'CC', $C1, $R1, $C2, $R2; ## UTF-16LE
|
130 |
my $mb = "\x00" x 10;
|
131 |
$W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
|
132 |
$mb =~ s/\x00.*$//;
|
133 |
if (length $mb) {
|
134 |
$mb =~ s/(.)/sprintf '%02X', ord $1/ges;
|
135 |
$U2M{ 0x10000 + (0x100 * $R1 + $C1 - 0xD800) * 0x400
|
136 |
+ (0x100 * $R2 + $C2 - 0xDC00) } = hex ($mb);
|
137 |
}
|
138 |
}}}}
|
139 |
|
140 |
print STDERR "Creating table...\n";
|
141 |
|
142 |
my @t;
|
143 |
for my $U (keys %U2M) {
|
144 |
if ($M2U{$U2M{$U}} == $U) {
|
145 |
push @t, sprintf '0x%04X U+%04X # %s',
|
146 |
$U2M{$U}, $U, NAME ($U);
|
147 |
delete $M2U{ $U2M{$U} };
|
148 |
} else {
|
149 |
push @t, sprintf '0x%04X U+%04X <- # %s',
|
150 |
$U2M{$U}, $U, NAME ($U);
|
151 |
}
|
152 |
}
|
153 |
for my $C (keys %M2U) {
|
154 |
push @t, sprintf '0x%04X U+%04X -> # %s',
|
155 |
$C, $M2U{$C}, NAME ($M2U{$C});
|
156 |
}
|
157 |
for (@t) {
|
158 |
s/^0x00(..)/0x$1/;
|
159 |
}
|
160 |
|
161 |
print "#; This file is auto-generated (at @{[ sprintf '%04d-%02d-%02dT%02d:%02d:%02dZ', (gmtime)[5]+1900, (gmtime)[4]+1, (gmtime)[3,2,1,0] ]}).\n";
|
162 |
print join "\n", @hdr, sort @t;
|
163 |
print "\n";
|
164 |
|
165 |
=head1 REQUIRES
|
166 |
|
167 |
L<Win32::API> (MUST), unicore/Name.pl (MUST; in perl 5.8.0),
|
168 |
L<Message::Field::UA> (SHOULD)
|
169 |
|
170 |
=head1 BUGS
|
171 |
|
172 |
0x00 or U+0000 can't be treated correctly so that
|
173 |
0x00 <-> U+0000 is hardcoded. This may cause problem
|
174 |
if the code page does not have 0x00 <-> U+0000 mapping.
|
175 |
|
176 |
Mapping to 0x00 or U+0000 is also having problem.
|
177 |
For example, CP20127, Microsoft mapping of ASCII,
|
178 |
has an entry of 0x80 -> U+0000. But this entry does
|
179 |
not occur in outputed table.
|
180 |
|
181 |
In other points, this script has a lot of "hardcoded"
|
182 |
code, so that making table of some unknown code page
|
183 |
can result incorrect mapping.
|
184 |
|
185 |
=head1 AUTHOR
|
186 |
|
187 |
Nanashi-san <nanashi@san.invalid>
|
188 |
|
189 |
=head1 LICENSE
|
190 |
|
191 |
Public Domain.
|
192 |
|
193 |
=cut
|
194 |
|
195 |
# $Date: 2002/11/03 11:08:15 $
|