Table/tool/winapi2tbl.pl

#!/usr/bin/perl

=head1 NAME

winapi2tbl --- Mapping table generator (from Win32 API)

=head1 USAGE

 $ perl winapi2tbl.pl 932 > winapi-cp932.tbl

=cut

use strict;
require Win32::API;

my $CodePage = shift || 932;
my %S_range = (
        932     => [0x40..0x7E,0x80..0xFC],
        936     => [0x40..0x7E,0x80..0xFE],
        949     => [0x41..0x5A,0x61..0x7A,0x81..0xFE],
        950     => [0x40..0x7E,0xA1..0xFE],
        1361    => [0x31..0xFE],
        20000   => [0x21..0x7E,0xA1..0xFE],
        20932   => [0xA1..0xFE],
        20936   => [0xA1..0xFE],
        20949   => [0xA1..0xFE],
);

{
my @name = split /\n/, require 'unicore/Name.pl';
my %name;
for (@name) {
if (/^(....)            ([^\t]+)/) {
  $name{hex $1} = $2;
}
}
sub NAME ($) {
  $_[0] < 0x0020 ? '<control>' :
  $_[0] < 0x007F ? $name{$_[0]} :
  $_[0] < 0x00A0 ? '<control>' :
  $name{$_[0]} ? $name{$_[0]} :
  $_[0] < 0x00A0 ? '<control>' :
  $_[0] < 0x3400 ? '' :
  $_[0] < 0xA000 ? '<cjk>' :
  $_[0] < 0xE000 ? '<hangul>' :
  $_[0] < 0xF900 ? '<private>' :
  '';
}
}

my @hdr;
print STDERR "Checking system version...\n";
if (eval q{require Message::Field::UA}) {
  my $ua = new Message::Field::UA;
  $ua->add_our_name;
  $ua->replace_system_version ('ie', -prepend => 0);
  push @hdr, "#; Environment: $ua";
  print STDERR $hdr[$#hdr],"\n";
}

print STDERR "MultiByte -> WideChar...\n";
#int MultiByteToWideChar(
#  UINT CodePage,         // code page
#  DWORD dwFlags,         // mapping flag
#  LPCSTR lpMultiByteStr, // multibyte string (source)
#  int cchMultiByte,      // length of multibyte string
#  LPWSTR lpWideCharStr,  // wide string (result)
#  int cchWideChar        // length of wide string
#);
## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_multibytetowidechar.htm>

my $M2W = new Win32::API (kernel32 => "MultiByteToWideChar", INPIPI => 'I');
my %M2U;
for my $F (0x00..0xFF) {        ## one byte
  my $mb = pack 'C', $F;
  my $wc = "\x00" x 10;
  my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
  $wc = substr ($wc, 0, $len * 2);
  if (length ($wc) >= 2 && $wc ne "\x00\x00") {
    $wc =~ s/(.)(.)/$2$1/gs;
    $wc =~ s/(.)/sprintf '%02X', ord $1/ges;
    $M2U{ $F } = hex $wc;
  }
}
for my $F (0x80..0xFF) {        ## two bytes
for my $S (@{$S_range{$CodePage} || [0x01..0xFF] }) {
  my $mb = pack 'CC', $F, $S;
  my $wc = "\x00" x 10;
  my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
  $wc = substr ($wc, 0, $len * 2);
  if (length ($wc) >= 2) {
    $wc =~ s/(.)(.)/$2$1/gs;
    $wc =~ s/(.)/sprintf '%02X', ord $1/ges;
    $M2U{ 0x100 * $F + $S } = hex ($wc) unless 0x10000 * $M2U{$F} + $M2U{$S} == hex ($wc);
  }
}}
$M2U{0x00} = 0x0000;

print STDERR "WideChar -> MultiByte...\n";
#int WideCharToMultiByte(
#  UINT CodePage,               // code page
#  DWORD dwFlags,               // mapping flag
#  LPCWSTR lpWideCharStr,       // wide string (source)
#  int cchWideChar,             // length of wide string
#  LPSTR lpMultiByteStr,        // multibyte string (result)
#  int cchMultiByte,            // length of multibyte
#  LPCSTR lpDefaultChar,        // substition char of unmappable one
#  LPBOOL lpUsedDefaultChar     // substition char is used?
#);
## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_widechartomultibyte.htm>

my $W2M = new Win32::API (kernel32 => "WideCharToMultiByte", INPIPIPP => 'I');
my %U2M;
for my $R (0x00..0xFF) {
for my $C (0x00..0xFF) {
  my $wc = pack 'CC', $C, $R;   ## UTF-16LE
  my $mb = "\x00" x 10;
  $W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
  $mb =~ s/\x00.*$//;
  if (length $mb) {
    $mb =~ s/(.)/sprintf '%02X', ord $1/ges;
    $U2M{ 0x100 * $R + $C } = hex $mb;
  }
}}
$U2M{0x0000} = 0x00;

for my $R1 (0xD8..0xDB) { for my $C1 (0x00..0xFF) {
for my $R2 (0xDC..0xDF) { for my $C2 (0x00..0xFF) {
  my $wc = pack 'CC', $C1, $R1, $C2, $R2;       ## UTF-16LE
  my $mb = "\x00" x 10;
  $W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
  $mb =~ s/\x00.*$//;
  if (length $mb) {
    $mb =~ s/(.)/sprintf '%02X', ord $1/ges;
    $U2M{ 0x10000 + (0x100 * $R1 + $C1 - 0xD800) * 0x400
                  + (0x100 * $R2 + $C2 - 0xDC00) } = hex ($mb);
  }
}}}}

print STDERR "Creating table...\n";

my @t;
for my $U (keys %U2M) {
    if ($M2U{$U2M{$U}} == $U) {
      push @t, sprintf '0x%04X  U+%04X          # %s',
                       $U2M{$U}, $U, NAME ($U);
      delete $M2U{ $U2M{$U} };
    } else {
      push @t, sprintf '0x%04X  U+%04X  <-      # %s',
                       $U2M{$U}, $U, NAME ($U);
    }
}
for my $C (keys %M2U) {
  push @t, sprintf '0x%04X      U+%04X  ->      # %s',
                       $C, $M2U{$C}, NAME ($M2U{$C});
}
for (@t) {
  s/^0x00(..)/0x$1/;
}

print "#; This file is auto-generated (at @{[ sprintf '%04d-%02d-%02dT%02d:%02d:%02dZ', (gmtime)[5]+1900, (gmtime)[4]+1, (gmtime)[3,2,1,0] ]}).\n";
print join "\n", @hdr, sort @t;
print "\n";

=head1 REQUIRES

L<Win32::API> (MUST), unicore/Name.pl (MUST; in perl 5.8.0),
L<Message::Field::UA> (SHOULD)

=head1 BUGS

0x00 or U+0000 can't be treated correctly so that
0x00 <-> U+0000 is hardcoded.  This may cause problem
if the code page does not have 0x00 <-> U+0000 mapping.

Mapping to 0x00 or U+0000 is also having problem.
For example, CP20127, Microsoft mapping of ASCII,
has an entry of 0x80 -> U+0000.  But this entry does
not occur in outputed table.

In other points, this script has a lot of "hardcoded"
code, so that making table of some unknown code page
can result incorrect mapping.

=head1 AUTHOR

Nanashi-san <nanashi@san.invalid>

=head1 LICENSE

Public Domain.

=cut

# $Date: 2002/11/03 11:08:15 $
1	#!/usr/bin/perl
2
3	=head1 NAME
4
5	winapi2tbl --- Mapping table generator (from Win32 API)
6
7	=head1 USAGE
8
9	$ perl winapi2tbl.pl 932 > winapi-cp932.tbl
10
11	=cut
12
13	use strict;
14	require Win32::API;
15
16	my $CodePage = shift \|\| 932;
17	my %S_range = (
18	932 => [0x40..0x7E,0x80..0xFC],
19	936 => [0x40..0x7E,0x80..0xFE],
20	949 => [0x41..0x5A,0x61..0x7A,0x81..0xFE],
21	950 => [0x40..0x7E,0xA1..0xFE],
22	1361 => [0x31..0xFE],
23	20000 => [0x21..0x7E,0xA1..0xFE],
24	20932 => [0xA1..0xFE],
25	20936 => [0xA1..0xFE],
26	20949 => [0xA1..0xFE],
27	);
28
29	{
30	my @name = split /\n/, require 'unicore/Name.pl';
31	my %name;
32	for (@name) {
33	if (/^(....) ([^\t]+)/) {
34	$name{hex $1} = $2;
35	}
36	}
37	sub NAME ($) {
38	$_[0] < 0x0020 ? '<control>' :
39	$_[0] < 0x007F ? $name{$_[0]} :
40	$_[0] < 0x00A0 ? '<control>' :
41	$name{$_[0]} ? $name{$_[0]} :
42	$_[0] < 0x00A0 ? '<control>' :
43	$_[0] < 0x3400 ? '' :
44	$_[0] < 0xA000 ? '<cjk>' :
45	$_[0] < 0xE000 ? '<hangul>' :
46	$_[0] < 0xF900 ? '<private>' :
47	'';
48	}
49	}
50
51	my @hdr;
52	print STDERR "Checking system version...\n";
53	if (eval q{require Message::Field::UA}) {
54	my $ua = new Message::Field::UA;
55	$ua->add_our_name;
56	$ua->replace_system_version ('ie', -prepend => 0);
57	push @hdr, "#; Environment: $ua";
58	print STDERR $hdr[$#hdr],"\n";
59	}
60
61	print STDERR "MultiByte -> WideChar...\n";
62	#int MultiByteToWideChar(
63	# UINT CodePage, // code page
64	# DWORD dwFlags, // mapping flag
65	# LPCSTR lpMultiByteStr, // multibyte string (source)
66	# int cchMultiByte, // length of multibyte string
67	# LPWSTR lpWideCharStr, // wide string (result)
68	# int cchWideChar // length of wide string
69	#);
70	## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_multibytetowidechar.htm>
71
72	my $M2W = new Win32::API (kernel32 => "MultiByteToWideChar", INPIPI => 'I');
73	my %M2U;
74	for my $F (0x00..0xFF) { ## one byte
75	my $mb = pack 'C', $F;
76	my $wc = "\x00" x 10;
77	my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
78	$wc = substr ($wc, 0, $len * 2);
79	if (length ($wc) >= 2 && $wc ne "\x00\x00") {
80	$wc =~ s/(.)(.)/$2$1/gs;
81	$wc =~ s/(.)/sprintf '%02X', ord $1/ges;
82	$M2U{ $F } = hex $wc;
83	}
84	}
85	for my $F (0x80..0xFF) { ## two bytes
86	for my $S (@{$S_range{$CodePage} \|\| [0x01..0xFF] }) {
87	my $mb = pack 'CC', $F, $S;
88	my $wc = "\x00" x 10;
89	my $len = $M2W->Call ($CodePage, 0, $mb, length ($mb) => $wc, length ($wc));
90	$wc = substr ($wc, 0, $len * 2);
91	if (length ($wc) >= 2) {
92	$wc =~ s/(.)(.)/$2$1/gs;
93	$wc =~ s/(.)/sprintf '%02X', ord $1/ges;
94	$M2U{ 0x100 * $F + $S } = hex ($wc) unless 0x10000 * $M2U{$F} + $M2U{$S} == hex ($wc);
95	}
96	}}
97	$M2U{0x00} = 0x0000;
98
99	print STDERR "WideChar -> MultiByte...\n";
100	#int WideCharToMultiByte(
101	# UINT CodePage, // code page
102	# DWORD dwFlags, // mapping flag
103	# LPCWSTR lpWideCharStr, // wide string (source)
104	# int cchWideChar, // length of wide string
105	# LPSTR lpMultiByteStr, // multibyte string (result)
106	# int cchMultiByte, // length of multibyte
107	# LPCSTR lpDefaultChar, // substition char of unmappable one
108	# LPBOOL lpUsedDefaultChar // substition char is used?
109	#);
110	## <http://www.microsoft.com/japan/developer/library/jpwinpf/_win32_widechartomultibyte.htm>
111
112	my $W2M = new Win32::API (kernel32 => "WideCharToMultiByte", INPIPIPP => 'I');
113	my %U2M;
114	for my $R (0x00..0xFF) {
115	for my $C (0x00..0xFF) {
116	my $wc = pack 'CC', $C, $R; ## UTF-16LE
117	my $mb = "\x00" x 10;
118	$W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
119	$mb =~ s/\x00.*$//;
120	if (length $mb) {
121	$mb =~ s/(.)/sprintf '%02X', ord $1/ges;
122	$U2M{ 0x100 * $R + $C } = hex $mb;
123	}
124	}}
125	$U2M{0x0000} = 0x00;
126
127	for my $R1 (0xD8..0xDB) { for my $C1 (0x00..0xFF) {
128	for my $R2 (0xDC..0xDF) { for my $C2 (0x00..0xFF) {
129	my $wc = pack 'CC', $C1, $R1, $C2, $R2; ## UTF-16LE
130	my $mb = "\x00" x 10;
131	$W2M->Call ($CodePage, 0, $wc, length ($wc) / 2 => $mb, length ($mb), "\x00", 0);
132	$mb =~ s/\x00.*$//;
133	if (length $mb) {
134	$mb =~ s/(.)/sprintf '%02X', ord $1/ges;
135	$U2M{ 0x10000 + (0x100 * $R1 + $C1 - 0xD800) * 0x400
136	+ (0x100 * $R2 + $C2 - 0xDC00) } = hex ($mb);
137	}
138	}}}}
139
140	print STDERR "Creating table...\n";
141
142	my @t;
143	for my $U (keys %U2M) {
144	if ($M2U{$U2M{$U}} == $U) {
145	push @t, sprintf '0x%04X U+%04X # %s',
146	$U2M{$U}, $U, NAME ($U);
147	delete $M2U{ $U2M{$U} };
148	} else {
149	push @t, sprintf '0x%04X U+%04X <- # %s',
150	$U2M{$U}, $U, NAME ($U);
151	}
152	}
153	for my $C (keys %M2U) {
154	push @t, sprintf '0x%04X U+%04X -> # %s',
155	$C, $M2U{$C}, NAME ($M2U{$C});
156	}
157	for (@t) {
158	s/^0x00(..)/0x$1/;
159	}
160
161	print "#; This file is auto-generated (at @{[ sprintf '%04d-%02d-%02dT%02d:%02d:%02dZ', (gmtime)[5]+1900, (gmtime)[4]+1, (gmtime)[3,2,1,0] ]}).\n";
162	print join "\n", @hdr, sort @t;
163	print "\n";
164
165	=head1 REQUIRES
166
167	L<Win32::API> (MUST), unicore/Name.pl (MUST; in perl 5.8.0),
168	L<Message::Field::UA> (SHOULD)
169
170	=head1 BUGS
171
172	0x00 or U+0000 can't be treated correctly so that
173	0x00 <-> U+0000 is hardcoded. This may cause problem
174	if the code page does not have 0x00 <-> U+0000 mapping.
175
176	Mapping to 0x00 or U+0000 is also having problem.
177	For example, CP20127, Microsoft mapping of ASCII,
178	has an entry of 0x80 -> U+0000. But this entry does
179	not occur in outputed table.
180
181	In other points, this script has a lot of "hardcoded"
182	code, so that making table of some unknown code page
183	can result incorrect mapping.
184
185	=head1 AUTHOR
186
187	Nanashi-san <nanashi@san.invalid>
188
189	=head1 LICENSE
190
191	Public Domain.
192
193	=cut
194
195	# $Date: 2002/11/03 11:08:15 $