/[suikacvs]/markup/html/whatpm/Whatpm/mklangreg.pl
Suika

Contents of /markup/html/whatpm/Whatpm/mklangreg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations) (download)
Thu Sep 18 08:22:17 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.1: +1 -0 lines
File MIME type: text/plain
++ whatpm/t/ChangeLog	18 Sep 2008 08:22:10 -0000
	* langtag-1.dat: More test data from RFC 4646 sections 1.-2.2.4.
	are added.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Sep 2008 08:21:37 -0000
	* LangTag.pm: Warn for private use language subtags.  Error level
	typos fixed.  Support for Suppress-Script field.

	* mklangreg.pl: Support for dumping of nested structure.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

1 #!/usr/bin/perl
2 use strict;
3
4 my $subtags;
5
6 my $langreg_source_file_name = shift;
7 {
8 open my $langreg_source_file, '<', $langreg_source_file_name or
9 die "$0: $langreg_source_file_name: $!";
10 local $/ = undef;
11
12 ## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
13 for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
14 my $fields = [['' => '']];
15 for (split /\x0D?+\x0A/, $_) {
16 if (/^\s/) { ## Part of continuous line
17 $fields->[-1]->[1] .= $_;
18 } elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field|
19 push @$fields, [$1 => $_];
20 } else { ## An errorneous line
21 push @$fields, ['' => $_];
22 }
23 }
24 my $subtag;
25 shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
26 for (@$fields) {
27 $subtag->{$_->[0]} ||= [];
28 my $v = $_->[1];
29 $v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
30 push @{$subtag->{$_->[0]}}, $v;
31 }
32 if ($subtags) {
33 my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0];
34 if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
35 $subtag->{_canon} = '_titlecase';
36 } elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
37 $subtag->{_canon} = '_uppercase';
38 } elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
39 $subtag->{_canon} = '_lowercase';
40 } else {
41 $subtag->{_canon} = $tag_name_start;
42 }
43 $tag_name_start =~ tr/A-Z/a-z/;
44 my $tag_name_end;
45 if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
46 $tag_name_start = $1;
47 $tag_name_end = $2;
48 } else {
49 $tag_name_end = $tag_name_start;
50 }
51 for my $tag_name ($tag_name_start .. $tag_name_end) {
52 if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
53 warn "Duplicate tag: $tag_name\n";
54 } else {
55 $subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
56 }
57 }
58 } else { ## The first record
59 $subtags->{header} = $subtag;
60 }
61 }
62 }
63
64 ## Remove unused data
65
66 $subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
67 delete $subtags->{header};
68
69 for my $type (grep {!/^_/} keys %{$subtags}) {
70 for my $tag (keys %{$subtags->{$type}}) {
71 my $subtag = $subtags->{$type}->{$tag};
72 delete $subtag->{Comments};
73 delete $subtag->{Description};
74 delete $subtag->{Added};
75 delete $subtag->{Tag};
76 delete $subtag->{Subtag};
77 delete $subtag->{Type};
78
79 $subtag->{_deprecated} = 1 if $subtag->{Deprecated};
80 delete $subtag->{Deprecated};
81
82 $subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
83 if defined $subtag->{'Preferred-Value'}->[0];
84 delete $subtag->{'Preferred-Value'};
85
86 if (defined $subtag->{'Suppress-Script'}->[0]) {
87 $subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
88 $subtag->{_suppress} =~ tr/A-Z/a-z/;
89 }
90 delete $subtag->{'Suppress-Script'};
91
92 for (@{$subtag->{Prefix} or []}) {
93 tr/A-Z/a-z/;
94 }
95 }
96 }
97
98 ## Resolve transitive relationship of Preferred-Value field
99 ## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.
100
101 for my $type (grep {!/^_/} keys %{$subtags}) {
102 for my $tag (keys %{$subtags->{$type}}) {
103 my $subtag = $subtags->{$type}->{$tag};
104 my $new_subtag = $subtag;
105 while (1) {
106 my $preferred = $new_subtag->{_preferred};
107 last unless defined $preferred;
108 $preferred =~ tr/A-Z/a-z/;
109 $new_subtag = $subtags->{$type}->{$preferred};
110 last unless $new_subtag;
111 }
112 }
113 }
114
115 use Data::Dumper;
116 $Data::Dumper::Sortkeys = 1;
117 $Data::Dumper::Purity = 1;
118 my $value = Dumper $subtags;
119 $value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;
120
121 print $value;
122 print "1;\n";
123 print '__DATA__
124
125 =head1 NAME
126
127 mklangreg.pl - Generate language subtag registry object for langauge
128 tag validation
129
130 _LangTagReg.pm - A language subtag registry data module for language
131 tag validation
132
133 =head1 DESCRIPTION
134
135 The C<_LangTagReg.pm> file contains a list of registered language
136 subtags. It is used by L<Whatpm::LangTag> for the purpose of language
137 tag validation.
138
139 The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
140 file from the IANA registry.
141
142 =head1 SEE ALSO
143
144 L<Whatpm::LangTag>.
145
146 RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.
147
148 IANA Language Subtag Registry
149 <http://www.iana.org/assignments/language-subtag-registry>.
150
151 =head1 LICENSE
152
153 The C<mklangreg.pl> is in the Public Domain.
154
155 =cut
156
157 ';

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24