/[suikacvs]/markup/html/whatpm/Whatpm/mklangreg.pl
Suika

Contents of /markup/html/whatpm/Whatpm/mklangreg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (show annotations) (download)
Thu Sep 18 14:32:48 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +3 -0 lines
File MIME type: text/plain
++ whatpm/t/ChangeLog	18 Sep 2008 14:32:17 -0000
	* langtag-1.dat: More test data from RFC 4646 are added.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Sep 2008 14:31:58 -0000
	* LangTag.pm: Add checks for remaining requirements from RFC 4646.

	* mklangreg.pl: Sort 'Prefix' values by their length, to ease
	matching.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

1 #!/usr/bin/perl
2 use strict;
3
4 my $subtags;
5
6 my $langreg_source_file_name = shift;
7 {
8 open my $langreg_source_file, '<', $langreg_source_file_name or
9 die "$0: $langreg_source_file_name: $!";
10 local $/ = undef;
11
12 ## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
13 for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
14 my $fields = [['' => '']];
15 for (split /\x0D?+\x0A/, $_) {
16 if (/^\s/) { ## Part of continuous line
17 $fields->[-1]->[1] .= $_;
18 } elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field|
19 push @$fields, [$1 => $_];
20 } else { ## An errorneous line
21 push @$fields, ['' => $_];
22 }
23 }
24 my $subtag;
25 shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
26 for (@$fields) {
27 $subtag->{$_->[0]} ||= [];
28 my $v = $_->[1];
29 $v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
30 push @{$subtag->{$_->[0]}}, $v;
31 }
32 if ($subtags) {
33 my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0];
34 if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
35 $subtag->{_canon} = '_titlecase';
36 } elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
37 $subtag->{_canon} = '_uppercase';
38 } elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
39 $subtag->{_canon} = '_lowercase';
40 } else {
41 $subtag->{_canon} = $tag_name_start;
42 }
43 $tag_name_start =~ tr/A-Z/a-z/;
44 my $tag_name_end;
45 if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
46 $tag_name_start = $1;
47 $tag_name_end = $2;
48 } else {
49 $tag_name_end = $tag_name_start;
50 }
51 for my $tag_name ($tag_name_start .. $tag_name_end) {
52 if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
53 warn "Duplicate tag: $tag_name\n";
54 } else {
55 $subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
56 }
57 }
58 } else { ## The first record
59 $subtags->{header} = $subtag;
60 }
61 }
62 }
63
64 ## Remove unused data
65
66 $subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
67 delete $subtags->{header};
68
69 for my $type (grep {!/^_/} keys %{$subtags}) {
70 for my $tag (keys %{$subtags->{$type}}) {
71 my $subtag = $subtags->{$type}->{$tag};
72 delete $subtag->{Comments};
73 delete $subtag->{Description};
74 delete $subtag->{Added};
75 delete $subtag->{Tag};
76 delete $subtag->{Subtag};
77 delete $subtag->{Type};
78
79 $subtag->{_deprecated} = 1 if $subtag->{Deprecated};
80 delete $subtag->{Deprecated};
81
82 $subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
83 if defined $subtag->{'Preferred-Value'}->[0];
84 delete $subtag->{'Preferred-Value'};
85
86 if (defined $subtag->{'Suppress-Script'}->[0]) {
87 $subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
88 $subtag->{_suppress} =~ tr/A-Z/a-z/;
89 }
90 delete $subtag->{'Suppress-Script'};
91
92 for (@{$subtag->{Prefix} or []}) {
93 tr/A-Z/a-z/;
94 }
95
96 $subtag->{Prefix} = [sort {length $b <=> length $a or $a cmp $b}
97 @{$subtag->{Prefix}}] if $subtag->{Prefix};
98 }
99 }
100
101 ## Resolve transitive relationship of Preferred-Value field
102 ## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.
103
104 for my $type (grep {!/^_/} keys %{$subtags}) {
105 for my $tag (keys %{$subtags->{$type}}) {
106 my $subtag = $subtags->{$type}->{$tag};
107 my $new_subtag = $subtag;
108 while (1) {
109 my $preferred = $new_subtag->{_preferred};
110 last unless defined $preferred;
111 $preferred =~ tr/A-Z/a-z/;
112 $new_subtag = $subtags->{$type}->{$preferred};
113 last unless $new_subtag;
114 }
115 }
116 }
117
118 use Data::Dumper;
119 $Data::Dumper::Sortkeys = 1;
120 $Data::Dumper::Purity = 1;
121 my $value = Dumper $subtags;
122 $value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;
123
124 print $value;
125 print "1;\n";
126 print '__DATA__
127
128 =head1 NAME
129
130 mklangreg.pl - Generate language subtag registry object for langauge
131 tag validation
132
133 _LangTagReg.pm - A language subtag registry data module for language
134 tag validation
135
136 =head1 DESCRIPTION
137
138 The C<_LangTagReg.pm> file contains a list of registered language
139 subtags. It is used by L<Whatpm::LangTag> for the purpose of language
140 tag validation.
141
142 The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
143 file from the IANA registry.
144
145 =head1 SEE ALSO
146
147 L<Whatpm::LangTag>.
148
149 RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.
150
151 IANA Language Subtag Registry
152 <http://www.iana.org/assignments/language-subtag-registry>.
153
154 =head1 LICENSE
155
156 The C<mklangreg.pl> is in the Public Domain.
157
158 =cut
159
160 ';

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24