/[suikacvs]/markup/html/whatpm/Whatpm/mklangreg.pl
Suika

Contents of /markup/html/whatpm/Whatpm/mklangreg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Thu Sep 18 08:22:17 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
Changes since 1.1: +1 -0 lines
File MIME type: text/plain
++ whatpm/t/ChangeLog	18 Sep 2008 08:22:10 -0000
	* langtag-1.dat: More test data from RFC 4646 sections 1.-2.2.4.
	are added.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Sep 2008 08:21:37 -0000
	* LangTag.pm: Warn for private use language subtags.  Error level
	typos fixed.  Support for Suppress-Script field.

	* mklangreg.pl: Support for dumping of nested structure.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4     my $subtags;
5    
6     my $langreg_source_file_name = shift;
7     {
8     open my $langreg_source_file, '<', $langreg_source_file_name or
9     die "$0: $langreg_source_file_name: $!";
10     local $/ = undef;
11    
12     ## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
13     for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
14     my $fields = [['' => '']];
15     for (split /\x0D?+\x0A/, $_) {
16     if (/^\s/) { ## Part of continuous line
17     $fields->[-1]->[1] .= $_;
18     } elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field|
19     push @$fields, [$1 => $_];
20     } else { ## An errorneous line
21     push @$fields, ['' => $_];
22     }
23     }
24     my $subtag;
25     shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
26     for (@$fields) {
27     $subtag->{$_->[0]} ||= [];
28     my $v = $_->[1];
29     $v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
30     push @{$subtag->{$_->[0]}}, $v;
31     }
32     if ($subtags) {
33     my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0];
34     if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
35     $subtag->{_canon} = '_titlecase';
36     } elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
37     $subtag->{_canon} = '_uppercase';
38     } elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
39     $subtag->{_canon} = '_lowercase';
40     } else {
41     $subtag->{_canon} = $tag_name_start;
42     }
43     $tag_name_start =~ tr/A-Z/a-z/;
44     my $tag_name_end;
45     if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
46     $tag_name_start = $1;
47     $tag_name_end = $2;
48     } else {
49     $tag_name_end = $tag_name_start;
50     }
51     for my $tag_name ($tag_name_start .. $tag_name_end) {
52     if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
53     warn "Duplicate tag: $tag_name\n";
54     } else {
55     $subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
56     }
57     }
58     } else { ## The first record
59     $subtags->{header} = $subtag;
60     }
61     }
62     }
63    
64     ## Remove unused data
65    
66     $subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
67     delete $subtags->{header};
68    
69     for my $type (grep {!/^_/} keys %{$subtags}) {
70     for my $tag (keys %{$subtags->{$type}}) {
71     my $subtag = $subtags->{$type}->{$tag};
72     delete $subtag->{Comments};
73     delete $subtag->{Description};
74     delete $subtag->{Added};
75     delete $subtag->{Tag};
76     delete $subtag->{Subtag};
77     delete $subtag->{Type};
78    
79     $subtag->{_deprecated} = 1 if $subtag->{Deprecated};
80     delete $subtag->{Deprecated};
81    
82     $subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
83     if defined $subtag->{'Preferred-Value'}->[0];
84     delete $subtag->{'Preferred-Value'};
85    
86     if (defined $subtag->{'Suppress-Script'}->[0]) {
87     $subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
88     $subtag->{_suppress} =~ tr/A-Z/a-z/;
89     }
90     delete $subtag->{'Suppress-Script'};
91    
92     for (@{$subtag->{Prefix} or []}) {
93     tr/A-Z/a-z/;
94     }
95     }
96     }
97    
98     ## Resolve transitive relationship of Preferred-Value field
99     ## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.
100    
101     for my $type (grep {!/^_/} keys %{$subtags}) {
102     for my $tag (keys %{$subtags->{$type}}) {
103     my $subtag = $subtags->{$type}->{$tag};
104     my $new_subtag = $subtag;
105     while (1) {
106     my $preferred = $new_subtag->{_preferred};
107     last unless defined $preferred;
108     $preferred =~ tr/A-Z/a-z/;
109     $new_subtag = $subtags->{$type}->{$preferred};
110     last unless $new_subtag;
111     }
112     }
113     }
114    
115     use Data::Dumper;
116     $Data::Dumper::Sortkeys = 1;
117 wakaba 1.2 $Data::Dumper::Purity = 1;
118 wakaba 1.1 my $value = Dumper $subtags;
119     $value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;
120    
121     print $value;
122     print "1;\n";
123     print '__DATA__
124    
125     =head1 NAME
126    
127     mklangreg.pl - Generate language subtag registry object for langauge
128     tag validation
129    
130     _LangTagReg.pm - A language subtag registry data module for language
131     tag validation
132    
133     =head1 DESCRIPTION
134    
135     The C<_LangTagReg.pm> file contains a list of registered language
136     subtags. It is used by L<Whatpm::LangTag> for the purpose of language
137     tag validation.
138    
139     The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
140     file from the IANA registry.
141    
142     =head1 SEE ALSO
143    
144     L<Whatpm::LangTag>.
145    
146     RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.
147    
148     IANA Language Subtag Registry
149     <http://www.iana.org/assignments/language-subtag-registry>.
150    
151     =head1 LICENSE
152    
153     The C<mklangreg.pl> is in the Public Domain.
154    
155     =cut
156    
157     ';

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24