/[suikacvs]/markup/html/whatpm/Whatpm/mklangreg.pl
Suika

Contents of /markup/html/whatpm/Whatpm/mklangreg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Thu Sep 18 14:32:48 2008 UTC (16 years, 1 month ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +3 -0 lines
File MIME type: text/plain
++ whatpm/t/ChangeLog	18 Sep 2008 14:32:17 -0000
	* langtag-1.dat: More test data from RFC 4646 are added.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	18 Sep 2008 14:31:58 -0000
	* LangTag.pm: Add checks for remaining requirements from RFC 4646.

	* mklangreg.pl: Sort 'Prefix' values by their length, to ease
	matching.

2008-09-18  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 #!/usr/bin/perl
2     use strict;
3    
4     my $subtags;
5    
6     my $langreg_source_file_name = shift;
7     {
8     open my $langreg_source_file, '<', $langreg_source_file_name or
9     die "$0: $langreg_source_file_name: $!";
10     local $/ = undef;
11    
12     ## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
13     for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
14     my $fields = [['' => '']];
15     for (split /\x0D?+\x0A/, $_) {
16     if (/^\s/) { ## Part of continuous line
17     $fields->[-1]->[1] .= $_;
18     } elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field|
19     push @$fields, [$1 => $_];
20     } else { ## An errorneous line
21     push @$fields, ['' => $_];
22     }
23     }
24     my $subtag;
25     shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
26     for (@$fields) {
27     $subtag->{$_->[0]} ||= [];
28     my $v = $_->[1];
29     $v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
30     push @{$subtag->{$_->[0]}}, $v;
31     }
32     if ($subtags) {
33     my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0];
34     if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
35     $subtag->{_canon} = '_titlecase';
36     } elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
37     $subtag->{_canon} = '_uppercase';
38     } elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
39     $subtag->{_canon} = '_lowercase';
40     } else {
41     $subtag->{_canon} = $tag_name_start;
42     }
43     $tag_name_start =~ tr/A-Z/a-z/;
44     my $tag_name_end;
45     if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
46     $tag_name_start = $1;
47     $tag_name_end = $2;
48     } else {
49     $tag_name_end = $tag_name_start;
50     }
51     for my $tag_name ($tag_name_start .. $tag_name_end) {
52     if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
53     warn "Duplicate tag: $tag_name\n";
54     } else {
55     $subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
56     }
57     }
58     } else { ## The first record
59     $subtags->{header} = $subtag;
60     }
61     }
62     }
63    
64     ## Remove unused data
65    
66     $subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
67     delete $subtags->{header};
68    
69     for my $type (grep {!/^_/} keys %{$subtags}) {
70     for my $tag (keys %{$subtags->{$type}}) {
71     my $subtag = $subtags->{$type}->{$tag};
72     delete $subtag->{Comments};
73     delete $subtag->{Description};
74     delete $subtag->{Added};
75     delete $subtag->{Tag};
76     delete $subtag->{Subtag};
77     delete $subtag->{Type};
78    
79     $subtag->{_deprecated} = 1 if $subtag->{Deprecated};
80     delete $subtag->{Deprecated};
81    
82     $subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
83     if defined $subtag->{'Preferred-Value'}->[0];
84     delete $subtag->{'Preferred-Value'};
85    
86     if (defined $subtag->{'Suppress-Script'}->[0]) {
87     $subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
88     $subtag->{_suppress} =~ tr/A-Z/a-z/;
89     }
90     delete $subtag->{'Suppress-Script'};
91    
92     for (@{$subtag->{Prefix} or []}) {
93     tr/A-Z/a-z/;
94     }
95 wakaba 1.3
96     $subtag->{Prefix} = [sort {length $b <=> length $a or $a cmp $b}
97     @{$subtag->{Prefix}}] if $subtag->{Prefix};
98 wakaba 1.1 }
99     }
100    
101     ## Resolve transitive relationship of Preferred-Value field
102     ## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.
103    
104     for my $type (grep {!/^_/} keys %{$subtags}) {
105     for my $tag (keys %{$subtags->{$type}}) {
106     my $subtag = $subtags->{$type}->{$tag};
107     my $new_subtag = $subtag;
108     while (1) {
109     my $preferred = $new_subtag->{_preferred};
110     last unless defined $preferred;
111     $preferred =~ tr/A-Z/a-z/;
112     $new_subtag = $subtags->{$type}->{$preferred};
113     last unless $new_subtag;
114     }
115     }
116     }
117    
118     use Data::Dumper;
119     $Data::Dumper::Sortkeys = 1;
120 wakaba 1.2 $Data::Dumper::Purity = 1;
121 wakaba 1.1 my $value = Dumper $subtags;
122     $value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;
123    
124     print $value;
125     print "1;\n";
126     print '__DATA__
127    
128     =head1 NAME
129    
130     mklangreg.pl - Generate language subtag registry object for langauge
131     tag validation
132    
133     _LangTagReg.pm - A language subtag registry data module for language
134     tag validation
135    
136     =head1 DESCRIPTION
137    
138     The C<_LangTagReg.pm> file contains a list of registered language
139     subtags. It is used by L<Whatpm::LangTag> for the purpose of language
140     tag validation.
141    
142     The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
143     file from the IANA registry.
144    
145     =head1 SEE ALSO
146    
147     L<Whatpm::LangTag>.
148    
149     RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.
150    
151     IANA Language Subtag Registry
152     <http://www.iana.org/assignments/language-subtag-registry>.
153    
154     =head1 LICENSE
155    
156     The C<mklangreg.pl> is in the Public Domain.
157    
158     =cut
159    
160     ';

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24