| 1 |
#!/usr/bin/perl |
| 2 |
use strict; |
| 3 |
|
| 4 |
my $subtags; |
| 5 |
|
| 6 |
my $langreg_source_file_name = shift; |
| 7 |
{ |
| 8 |
open my $langreg_source_file, '<', $langreg_source_file_name or |
| 9 |
die "$0: $langreg_source_file_name: $!"; |
| 10 |
local $/ = undef; |
| 11 |
|
| 12 |
## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant. |
| 13 |
for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) { |
| 14 |
my $fields = [['' => '']]; |
| 15 |
for (split /\x0D?+\x0A/, $_) { |
| 16 |
if (/^\s/) { ## Part of continuous line |
| 17 |
$fields->[-1]->[1] .= $_; |
| 18 |
} elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field| |
| 19 |
push @$fields, [$1 => $_]; |
| 20 |
} else { ## An errorneous line |
| 21 |
push @$fields, ['' => $_]; |
| 22 |
} |
| 23 |
} |
| 24 |
my $subtag; |
| 25 |
shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused |
| 26 |
for (@$fields) { |
| 27 |
$subtag->{$_->[0]} ||= []; |
| 28 |
my $v = $_->[1]; |
| 29 |
$v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge; |
| 30 |
push @{$subtag->{$_->[0]}}, $v; |
| 31 |
} |
| 32 |
if ($subtags) { |
| 33 |
my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0]; |
| 34 |
if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) { |
| 35 |
$subtag->{_canon} = '_titlecase'; |
| 36 |
} elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) { |
| 37 |
$subtag->{_canon} = '_uppercase'; |
| 38 |
} elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) { |
| 39 |
$subtag->{_canon} = '_lowercase'; |
| 40 |
} else { |
| 41 |
$subtag->{_canon} = $tag_name_start; |
| 42 |
} |
| 43 |
$tag_name_start =~ tr/A-Z/a-z/; |
| 44 |
my $tag_name_end; |
| 45 |
if ($tag_name_start =~ /^(.+)\.\.(.+)$/) { |
| 46 |
$tag_name_start = $1; |
| 47 |
$tag_name_end = $2; |
| 48 |
} else { |
| 49 |
$tag_name_end = $tag_name_start; |
| 50 |
} |
| 51 |
for my $tag_name ($tag_name_start .. $tag_name_end) { |
| 52 |
if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) { |
| 53 |
warn "Duplicate tag: $tag_name\n"; |
| 54 |
} else { |
| 55 |
$subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag; |
| 56 |
} |
| 57 |
} |
| 58 |
} else { ## The first record |
| 59 |
$subtags->{header} = $subtag; |
| 60 |
} |
| 61 |
} |
| 62 |
} |
| 63 |
|
| 64 |
## Remove unused data |
| 65 |
|
| 66 |
$subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0]; |
| 67 |
delete $subtags->{header}; |
| 68 |
|
| 69 |
for my $type (grep {!/^_/} keys %{$subtags}) { |
| 70 |
for my $tag (keys %{$subtags->{$type}}) { |
| 71 |
my $subtag = $subtags->{$type}->{$tag}; |
| 72 |
delete $subtag->{Comments}; |
| 73 |
delete $subtag->{Description}; |
| 74 |
delete $subtag->{Added}; |
| 75 |
delete $subtag->{Tag}; |
| 76 |
delete $subtag->{Subtag}; |
| 77 |
delete $subtag->{Type}; |
| 78 |
|
| 79 |
$subtag->{_deprecated} = 1 if $subtag->{Deprecated}; |
| 80 |
delete $subtag->{Deprecated}; |
| 81 |
|
| 82 |
$subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0] |
| 83 |
if defined $subtag->{'Preferred-Value'}->[0]; |
| 84 |
delete $subtag->{'Preferred-Value'}; |
| 85 |
|
| 86 |
if (defined $subtag->{'Suppress-Script'}->[0]) { |
| 87 |
$subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0]; |
| 88 |
$subtag->{_suppress} =~ tr/A-Z/a-z/; |
| 89 |
} |
| 90 |
delete $subtag->{'Suppress-Script'}; |
| 91 |
|
| 92 |
for (@{$subtag->{Prefix} or []}) { |
| 93 |
tr/A-Z/a-z/; |
| 94 |
} |
| 95 |
|
| 96 |
$subtag->{Prefix} = [sort {length $b <=> length $a or $a cmp $b} |
| 97 |
@{$subtag->{Prefix}}] if $subtag->{Prefix}; |
| 98 |
} |
| 99 |
} |
| 100 |
|
| 101 |
## Resolve transitive relationship of Preferred-Value field |
| 102 |
## NOTE: Although noted in RFC 4646, urrently no tag has such relationship. |
| 103 |
|
| 104 |
for my $type (grep {!/^_/} keys %{$subtags}) { |
| 105 |
for my $tag (keys %{$subtags->{$type}}) { |
| 106 |
my $subtag = $subtags->{$type}->{$tag}; |
| 107 |
my $new_subtag = $subtag; |
| 108 |
while (1) { |
| 109 |
my $preferred = $new_subtag->{_preferred}; |
| 110 |
last unless defined $preferred; |
| 111 |
$preferred =~ tr/A-Z/a-z/; |
| 112 |
$new_subtag = $subtags->{$type}->{$preferred}; |
| 113 |
last unless $new_subtag; |
| 114 |
} |
| 115 |
} |
| 116 |
} |
| 117 |
|
| 118 |
use Data::Dumper; |
| 119 |
$Data::Dumper::Sortkeys = 1; |
| 120 |
$Data::Dumper::Purity = 1; |
| 121 |
my $value = Dumper $subtags; |
| 122 |
$value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g; |
| 123 |
|
| 124 |
print $value; |
| 125 |
print "1;\n"; |
| 126 |
print '__DATA__ |
| 127 |
|
| 128 |
=head1 NAME |
| 129 |
|
| 130 |
mklangreg.pl - Generate language subtag registry object for langauge |
| 131 |
tag validation |
| 132 |
|
| 133 |
_LangTagReg.pm - A language subtag registry data module for language |
| 134 |
tag validation |
| 135 |
|
| 136 |
=head1 DESCRIPTION |
| 137 |
|
| 138 |
The C<_LangTagReg.pm> file contains a list of registered language |
| 139 |
subtags. It is used by L<Whatpm::LangTag> for the purpose of language |
| 140 |
tag validation. |
| 141 |
|
| 142 |
The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm> |
| 143 |
file from the IANA registry. |
| 144 |
|
| 145 |
=head1 SEE ALSO |
| 146 |
|
| 147 |
L<Whatpm::LangTag>. |
| 148 |
|
| 149 |
RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>. |
| 150 |
|
| 151 |
IANA Language Subtag Registry |
| 152 |
<http://www.iana.org/assignments/language-subtag-registry>. |
| 153 |
|
| 154 |
=head1 LICENSE |
| 155 |
|
| 156 |
The C<mklangreg.pl> is in the Public Domain. |
| 157 |
|
| 158 |
=cut |
| 159 |
|
| 160 |
'; |