whatpm/Whatpm/mklangreg.pl

#!/usr/bin/perl
use strict;

my $subtags;

my $langreg_source_file_name = shift;
{
  open my $langreg_source_file, '<', $langreg_source_file_name or
      die "$0: $langreg_source_file_name: $!";
  local $/ = undef;

  ## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
  for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
    my $fields = [['' => '']];
    for (split /\x0D?+\x0A/, $_) {
      if (/^\s/) { ## Part of continuous line
        $fields->[-1]->[1] .= $_;
      } elsif (s/^([^:\s]++)\s*+:\s*+//) { ## The first line of a |field|
        push @$fields, [$1 => $_];
      } else { ## An errorneous line
        push @$fields, ['' => $_];
      }
    }
    my $subtag;
    shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
    for (@$fields) {
      $subtag->{$_->[0]} ||= [];
      my $v = $_->[1];
      $v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
      push @{$subtag->{$_->[0]}}, $v;
    }
    if ($subtags) {
      my $tag_name_start = $subtag->{Subtag}->[0] || $subtag->{Tag}->[0];
      if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
        $subtag->{_canon} = '_titlecase';
      } elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
        $subtag->{_canon} = '_uppercase';
      } elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
        $subtag->{_canon} = '_lowercase';
      } else {
        $subtag->{_canon} = $tag_name_start;
      }
      $tag_name_start =~ tr/A-Z/a-z/;
      my $tag_name_end;
      if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
        $tag_name_start = $1;
        $tag_name_end = $2;
      } else {
        $tag_name_end = $tag_name_start;
      }
      for my $tag_name ($tag_name_start .. $tag_name_end) {
        if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
          warn "Duplicate tag: $tag_name\n";
        } else {
          $subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
        }
      }
    } else { ## The first record
      $subtags->{header} = $subtag;
    }
  }
}

## Remove unused data

$subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
delete $subtags->{header};

for my $type (grep {!/^_/} keys %{$subtags}) {
  for my $tag (keys %{$subtags->{$type}}) {
    my $subtag = $subtags->{$type}->{$tag};
    delete $subtag->{Comments};
    delete $subtag->{Description};
    delete $subtag->{Added};
    delete $subtag->{Tag};
    delete $subtag->{Subtag};
    delete $subtag->{Type};
    
    $subtag->{_deprecated} = 1 if $subtag->{Deprecated};
    delete $subtag->{Deprecated};

    $subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
        if defined $subtag->{'Preferred-Value'}->[0];
    delete $subtag->{'Preferred-Value'};

    if (defined $subtag->{'Suppress-Script'}->[0]) {
      $subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
      $subtag->{_suppress} =~ tr/A-Z/a-z/;
    }
    delete $subtag->{'Suppress-Script'};

    for (@{$subtag->{Prefix} or []}) {
      tr/A-Z/a-z/;
    }

    $subtag->{Prefix} = [sort {length $b <=> length $a or $a cmp $b}
                             @{$subtag->{Prefix}}] if $subtag->{Prefix};
  }
}

## Resolve transitive relationship of Preferred-Value field
## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.

for my $type (grep {!/^_/} keys %{$subtags}) {
  for my $tag (keys %{$subtags->{$type}}) {
    my $subtag = $subtags->{$type}->{$tag};
    my $new_subtag = $subtag;
    while (1) {
      my $preferred = $new_subtag->{_preferred};
      last unless defined $preferred;
      $preferred =~ tr/A-Z/a-z/;
      $new_subtag = $subtags->{$type}->{$preferred};
      last unless $new_subtag;
    }
  }
}

use Data::Dumper;
$Data::Dumper::Sortkeys = 1;
$Data::Dumper::Purity = 1;
my $value = Dumper $subtags;
$value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;

print $value;
print "1;\n";
print '__DATA__

=head1 NAME

mklangreg.pl - Generate language subtag registry object for langauge
tag validation

_LangTagReg.pm - A language subtag registry data module for language
tag validation

=head1 DESCRIPTION

The C<_LangTagReg.pm> file contains a list of registered language
subtags.  It is used by L<Whatpm::LangTag> for the purpose of language
tag validation.

The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
file from the IANA registry.

=head1 SEE ALSO

L<Whatpm::LangTag>.

RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.

IANA Language Subtag Registry
<http://www.iana.org/assignments/language-subtag-registry>.

=head1 LICENSE

The C<mklangreg.pl> is in the Public Domain.

=cut

';
1	#!/usr/bin/perl
2	use strict;
3
4	my $subtags;
5
6	my $langreg_source_file_name = shift;
7	{
8	open my $langreg_source_file, '<', $langreg_source_file_name or
9	die "$0: $langreg_source_file_name: $!";
10	local $/ = undef;
11
12	## NOTE: Based on RFC 4646 3.1.'s syntax, but more error-torelant.
13	for (split /\x0D?+\x0A%%\x0D?+\x0A/, <$langreg_source_file>) {
14	my $fields = [['' => '']];
15	for (split /\x0D?+\x0A/, $_) {
16	if (/^\s/) { ## Part of continuous line
17	$fields->[-1]->[1] .= $_;
18	} elsif (s/^([^:\s]++)\s+:\s+//) { ## The first line of a \|field\|
19	push @$fields, [$1 => $_];
20	} else { ## An errorneous line
21	push @$fields, ['' => $_];
22	}
23	}
24	my $subtag;
25	shift @$fields if $fields->[0]->[1] eq ''; # remove dummy if unused
26	for (@$fields) {
27	$subtag->{$_->[0]} \|\|= [];
28	my $v = $_->[1];
29	$v =~ s/&#x([0-9A-Fa-f]++);/chr hex $1/ge;
30	push @{$subtag->{$_->[0]}}, $v;
31	}
32	if ($subtags) {
33	my $tag_name_start = $subtag->{Subtag}->[0] \|\| $subtag->{Tag}->[0];
34	if ($tag_name_start =~ /^[A-Z][a-z]++(?>\.\.[A-Z][a-z]++)?+$/) {
35	$subtag->{_canon} = '_titlecase';
36	} elsif ($tag_name_start =~ /^[A-Z]++(?>\.\.[A-Z]++)?+$/) {
37	$subtag->{_canon} = '_uppercase';
38	} elsif ($tag_name_start =~ /^[a-z]++(?>\.\.[a-z]++)?+$/) {
39	$subtag->{_canon} = '_lowercase';
40	} else {
41	$subtag->{_canon} = $tag_name_start;
42	}
43	$tag_name_start =~ tr/A-Z/a-z/;
44	my $tag_name_end;
45	if ($tag_name_start =~ /^(.+)\.\.(.+)$/) {
46	$tag_name_start = $1;
47	$tag_name_end = $2;
48	} else {
49	$tag_name_end = $tag_name_start;
50	}
51	for my $tag_name ($tag_name_start .. $tag_name_end) {
52	if ($subtags->{$subtag->{Type}->[0]}->{$tag_name}) {
53	warn "Duplicate tag: $tag_name\n";
54	} else {
55	$subtags->{$subtag->{Type}->[0]}->{$tag_name} = $subtag;
56	}
57	}
58	} else { ## The first record
59	$subtags->{header} = $subtag;
60	}
61	}
62	}
63
64	## Remove unused data
65
66	$subtags->{_file_date} = $subtags->{header}->{'File-Date'}->[0];
67	delete $subtags->{header};
68
69	for my $type (grep {!/^_/} keys %{$subtags}) {
70	for my $tag (keys %{$subtags->{$type}}) {
71	my $subtag = $subtags->{$type}->{$tag};
72	delete $subtag->{Comments};
73	delete $subtag->{Description};
74	delete $subtag->{Added};
75	delete $subtag->{Tag};
76	delete $subtag->{Subtag};
77	delete $subtag->{Type};
78
79	$subtag->{_deprecated} = 1 if $subtag->{Deprecated};
80	delete $subtag->{Deprecated};
81
82	$subtag->{_preferred} = $subtag->{'Preferred-Value'}->[0]
83	if defined $subtag->{'Preferred-Value'}->[0];
84	delete $subtag->{'Preferred-Value'};
85
86	if (defined $subtag->{'Suppress-Script'}->[0]) {
87	$subtag->{_suppress} = $subtag->{'Suppress-Script'}->[0];
88	$subtag->{_suppress} =~ tr/A-Z/a-z/;
89	}
90	delete $subtag->{'Suppress-Script'};
91
92	for (@{$subtag->{Prefix} or []}) {
93	tr/A-Z/a-z/;
94	}
95
96	$subtag->{Prefix} = [sort {length $b <=> length $a or $a cmp $b}
97	@{$subtag->{Prefix}}] if $subtag->{Prefix};
98	}
99	}
100
101	## Resolve transitive relationship of Preferred-Value field
102	## NOTE: Although noted in RFC 4646, urrently no tag has such relationship.
103
104	for my $type (grep {!/^_/} keys %{$subtags}) {
105	for my $tag (keys %{$subtags->{$type}}) {
106	my $subtag = $subtags->{$type}->{$tag};
107	my $new_subtag = $subtag;
108	while (1) {
109	my $preferred = $new_subtag->{_preferred};
110	last unless defined $preferred;
111	$preferred =~ tr/A-Z/a-z/;
112	$new_subtag = $subtags->{$type}->{$preferred};
113	last unless $new_subtag;
114	}
115	}
116	}
117
118	use Data::Dumper;
119	$Data::Dumper::Sortkeys = 1;
120	$Data::Dumper::Purity = 1;
121	my $value = Dumper $subtags;
122	$value =~ s/\$VAR1\b/\$Whatpm::LangTag::Registry/g;
123
124	print $value;
125	print "1;\n";
126	print '__DATA__
127
128	=head1 NAME
129
130	mklangreg.pl - Generate language subtag registry object for langauge
131	tag validation
132
133	_LangTagReg.pm - A language subtag registry data module for language
134	tag validation
135
136	=head1 DESCRIPTION
137
138	The C<_LangTagReg.pm> file contains a list of registered language
139	subtags. It is used by L<Whatpm::LangTag> for the purpose of language
140	tag validation.
141
142	The C<mklangreg.pl> script is used to generate the C<_LangTagReg.pm>
143	file from the IANA registry.
144
145	=head1 SEE ALSO
146
147	L<Whatpm::LangTag>.
148
149	RFC 4646 (BCP 47) Tags for Identifying Languages <urn:ietf:rfc:4646>.
150
151	IANA Language Subtag Registry
152	<http://www.iana.org/assignments/language-subtag-registry>.
153
154	=head1 LICENSE
155
156	The C<mklangreg.pl> is in the Public Domain.
157
158	=cut
159
160	';