1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
|
4 |
use Getopt::Long; |
5 |
use Pod::Usage; |
6 |
|
7 |
my $lang; |
8 |
|
9 |
GetOptions ( |
10 |
'lang=s' => \$lang, |
11 |
'help' => sub { |
12 |
pod2usage (-exitval => 0, -verbose => 2); |
13 |
}, |
14 |
) or pod2usage (-exitval => 1, -verbose => 1); |
15 |
|
16 |
pod2usage (-exitval => 1, -verbose => 1, |
17 |
-msg => "Required argument --lang is not specified.\n") |
18 |
unless defined $lang; |
19 |
$lang =~ tr/A-Z/a-z/; ## ASCII case-insensitive. |
20 |
|
21 |
require Message::DOM::DOMImplementation; |
22 |
my $dom = Message::DOM::DOMImplementation->new; |
23 |
|
24 |
my $doc = $dom->create_document; |
25 |
$doc->manakai_is_html (1); |
26 |
|
27 |
{ |
28 |
binmode STDIN, ':encoding(utf8)'; |
29 |
local $/ = undef; |
30 |
$doc->inner_html (scalar <STDIN>); |
31 |
} |
32 |
|
33 |
my @remove; |
34 |
|
35 |
my $containers = $doc->query_selector_all |
36 |
('[data-lang-container], [data-lang-content]'); |
37 |
for my $el (@$containers) { |
38 |
next unless $el->parent_node; |
39 |
|
40 |
if ($el->has_attribute ('data-lang-container')) { |
41 |
my $content = get_content_element ($el); |
42 |
next unless $content; |
43 |
if ($el->get_attribute ('data-lang-container') eq 'replace') { |
44 |
$el->parent_node->replace_child ($content, $el); |
45 |
} else { |
46 |
$el->text_content (''); |
47 |
$el->append_child ($content); |
48 |
} |
49 |
} elsif ($el->has_attribute ('data-lang-content')) { |
50 |
my $idref = $el->get_attribute ('data-lang-content'); |
51 |
my $container = $doc->get_element_by_id ($idref); |
52 |
unless ($container) { |
53 |
warn "Element with ID $idref not found\n"; |
54 |
next; |
55 |
} |
56 |
|
57 |
my $content = get_content_element ($container); |
58 |
$el->text_content ($content->text_content); |
59 |
$el->manakai_html_language ($content->manakai_html_language); |
60 |
|
61 |
if ($container->has_attribute ('data-lang-declaration')) { |
62 |
push @remove, $container; |
63 |
} |
64 |
} |
65 |
} |
66 |
|
67 |
for (@remove) { |
68 |
next unless $_->parent_node; |
69 |
$_->parent_node->remove_child ($_); |
70 |
} |
71 |
|
72 |
$doc->document_element->manakai_html_language ($lang); |
73 |
|
74 |
binmode STDOUT, ':encoding(utf8)'; |
75 |
print STDOUT $doc->inner_html; |
76 |
|
77 |
sub get_content_element ($) { |
78 |
my $container = shift; |
79 |
|
80 |
my $c_el; |
81 |
for my $e (@{$container->child_nodes}) { |
82 |
next unless $e->node_type == $e->ELEMENT_NODE; |
83 |
my $e_lang = $e->manakai_html_language; |
84 |
$e_lang =~ tr/A-Z/a-z/; ## ASCII case-insensitive. |
85 |
if ($e_lang eq $lang) { |
86 |
$c_el = $e; |
87 |
last; |
88 |
} else { |
89 |
$c_el ||= $e; |
90 |
} |
91 |
} |
92 |
|
93 |
return $c_el; |
94 |
} # get_content_element |
95 |
|
96 |
__END__ |
97 |
|
98 |
=head1 NAME |
99 |
|
100 |
harusame.pl - Multilingual Web page management tool |
101 |
|
102 |
=head1 SYNOPSIS |
103 |
|
104 |
perl harusame.pl --lang LANGCODE < input.html > output.html |
105 |
|
106 |
perl harusame.pl --help |
107 |
|
108 |
=head1 DESCRIPTION |
109 |
|
110 |
The C<harusame.pl> script extracts a version of the HTML document |
111 |
written in the specified natural language, from a source HTML document |
112 |
that contains paragraphs in multiple natural languages. |
113 |
|
114 |
The document management of a multilingual Web site where there are |
115 |
multiple versions of a (conceptually same) document is somewhat |
116 |
difficult in general. If the author of an HTML document wants to edit |
117 |
a part of the document, then he or she has to ensure not to forget |
118 |
updating translations at the same time, otherwise documents in |
119 |
different language versions also differ in their content versions. |
120 |
|
121 |
Using the C<harusame.pl>, one can generate versions of an HTML |
122 |
document in different language from one source HTML document that |
123 |
contains paragraphs written in all of those languages, such that |
124 |
authors no longer have to manage different content versions and |
125 |
different language versions in separate files. |
126 |
|
127 |
=head1 ARGUMENTS |
128 |
|
129 |
The source document must be provided to the script using the |
130 |
I<standard input>. It must be encoded in UTF-8. |
131 |
|
132 |
The script outputs the generated document encoded in UTF-8 to the |
133 |
I<standard output>. |
134 |
|
135 |
Following command-line options are available to this script: |
136 |
|
137 |
=over 4 |
138 |
|
139 |
=item C<--help> |
140 |
|
141 |
Show the help message and exit. |
142 |
|
143 |
=item C<--lang I<LANGCODE>> (B<REQUIRED>) |
144 |
|
145 |
The language of the version to generate. This option must be |
146 |
specified. The value must be a value that is valid for HTML |
147 |
C<lang=""> attribute. |
148 |
|
149 |
=back |
150 |
|
151 |
=head1 SEE ALSO |
152 |
|
153 |
Readme L<http://suika.fam.cx/www/harusame/readme>. How to mark up the |
154 |
source HTML document is described in this document. |
155 |
|
156 |
=head1 AUTHOR |
157 |
|
158 |
Wakaba <w@suika.fam.cx>. |
159 |
|
160 |
=head1 LICENSE |
161 |
|
162 |
Copyright 2008 Wakaba <w@suika.fam.cx>. |
163 |
|
164 |
This library is free software; you can redistribute it and/or modify |
165 |
it under the same terms as Perl itself. |
166 |
|
167 |
=cut |
168 |
|
169 |
## $Date: 2008/10/21 08:36:59 $ |