1 |
wakaba |
1.1 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
|
4 |
|
|
use Getopt::Long; |
5 |
|
|
use Pod::Usage; |
6 |
|
|
|
7 |
|
|
my $lang; |
8 |
|
|
|
9 |
|
|
GetOptions ( |
10 |
|
|
'lang=s' => \$lang, |
11 |
|
|
'help' => sub { |
12 |
|
|
pod2usage (-exitval => 0, -verbose => 2); |
13 |
|
|
}, |
14 |
|
|
) or pod2usage (-exitval => 1, -verbose => 1); |
15 |
|
|
|
16 |
|
|
pod2usage (-exitval => 1, -verbose => 1, |
17 |
|
|
-msg => "Required argument --lang is not specified.\n") |
18 |
|
|
unless defined $lang; |
19 |
|
|
$lang =~ tr/A-Z/a-z/; ## ASCII case-insensitive. |
20 |
|
|
|
21 |
wakaba |
1.2 |
require Message::DOM::DOMImplementation; |
22 |
wakaba |
1.1 |
my $dom = Message::DOM::DOMImplementation->new; |
23 |
|
|
|
24 |
|
|
my $doc = $dom->create_document; |
25 |
|
|
$doc->manakai_is_html (1); |
26 |
|
|
|
27 |
|
|
{ |
28 |
|
|
binmode STDIN, ':encoding(utf8)'; |
29 |
|
|
local $/ = undef; |
30 |
|
|
$doc->inner_html (scalar <STDIN>); |
31 |
|
|
} |
32 |
|
|
|
33 |
|
|
my @remove; |
34 |
|
|
|
35 |
|
|
my $containers = $doc->query_selector_all |
36 |
|
|
('[data-lang-container], [data-lang-content]'); |
37 |
|
|
for my $el (@$containers) { |
38 |
|
|
next unless $el->parent_node; |
39 |
|
|
|
40 |
|
|
if ($el->has_attribute ('data-lang-container')) { |
41 |
|
|
my $content = get_content_element ($el); |
42 |
|
|
next unless $content; |
43 |
|
|
if ($el->get_attribute ('data-lang-container') eq 'replace') { |
44 |
|
|
$el->parent_node->replace_child ($content, $el); |
45 |
|
|
} else { |
46 |
|
|
$el->text_content (''); |
47 |
|
|
$el->append_child ($content); |
48 |
|
|
} |
49 |
|
|
} elsif ($el->has_attribute ('data-lang-content')) { |
50 |
|
|
my $idref = $el->get_attribute ('data-lang-content'); |
51 |
|
|
my $container = $doc->get_element_by_id ($idref); |
52 |
|
|
unless ($container) { |
53 |
|
|
warn "Element with ID $idref not found\n"; |
54 |
|
|
next; |
55 |
|
|
} |
56 |
|
|
|
57 |
|
|
my $content = get_content_element ($container); |
58 |
|
|
$el->text_content ($content->text_content); |
59 |
|
|
$el->manakai_html_language ($content->manakai_html_language); |
60 |
|
|
|
61 |
|
|
if ($container->has_attribute ('data-lang-declaration')) { |
62 |
|
|
push @remove, $container; |
63 |
|
|
} |
64 |
|
|
} |
65 |
|
|
} |
66 |
|
|
|
67 |
|
|
for (@remove) { |
68 |
|
|
next unless $_->parent_node; |
69 |
|
|
$_->parent_node->remove_child ($_); |
70 |
|
|
} |
71 |
|
|
|
72 |
|
|
$doc->document_element->manakai_html_language ($lang); |
73 |
|
|
|
74 |
|
|
binmode STDOUT, ':encoding(utf8)'; |
75 |
|
|
print STDOUT $doc->inner_html; |
76 |
|
|
|
77 |
|
|
sub get_content_element ($) { |
78 |
|
|
my $container = shift; |
79 |
|
|
|
80 |
|
|
my $c_el; |
81 |
|
|
for my $e (@{$container->child_nodes}) { |
82 |
|
|
next unless $e->node_type == $e->ELEMENT_NODE; |
83 |
|
|
my $e_lang = $e->manakai_html_language; |
84 |
|
|
$e_lang =~ tr/A-Z/a-z/; ## ASCII case-insensitive. |
85 |
|
|
if ($e_lang eq $lang) { |
86 |
|
|
$c_el = $e; |
87 |
|
|
last; |
88 |
|
|
} else { |
89 |
|
|
$c_el ||= $e; |
90 |
|
|
} |
91 |
|
|
} |
92 |
|
|
|
93 |
|
|
return $c_el; |
94 |
|
|
} # get_content_element |
95 |
|
|
|
96 |
wakaba |
1.2 |
__END__ |
97 |
|
|
|
98 |
|
|
=head1 NAME |
99 |
|
|
|
100 |
|
|
harusame.pl - Multilingual Web page management tool |
101 |
|
|
|
102 |
|
|
=head1 SYNOPSIS |
103 |
|
|
|
104 |
|
|
perl harusame.pl --lang LANGCODE < input.html > output.html |
105 |
|
|
|
106 |
|
|
perl harusame.pl --help |
107 |
|
|
|
108 |
|
|
=head1 DESCRIPTION |
109 |
|
|
|
110 |
|
|
The C<harusame.pl> script extracts a version of the HTML document |
111 |
|
|
written in the specified natural language, from a source HTML document |
112 |
|
|
that contains paragraphs in multiple natural languages. |
113 |
|
|
|
114 |
|
|
The document management of a multilingual Web site where there are |
115 |
|
|
multiple versions of a (conceptually same) document is somewhat |
116 |
|
|
difficult in general. If the author of an HTML document wants to edit |
117 |
|
|
a part of the document, then he or she has to ensure not to forget |
118 |
|
|
updating translations at the same time, otherwise documents in |
119 |
|
|
different language versions also differ in their content versions. |
120 |
|
|
|
121 |
|
|
Using the C<harusame.pl>, one can generate versions of an HTML |
122 |
|
|
document in different language from one source HTML document that |
123 |
|
|
contains paragraphs written in all of those languages, such that |
124 |
|
|
authors no longer have to manage different content versions and |
125 |
|
|
different language versions in separate files. |
126 |
|
|
|
127 |
|
|
=head1 ARGUMENTS |
128 |
|
|
|
129 |
|
|
The source document must be provided to the script using the |
130 |
|
|
I<standard input>. It must be encoded in UTF-8. |
131 |
|
|
|
132 |
|
|
The script outputs the generated document encoded in UTF-8 to the |
133 |
|
|
I<standard output>. |
134 |
|
|
|
135 |
|
|
Following command-line options are available to this script: |
136 |
|
|
|
137 |
|
|
=over 4 |
138 |
|
|
|
139 |
|
|
=item C<--help> |
140 |
|
|
|
141 |
|
|
Show the help message and exit. |
142 |
|
|
|
143 |
|
|
=item C<--lang I<LANGCODE>> (B<REQUIRED>) |
144 |
|
|
|
145 |
|
|
The language of the version to generate. This option must be |
146 |
|
|
specified. The value must be a value that is valid for HTML |
147 |
|
|
C<lang=""> attribute. |
148 |
|
|
|
149 |
|
|
=back |
150 |
|
|
|
151 |
|
|
=head1 SEE ALSO |
152 |
|
|
|
153 |
|
|
Readme L<http://suika.fam.cx/www/harusame/readme>. How to mark up the |
154 |
|
|
source HTML document is described in this document. |
155 |
|
|
|
156 |
|
|
=head1 AUTHOR |
157 |
|
|
|
158 |
|
|
Wakaba <w@suika.fam.cx>. |
159 |
|
|
|
160 |
|
|
=head1 LICENSE |
161 |
|
|
|
162 |
|
|
Copyright 2008 Wakaba <w@suika.fam.cx>. |
163 |
|
|
|
164 |
|
|
This library is free software; you can redistribute it and/or modify |
165 |
|
|
it under the same terms as Perl itself. |
166 |
|
|
|
167 |
|
|
=cut |
168 |
|
|
|
169 |
|
|
## $Date: 2008/10/21 08:36:59 $ |