package Whatpm::H2H;
use strict;
sub H2H_NS () { q }
sub HTML_NS () { q }
sub HTML3_NS () { q }
sub SW09_NS () { q }
sub XHTML2_NS () { q }
sub parse_string ($$$) {
my $self = bless {
token => [],
location => {},
doc => $_[2],
}, $_[0];
my $s = ''.$_[1];
$s =~ s/\x0D\x0A/\x0A/g;
$s =~ tr/\x0D/\x0A/;
$self->{line} = [split /\x0A/, $s];
local $Error::Depth = $Error::Depth + 1;
$self->{doc}->strict_error_checking (0);
my $doc_el = $self->{doc}->create_element_ns (HTML_NS, 'html');
$doc_el->set_attribute_ns (q, 'xmlns', HTML_NS);
$self->{doc}->append_child ($doc_el);
$self->_construct_tree;
return $self->{doc};
} # parse_string
sub _shift_token ($) {
my $self = $_[0];
if (@{$self->{token}}) {
return shift @{$self->{token}};
}
my $attrvalue = sub {
my $v = shift;
$v =~ s/"/"/g;
$v =~ s/<//g;
$v =~ s/®/\x{00AE}/g;
$v =~ s/♥/\x{2661}/g;
$v =~ s/&/&/g;
return $v;
};
my $uriv = sub {
my $v = $attrvalue->(shift);
$v =~ s/^\{/(/;
$v =~ s/\}$/)/;
$v =~ s/^\#([0-9si]+)$/($1)/;
$v =~ s/^\(([0-9]{4})([0-9]{2})([0-9]{2})([^)]*)\)$/($1, $2, $3$4)/;
$v =~ s/[si]/, /g if $v =~ /^\(/ and $v =~ /\)$/;
return $v;
};
my $r = {type => '#EOF'};
L: while (defined (my $line = shift @{$self->{line}})) {
if ($line =~ s/^([A-Z]+|T[0-9])(\*?\+?\*?)(?:\s+|$)//) {
my $command = $1;
my $flag = $2;
$r = {type => 'start', value => $command};
my $uri;
if ($flag =~ /\*/ and $line =~ s/^([^{\s]\S*)\s*//) {
$uri = $1;
}
my $attr = '';
if ($line =~ s/^\{(\s*(?:[A-Za-z][^{}]*)?)\}\s*//) {
$attr = $1;
}
if (not defined $uri and
$flag =~ /\*/ and $line =~ s/^([^{\s]\S*)\s*//) {
$uri = $1;
}
my @token;
my $info = {
# val# val#(*)
ABBR => [2, 2],
ACRONYM => [2, 2],
CITE => [2, 1],
LDIARY => [4, 4],
LIMG => [4, 4],
LINK => [2, 1],
LMG => [2, 2],
LNEW => [2, 2],
PERSON => [2, 2],
RIBU => [2, 2],
RUBY => [2, 2],
SEE => [2, 2],
}->{$command};
my @value = split /\s+/, $line,
($flag =~ /\*/ ? $info->[1] : $info->[0]) || 1;
push @token, {type => 'uri', value => $uriv->($uri)} if defined $uri;
my %attr;
while ($attr =~ /([A-Za-z0-9_-]+)\s*(?:=>?|:)\s*([^";,]+|"[^"]+")/gc) {
my $name = lc $1;
my $value = $2;
$value =~ tr/"//d;
$attr{$name} = $value;
}
delete $attr{'content-type'};
if ({
ABBR => 1, ACRONYM => 1,
RUBY => 1, RIBU => 1,
}->{$command}) {
if (@value == 1 and $attr{title}) {
push @value, $attr{title};
delete $attr{title};
}
if (@value == 2) {
unshift @{$self->{line}},
'RB',
'DATA {} >>'.$value[1],
'/RB',
'RT',
'DATA {} >>'.$value[0],
'/RT',
'/'.$command;
} else {
unshift @{$self->{line}}, 'DATA {} >>'.$value[0], '/'.$command;
}
} elsif ($command eq 'CITE') {
if (@value == 2) {
if (defined $uri or $value[0] !~ /^[a-z-]+:/) {
unshift @{$self->{line}},
'SRC',
'DATA {} >>'.$value[0].' '.$value[1],
'/SRC', 'BODYTEXT';
} else {
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}},
'SRC',
'DATA {} >>'.$value[1],
'/SRC', 'BODYTEXT';
}
} elsif (@value == 1) {
if (defined $uri or $value[0] !~ /^[a-z-]+:/) {
unshift @{$self->{line}},
'SRC',
'DATA {} >>'.$value[0],
'/SRC', 'BODYTEXT';
} else {
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}}, 'BODYTEXT';
}
} else {
unshift @{$self->{line}}, 'BODYTEXT';
}
} elsif ($command eq 'DATA') {
my @token;
$line =~ s/^>>//;
while (length $line) {
if ($line =~ s/^<([a-z0-9]+)\s*//) {
my $tagname = $1;
push @token, {type => 'start', value => $tagname};
while ($line =~ s/^([a-z-]+)\s*=\s*(?>"([^"]*)"|([^\s">]*))\s*//) {
push @token, {type => $1,
value => $attrvalue->(defined $2 ? $2 : $3)};
}
$line =~ s#^/?\s*>##;
push @token, {type => 'end', value => $tagname}
if $tagname eq 'img' or $tagname eq 'input' or
$tagname eq 'br';
} elsif ($line =~ s#^([a-z0-9]+)\s*>##) {
push @token, {type => 'end', value => $1};
} elsif ($line =~ s/^//) {
push @token, {type => 'html-comment', value => $1};
} elsif ($line =~ s/^&([a-z]+);//) {
my $name = $1;
if ($name eq 'amp') {
push @token, {type => 'text', value => '&'};
} elsif ($name eq 'lt') {
push @token, {type => 'text', value => '<'};
} elsif ($name eq 'gt') {
push @token, {type => 'text', value => '>'};
} elsif ($name eq 'quot') {
push @token, {type => 'text', value => '"'};
} elsif ($name eq 'reg') {
push @token, {type => 'text', value => "\x{00AE}"};
} elsif ($name eq 'hearts') {
push @token, {type => 'text', value => "\x{2661}"};
} else {
push @token, {type => 'char', value => $name};
}
} elsif ($line =~ s/^([0-9]+);//) {
push @token, {type => 'text', value => ord $1};
} elsif ($line =~ s/^([0-9A-Fa-f]+);//) {
push @token, {type => 'text', value => ord hex $1};
} elsif ($line =~ s/^([^<&]+)//) {
push @token, {type => 'text', value => $1};
} else {
push @token, {type => 'text', value => substr ($line, 0, 1)};
substr ($line, 0, 1) = '';
}
}
push @token, {type => 'eol'};
$r = shift @token;
push @{$self->{token}}, @token;
last L;
} elsif ({
DD => 1, DT => 1,
DEL => 1, INS => 1,
LI => 1,
RB => 1, RT => 1,
STRONG => 1,
YAMI => 1,
EM => 1,
HOUR => 1, KION => 1, LUNCH => 1,
TAION => 1, TENKI => 1, THEME => 1,
T1 => 1, T2 => 1, T3 => 1, T4 => 1,
T5 => 1, T6 => 1, T7 => 1, SP => 1,
}->{$command}) {
if (@value) {
unshift @{$self->{line}}, 'DATA {} >>'.$value[0], '/'.$command;
}
} elsif ($command eq 'DIV') {
if (@value) {
$r = {type => 'class', value => $value[0]};
}
} elsif ($command eq 'LDIARY') {
$value[0] =~ s/^([0-9]{4})([0-9]{2})([0-9]{2})/$1, $2, $3/;
$value[0] =~ s/[is]/, /;
$r = {type => 'start', value => 'LINK'};
push @token, {type => 'uri', value => "($value[0])"};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/LINK';
} elsif ($command eq 'LIMG') {
$r = {type => 'start', value => 'IMG'};
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[3], '/IMG';
} elsif ($command eq 'LMG') {
$r = {type => 'start', value => 'IMG'};
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/IMG';
} elsif ($command eq 'LINK') {
if (@value == 2) {
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/LINK';
} elsif ($flag =~ /\+/) {
push @token, {type => 'uri', value => $uriv->($value[0])};
} else {
unshift @{$self->{line}}, 'DATA {} >>'.$value[0], '/LINK';
}
} elsif ($command eq 'NEW') {
$r = {type => 'start', value => 'SECTION'};
unshift @{$self->{line}}, 'H', 'DATA {} >>'.$value[0], '/H';
} elsif ($command eq 'LNEW') {
$r = {type => 'start', value => 'SECTION'};
push @token, {type => 'uri', value => $uriv->($value[0])};
unshift @{$self->{line}}, 'H', 'DATA {} >>'.$value[1], '/H';
} elsif ($command eq 'SUB') {
$r = {type => 'start', value => 'SUB'};
unshift @{$self->{line}}, 'H', 'DATA {} >>'.$value[0], '/H';
} elsif ($command eq 'PERSON') {
push @token, {type => 'key', value => $attrvalue->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/PERSON';
} elsif ($command eq 'SEE') {
if (@value == 2) {
push @token, {type => 'key', value => $attrvalue->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/SEE';
} else {
unshift @{$self->{line}}, 'DATA {} >>'.$value[0], '/SEE';
}
} elsif ($command eq 'SPAN') {
if (@value == 2) {
push @token, {type => 'class',
value => $attrvalue->($value[0])};
unshift @{$self->{line}}, 'DATA {} >>'.$value[1], '/SPAN';
} else {
unshift @{$self->{line}}, 'DATA {} >>'.$value[0], '/SPAN';
}
} elsif ($command eq 'OK') {
$r = {type => '#EOF'};
next L;
} elsif ($command eq 'XML') {
unshift @{$self->{line}}, 'DATA {} >>XML '.$line;
next L;
}
for (keys %attr) {
push @token, {type => $_, value => $attrvalue->($attr{$_})};
}
push @{$self->{token}}, @token;
last L;
} elsif ($line eq 'H2H/1.0') {
$r = {type => 'magic', value => 'H2H/1.0'};
last L;
} elsif ($line =~ m#^/([A-Z]+)\s*$#) {
$r = {type => 'end', value => $1};
last L;
} elsif ($line =~ s/^!#//) {
$r = {type => 'hnf-comment', value => $line};
last L;
} elsif ($line =~ s/^!//) {
$r = {type => 'html-comment', value => $line};
last L;
} else {
unshift @{$self->{line}}, 'DATA {} >>'.$line;
next L;
}
} # L
return $r;
} # _shift_token
sub _construct_tree ($) {
my $self = $_[0];
my $doc_el = $self->{doc}->document_element;
my $head_el = $self->{doc}->create_element_ns (HTML_NS, 'head');
my $body_el = $self->{doc}->create_element_ns (HTML_NS, 'body');
$doc_el->append_child ($head_el);
$doc_el->append_child ($body_el);
$doc_el->set_user_data ('command-name' => '#html');
$head_el->set_user_data ('command-name' => '#head');
$body_el->set_user_data ('command-name' => '#body');
$doc_el->set_attribute_ns (SW09_NS, 'sw9:Name' => 'H2H');
$doc_el->set_attribute_ns (SW09_NS, 'sw9:Version' => '0.9');
my $parent = {
subsection => $body_el,
attr => $body_el,
text => $body_el,
};
my $state = 'data';
## data - normal
## list - UL or OL
## br - after br start tag token
## eol - after eol token
T: while (my $token = $self->_shift_token) {
last T if $token->{type} eq '#EOF';
if ($token->{type} eq 'text') {
if ($state eq 'list') {
my $li_el = $self->{doc}->create_element_ns (HTML_NS, 'li');
$li_el->manakai_append_text ($token->{value});
$parent->{text}->append_child ($li_el);
} else {
$parent->{text}->manakai_append_text ("\x0A") if $state eq 'eol';
$parent->{text}->manakai_append_text ($token->{value});
$state = 'data';
}
} elsif ($token->{type} eq 'eol') {
if ($state eq 'eol') {
$parent->{text}->manakai_append_text ("\x0A");
} else {
$state = $state eq 'br' ? 'data' : 'eol';
}
} elsif ($token->{type} eq 'start') {
my $info = {
# nsuri, qname, parent, state
ABBR => [HTML_NS, 'abbr', $parent->{text}, 'data'],
ACRONYM => [HTML_NS, 'abbr', $parent->{text}, 'data'],
BODYTEXT => [HTML3_NS, 'bodytext', $parent->{text}, 'data'],
CITE => [HTML_NS, 'blockquote', $parent->{text}, 'data',
{PRE => 1}],
DD => [HTML_NS, 'dd', $parent->{text}, 'data'],
DEL => [HTML_NS, 'del', $parent->{text}, 'data'],
DIV => [HTML_NS, 'div', $parent->{text}, 'data', {P => 1}],
DL => [HTML_NS, 'dl', $parent->{text}, 'data'],
DT => [HTML_NS, 'dt', $parent->{text}, 'data'],
EM => [HTML_NS, 'em', $parent->{text}, 'data'],
FN => [H2H_NS, 'fn', $parent->{text}, 'data'],
H => [XHTML2_NS, 'h', $parent->{text}, 'data'],
HOUR => [H2H_NS, 'hour', $head_el, 'data'],
IMG => [HTML_NS, 'img', $parent->{text}, 'data'],
INS => [HTML_NS, 'ins', $parent->{text}, 'data'],
KION => [H2H_NS, 'kion', $head_el, 'data'],
LI => [HTML_NS, 'li', $parent->{text}, 'data'],
LINK => [HTML_NS, 'a', $parent->{text}, 'data'],
LUNCH => [H2H_NS, 'lunch', $head_el, 'data'],
OL => [HTML_NS, 'ol', $parent->{text}, 'list', {PRE => 1}],
P => [HTML_NS, 'p', $parent->{text}, 'data',
{P => 1, PRE => 1}],
PERSON => [HTML3_NS, 'person', $parent->{text}, 'data'],
PRE => [HTML_NS, 'pre', $parent->{text}, 'data',
{P => 1}],
RB => [HTML_NS, 'rb', $parent->{text}, 'data'],
RIBU => [HTML_NS, 'ruby', $parent->{text}, 'data'],
RT => [HTML_NS, 'rt', $parent->{text}, 'data'],
RUBY => [HTML_NS, 'ruby', $parent->{text}, 'data'],
SECTION => [HTML_NS, 'section', $body_el, 'data'],
SEE => [HTML_NS, 'i', $parent->{text}, 'data'],
SP => [H2H_NS, 'sp', $head_el, 'data'],
SPAN => [HTML_NS, 'span', $parent->{text}, 'data'],
SRC => [HTML3_NS, 'credit', $parent->{text}, 'data'],
STRONG => [HTML_NS, 'strong', $parent->{text}, 'data'],
SUBSECTION => [HTML_NS, 'section', $parent->{subsection}, 'data'],
T1 => [H2H_NS, 't1', $head_el, 'data'],
T2 => [H2H_NS, 't2', $head_el, 'data'],
T3 => [H2H_NS, 't3', $head_el, 'data'],
T4 => [H2H_NS, 't4', $head_el, 'data'],
T5 => [H2H_NS, 't5', $head_el, 'data'],
T6 => [H2H_NS, 't6', $head_el, 'data'],
T7 => [H2H_NS, 't7', $head_el, 'data'],
TAION => [H2H_NS, 'taion', $head_el, 'data'],
TENKI => [H2H_NS, 'tenki', $head_el, 'data'],
THEME => [H2H_NS, 'theme', $head_el, 'data'],
UL => [HTML_NS, 'ul', $parent->{text}, 'list', {PRE => 1}],
YAMI => [H2H_NS, 'yami', $parent->{text}, 'data'],
a => [HTML_NS, 'a', $parent->{text}, 'data'],
abbr => [HTML_NS, 'abbr', $parent->{text}, 'data'],
acronym => [HTML_NS, 'abbr', $parent->{text}, 'data'],
b => [HTML_NS, 'b', $parent->{text}, 'data'],
blockquote => [HTML_NS, 'blockquote', $parent->{text}, 'data'],
br => [HTML_NS, 'br', $parent->{text}, 'br'],
caption => [HTML_NS, 'caption', $parent->{text}, 'data'],
code => [HTML_NS, 'code', $parent->{text}, 'data'],
dd => [HTML_NS, 'dd', $parent->{text}, 'data'],
del => [HTML_NS, 'del', $parent->{text}, 'data'],
dfn => [HTML_NS, 'dfn', $parent->{text}, 'data'],
div => [HTML_NS, 'div', $parent->{text}, 'data'],
dl => [HTML_NS, 'dl', $parent->{text}, 'data'],
dt => [HTML_NS, 'dt', $parent->{text}, 'data'],
em => [HTML_NS, 'em', $parent->{text}, 'data'],
form => [HTML_NS, 'form', $parent->{text}, 'data'],
h1 => [HTML_NS, 'h1', $parent->{text}, 'data'],
h2 => [HTML_NS, 'h2', $parent->{text}, 'data'],
h3 => [HTML_NS, 'h3', $parent->{text}, 'data'],
h4 => [HTML_NS, 'h4', $parent->{text}, 'data'],
i => [HTML_NS, 'i', $parent->{text}, 'data'],
img => [HTML_NS, 'img', $parent->{text}, 'data'],
input => [HTML_NS, 'input', $parent->{text}, 'data'],
ins => [HTML_NS, 'ins', $parent->{text}, 'data'],
kbd => [HTML_NS, 'kbd', $parent->{text}, 'data'],
label => [HTML_NS, 'label', $parent->{text}, 'data'],
li => [HTML_NS, 'li', $parent->{text}, 'data'],
ol => [HTML_NS, 'ol', $parent->{text}, 'data'],
p => [HTML_NS, 'p', $parent->{text}, 'data'],
pre => [HTML_NS, 'pre', $parent->{text}, 'data'],
q => [HTML_NS, 'q', $parent->{text}, 'data'],
rb => [HTML_NS, 'rb', $parent->{text}, 'data'],
rp => [HTML_NS, 'rp', $parent->{text}, 'data'],
rt => [HTML_NS, 'rt', $parent->{text}, 'data'],
ruby => [HTML_NS, 'ruby', $parent->{text}, 'data'],
samp => [HTML_NS, 'samp', $parent->{text}, 'data'],
span => [HTML_NS, 'span', $parent->{text}, 'data'],
strong => [HTML_NS, 'strong', $parent->{text}, 'data'],
sub => [HTML_NS, 'sub', $parent->{text}, 'data'],
sup => [HTML_NS, 'sup', $parent->{text}, 'data'],
table => [HTML_NS, 'table', $parent->{text}, 'data'],
tbody => [HTML_NS, 'tbody', $parent->{text}, 'data'],
td => [HTML_NS, 'td', $parent->{text}, 'data'],
th => [HTML_NS, 'th', $parent->{text}, 'data'],
thead => [HTML_NS, 'thead', $parent->{text}, 'data'],
tr => [HTML_NS, 'tr', $parent->{text}, 'data'],
ul => [HTML_NS, 'ul', $parent->{text}, 'data'],
var => [HTML_NS, 'var', $parent->{text}, 'data'],
}->{$token->{value}}
|| [H2H_NS, $token->{value}, $parent->{text}, 'data'];
while ($info->[4]->{$info->[2]->get_user_data ('command-name')}) {
$info->[2] = $info->[2]->parent_node;
}
my $el = $self->{doc}->create_element_ns ($info->[0], $info->[1]);
$el->set_user_data ('command-name', $token->{value});
$info->[2]->append_child ($el);
$parent->{text} = $el;
$parent->{attr} = $el;
$parent->{subsection} = $el if $token->{value} eq 'SECTION';
$state = $info->[3];
} elsif ($token->{type} eq 'end') {
E: while (my $et = $parent->{text}->get_user_data ('command-name')) {
$parent->{text} = $parent->{text}->parent_node;
last E if $et eq $token->{value};
last E if $et eq '#body';
}
$parent->{attr} = $parent->{text};
$state = {
UL => 'list',
OL => 'list',
}->{$parent->{text}->get_user_data ('command-name')}
|| $state eq 'br' ? 'br' : 'data';
} elsif ($token->{type} eq 'char') {
my $el = $self->{doc}->create_element_ns (H2H_NS, 'char');
$el->manakai_append_text ($token->{value});
$parent->{text}->append_child ($el);
$state = 'data' if $state eq 'br';
} elsif ($token->{type} eq 'magic') {
my ($name, $version) = split m#/#, $token->{value}, 2;
$doc_el->set_attribute_ns (SW09_NS, 'sw9:Name', $name);
$doc_el->set_attribute_ns (SW09_NS, 'sw9:Version', $version);
} elsif ($token->{type} eq 'hnf-comment') {
my $com = $self->{doc}->create_element_ns
(H2H_NS, 'hnf-comment');
$com->text_content ($token->{value});
$parent->{text}->append_child ($com);
} elsif ($token->{type} eq 'html-comment') {
my $com = $self->{doc}->create_element_ns
(H2H_NS, 'html-comment');
$com->text_content ($token->{value});
$parent->{text}->append_child ($com);
} elsif ($token->{type} eq 'source') {
my $src = $self->{doc}->create_element_ns (HTML3_NS, 'credit');
$src->manakai_append_text ($token->{value});
$parent->{text}->append_child ($src);
} elsif ($token->{type} eq 'uri') {
my $v = $token->{value};
if ($v =~ /^\(([^()]+)\)$/) {
my @v = split /\s*,\s*/, $1;
$parent->{attr}->set_attribute_ns
(H2H_NS, 'href-year' => $v[0]+0);
$parent->{attr}->set_attribute_ns
(H2H_NS, 'href-month' => $v[1]+0);
$parent->{attr}->set_attribute_ns
(H2H_NS, 'href-day' => $v[2]+0);
$parent->{attr}->set_attribute_ns
(H2H_NS, 'href-section' => $v[3]+0) if $v[3];
$parent->{attr}->set_attribute_ns
(H2H_NS, 'href-subsection' => $v[4]+0)
if $v[4];
} else {
my $xuri = $parent->{attr}->manakai_expanded_uri;
if ($xuri eq HTML_NS . 'a') {
$parent->{attr}->set_attribute_ns
(undef, href => $token->{value});
} elsif ($xuri eq HTML_NS . 'blockquote') {
$parent->{attr}->set_attribute_ns
(undef, cite => $token->{value});
} else {
$parent->{attr}->set_attribute_ns
(XHTML2_NS, href => $token->{value});
}
}
} elsif ({
title => 1, style => 1,
class => 1, href => 1, 'accept-charset' => 1,
action => 1, method => 1, alt => 1, src => 1,
type => 1, value => 1, name => 1, accesskey => 1,
for => 1, cite => 1, onclick => 1, colspan => 1,
scope => 1, summary => 1,
}->{$token->{type}}) {
$parent->{attr}->set_attribute_ns
(undef, $token->{type}, $token->{value});
} elsif ($token->{type} eq 'cat') {
for (split /\s*,\s*/, $token->{value}) {
my $el = $self->{doc}->create_element_ns (H2H_NS, 'cat');
$el->manakai_append_text ($_);
$parent->{attr}->append_child ($el);
}
} elsif ($token->{type} eq 'lang' or $token->{type} eq 'xml:lang') {
$parent->{attr}->set_attribute_ns
(q,
'xml:lang' => $token->{value});
} elsif ($token->{type} eq 'id') {
$parent->{attr}->set_attribute_ns
(q,
'xml:id' => $token->{value});
} elsif ($token->{type} eq 'wbradded') {
# ignore
} else {
# key, level, place, position, time
$parent->{attr}->set_attribute_ns
(H2H_NS, $token->{type}, $token->{value});
}
}
} # _construct_tree
1;
## $Date: 2008/08/17 05:09:12 $