--- test/html-webhacc/cc.cgi 2007/11/11 06:57:16 1.24
+++ test/html-webhacc/cc.cgi 2008/02/10 02:05:30 1.31
@@ -52,7 +52,6 @@
$| = 0;
my $input = get_input_document ($http, $dom);
- my $inner_html_element = $http->get_parameter ('e');
my $char_length = 0;
my %time;
@@ -62,7 +61,12 @@
Request URI
<@{[htescape $input->{request_uri}]}>
Document URI
- <@{[htescape $input->{uri}]}>
+ <@{[htescape $input->{uri}]}>
+
]; # no yet
push @nav, ['#document-info' => 'Information'];
@@ -74,7 +78,7 @@
<@{[htescape $input->{base_uri}]}>
Internet Media Type
@{[htescape $input->{media_type}]}
- @{[$input->{media_type_overridden} ? '(overridden)' : '']}
+ @{[$input->{media_type_overridden} ? '(overridden)' : defined $input->{official_type} ? $input->{media_type} eq $input->{official_type} ? '' : '(sniffed; official type is: '.htescape ($input->{official_type}).'
)' : '(sniffed)']}
Character Encoding
@{[defined $input->{charset} ? ''.htescape ($input->{charset}).'
' : '(none)']}
@{[$input->{charset_overridden} ? '(overridden)' : '']}
@@ -85,46 +89,7 @@
];
my $result = {conforming_min => 1, conforming_max => 1};
- print_http_header_section ($input, $result);
-
- my $doc;
- my $el;
- my $manifest;
-
- if ($input->{media_type} eq 'text/html') {
- ($doc, $el) = print_syntax_error_html_section ($input, $result);
- print_source_string_section (\($input->{s}), $input->{charset});
- } elsif ({
- 'text/xml' => 1,
- 'application/atom+xml' => 1,
- 'application/rss+xml' => 1,
- 'application/svg+xml' => 1,
- 'application/xhtml+xml' => 1,
- 'application/xml' => 1,
- }->{$input->{media_type}}) {
- ($doc, $el) = print_syntax_error_xml_section ($input, $result);
- print_source_string_section (\($input->{s}), $doc->input_encoding);
- } elsif ($input->{media_type} eq 'text/cache-manifest') {
-## TODO: MUST be text/cache-manifest
- $manifest = print_syntax_error_manifest_section ($input, $result);
- print_source_string_section (\($input->{s}), 'utf-8');
- } else {
- ## TODO: Change HTTP status code??
- print_result_unknown_type_section ($input, $result);
- }
-
- if (defined $doc or defined $el) {
- print_structure_dump_dom_section ($doc, $el);
- my $elements = print_structure_error_dom_section ($doc, $el, $result);
- print_table_section ($elements->{table}) if @{$elements->{table}};
- print_id_section ($elements->{id}) if keys %{$elements->{id}};
- print_term_section ($elements->{term}) if keys %{$elements->{term}};
- print_class_section ($elements->{class}) if keys %{$elements->{class}};
- } elsif (defined $manifest) {
- print_structure_dump_manifest_section ($manifest);
- print_structure_error_manifest_section ($manifest, $result);
- }
-
+ check_and_print ($input => $result);
print_result_section ($result);
} else {
print STDOUT qq[];
@@ -161,7 +126,7 @@
$result->{conforming_min} = 0;
} elsif ($err->{level} eq 'w' or $err->{level} eq 'g') {
$result->{$layer}->{warning}++;
- } elsif ($err->{level} eq 'unsupported') {
+ } elsif ($err->{level} eq 'u' or $err->{level} eq 'unsupported') {
$result->{$layer}->{unsupported}++;
$result->{unsupported} = 1;
} else {
@@ -180,6 +145,51 @@
}
} # add_error
+sub check_and_print ($$) {
+ my ($input, $result) = @_;
+
+ print_http_header_section ($input, $result);
+
+ my $doc;
+ my $el;
+ my $manifest;
+
+ if ($input->{media_type} eq 'text/html') {
+ ($doc, $el) = print_syntax_error_html_section ($input, $result);
+ print_source_string_section
+ (\($input->{s}), $input->{charset} || $doc->input_encoding);
+ } elsif ({
+ 'text/xml' => 1,
+ 'application/atom+xml' => 1,
+ 'application/rss+xml' => 1,
+ 'application/svg+xml' => 1,
+ 'application/xhtml+xml' => 1,
+ 'application/xml' => 1,
+ }->{$input->{media_type}}) {
+ ($doc, $el) = print_syntax_error_xml_section ($input, $result);
+ print_source_string_section (\($input->{s}), $doc->input_encoding);
+ } elsif ($input->{media_type} eq 'text/cache-manifest') {
+## TODO: MUST be text/cache-manifest
+ $manifest = print_syntax_error_manifest_section ($input, $result);
+ print_source_string_section (\($input->{s}), 'utf-8');
+ } else {
+ ## TODO: Change HTTP status code??
+ print_result_unknown_type_section ($input, $result);
+ }
+
+ if (defined $doc or defined $el) {
+ print_structure_dump_dom_section ($doc, $el);
+ my $elements = print_structure_error_dom_section ($doc, $el, $result);
+ print_table_section ($elements->{table}) if @{$elements->{table}};
+ print_id_section ($elements->{id}) if keys %{$elements->{id}};
+ print_term_section ($elements->{term}) if keys %{$elements->{term}};
+ print_class_section ($elements->{class}) if keys %{$elements->{class}};
+ } elsif (defined $manifest) {
+ print_structure_dump_manifest_section ($manifest);
+ print_structure_error_manifest_section ($manifest, $result);
+ }
+} # check_and_print
+
sub print_http_header_section ($$) {
my ($input, $result) = @_;
return unless defined $input->{header_status_code} or
@@ -247,8 +257,9 @@
my $doc = $dom->create_document;
my $el;
+ my $inner_html_element = $http->get_parameter ('e');
if (defined $inner_html_element and length $inner_html_element) {
- $input->{charset} ||= 'ISO-8859-1'; ## TODO: for now.
+ $input->{charset} ||= 'windows-1252'; ## TODO: for now.
my $time1 = time;
my $t = Encode::decode ($input->{charset}, $input->{s});
$time{decode} = time - $time1;
@@ -264,6 +275,8 @@
($input->{charset}, $input->{s} => $doc, $onerror);
$time{parse_html} = time - $time1;
}
+ $doc->manakai_charset ($input->{official_charset})
+ if defined $input->{official_charset};
print STDOUT qq[];
@@ -304,6 +317,8 @@
my $doc = Message::DOM::XMLParserTemp->parse_byte_stream
($fh => $dom, $onerror, charset => $input->{charset});
$time{parse_xml} = time - $time1;
+ $doc->manakai_charset ($input->{official_charset})
+ if defined $input->{official_charset};
print STDOUT qq[];
@@ -393,7 +408,7 @@
$r .= '';
for my $attr (sort {$a->[0] cmp $b->[0]} map { [$_->name, $_->value, $_->namespace_uri, 'node-'.refaddr $_] }
@{$child->attributes}) {
- $r .= qq[] . htescape ($attr->[0]) . '
= '; ## ISSUE: case?
+ $r .= qq[] . htescape ($attr->[0]) . '
= '; ## ISSUE: case?
$r .= '' . htescape ($attr->[1]) . '
'; ## TODO: children
}
$r .= '
';
@@ -414,6 +429,21 @@
} elsif ($nt == $child->DOCUMENT_NODE) {
$r .= qq'Document';
$r .= qq[];
+ my $cp = $child->manakai_charset;
+ if (defined $cp) {
+ $r .= qq[charset
parameter = ];
+ $r .= htescape ($cp) . qq[
];
+ }
+ $r .= qq[inputEncoding
= ];
+ my $ie = $child->input_encoding;
+ if (defined $ie) {
+ $r .= qq[@{[htescape ($ie)]}
];
+ if ($child->manakai_has_bom) {
+ $r .= qq[ (with BOM
)];
+ }
+ } else {
+ $r .= qq[(null
)];
+ }
$r .= qq[- @{[scalar get_text ('manakaiIsHTML:'.($child->manakai_is_html?1:0))]}
];
$r .= qq[- @{[scalar get_text ('manakaiCompatMode:'.$child->manakai_compat_mode)]}
];
unless ($child->manakai_is_html) {
@@ -781,9 +811,9 @@
];
push @nav, ['#parse-errors' => 'Errors'];
- add_error (char => {level => 'unsupported'} => $result);
- add_error (syntax => {level => 'unsupported'} => $result);
- add_error (structure => {level => 'unsupported'} => $result);
+ add_error (char => {level => 'u'} => $result);
+ add_error (syntax => {level => 'u'} => $result);
+ add_error (structure => {level => 'u'} => $result);
} # print_result_unknown_type_section
sub print_result_input_error_section ($) {
@@ -840,7 +870,7 @@
} elsif ($err->{level} eq 'w') {
$r = qq[Warning:
];
- } elsif ($err->{level} eq 'unsupported') {
+ } elsif ($err->{level} eq 'u' or $err->{level} eq 'unsupported') {
$r = qq[Not
supported: ];
} else {
@@ -889,7 +919,8 @@
sub load_text_catalog ($) {
my $lang = shift; # MUST be a canonical lang name
- open my $file, '<', "cc-msg.$lang.txt" or die "$0: cc-msg.$lang.txt: $!";
+ open my $file, '<:utf8', "cc-msg.$lang.txt"
+ or die "$0: cc-msg.$lang.txt: $!";
while (<$file>) {
if (s/^([^;]+);([^;]*);//) {
my ($type, $cls, $msg) = ($1, $2, $_);
@@ -902,6 +933,7 @@
sub get_text ($) {
my ($type, $level, $node) = @_;
$type = $level . ':' . $type if defined $level;
+ $level = 'm' unless defined $level;
my @arg;
{
if (defined $Msg->{$type}) {
@@ -926,13 +958,13 @@
? htescape ($node->owner_element->manakai_local_name)
: ''
}ge;
- return ($type, $Msg->{$type}->[0], $msg);
+ return ($type, 'level-' . $level . ' ' . $Msg->{$type}->[0], $msg);
} elsif ($type =~ s/:([^:]*)$//) {
unshift @arg, $1;
redo;
}
}
- return ($type, '', htescape ($_[0]));
+ return ($type, 'level-'.$level, htescape ($_[0]));
} # get_text
}
@@ -988,6 +1020,7 @@
$ua->protocols_allowed ([qw/http/]);
$ua->max_size (1000_000);
my $req = HTTP::Request->new (GET => $request_uri);
+ $req->header ('Accept-Encoding' => 'identity, *; q=0');
my $res = $ua->request ($req);
## TODO: 401 sets |is_success| true.
if ($res->is_success or $http->get_parameter ('error-page')) {
@@ -997,12 +1030,10 @@
## TODO: More strict parsing...
my $ct = $res->header ('Content-Type');
- if (defined $ct and $ct =~ m#^([0-9A-Za-z._+-]+/[0-9A-Za-z._+-]+)#) {
- $r->{media_type} = lc $1;
- }
if (defined $ct and $ct =~ /;\s*charset\s*=\s*"?([^\s;"]+)"?/i) {
$r->{charset} = lc $1;
$r->{charset} =~ tr/\\//d;
+ $r->{official_charset} = $r->{charset};
}
my $input_charset = $http->get_parameter ('charset');
@@ -1010,9 +1041,22 @@
$r->{charset_overridden}
= (not defined $r->{charset} or $r->{charset} ne $input_charset);
$r->{charset} = $input_charset;
- }
+ }
+
+ ## TODO: Support for HTTP Content-Encoding
$r->{s} = ''.$res->content;
+
+ require Whatpm::ContentType;
+ ($r->{official_type}, $r->{media_type})
+ = Whatpm::ContentType->get_sniffed_type
+ (get_file_head => sub {
+ return substr $r->{s}, 0, shift;
+ },
+ http_content_type_byte => $ct,
+ has_http_content_encoding =>
+ defined $res->header ('Content-Encoding'),
+ supported_image_types => {});
} else {
$r->{uri} = $res->request->uri;
$r->{request_uri} = $request_uri;
@@ -1033,7 +1077,18 @@
$r->{charset} = ''.$http->get_parameter ('_charset_');
$r->{charset} =~ s/\s+//g;
$r->{charset} = 'utf-8' if $r->{charset} eq '';
+ $r->{official_charset} = $r->{charset};
$r->{header_field} = [];
+
+ require Whatpm::ContentType;
+ ($r->{official_type}, $r->{media_type})
+ = Whatpm::ContentType->get_sniffed_type
+ (get_file_head => sub {
+ return substr $r->{s}, 0, shift;
+ },
+ http_content_type_byte => undef,
+ has_http_content_encoding => 0,
+ supported_image_types => {});
}
my $input_format = $http->get_parameter ('i');
@@ -1050,6 +1105,7 @@
if ($r->{media_type} eq 'text/xml') {
unless (defined $r->{charset}) {
$r->{charset} = 'us-ascii';
+ $r->{official_charset} = $r->{charset};
} elsif ($r->{charset_overridden} and $r->{charset} eq 'us-ascii') {
$r->{charset_overridden} = 0;
}
@@ -1100,4 +1156,4 @@
=cut
-## $Date: 2007/11/11 06:57:16 $
+## $Date: 2008/02/10 02:05:30 $