1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use lib qw[/home/wakaba/work/manakai/lib]; |
4 |
|
5 |
require Message::DOM::DOMCore; |
6 |
require Message::DOM::XMLParser; |
7 |
|
8 |
my $dom = $Message::DOM::DOMImplementationRegistry->get_dom_implementation; |
9 |
my $parser = $dom->create_ls_parser (1); |
10 |
my $Error; |
11 |
|
12 |
our $target = shift; |
13 |
our $code = sub { |
14 |
my ($entity, $file_name) = @_; |
15 |
|
16 |
if ($entity->{body} =~ /^((?:(?!<html).)*?<!DOCTYPE[^>]*>)/is) { |
17 |
my $dt = $1; |
18 |
if ($dt =~ /xhtml/i or $dt =~ /<\?xml/i) { |
19 |
warn $file_name, "\n"; |
20 |
parse_xml ($entity, $file_name); |
21 |
} |
22 |
} |
23 |
}; |
24 |
|
25 |
$SIG{INT} = \&result; |
26 |
|
27 |
require 'foreach.pl'; |
28 |
|
29 |
result (); |
30 |
sub result { |
31 |
delete $SIG{INT}; |
32 |
use Data::Dumper; |
33 |
$Data::Dumper::Sortkeys = 1; |
34 |
print Dumper $Error; |
35 |
exit; |
36 |
} |
37 |
|
38 |
sub parse_xml ($$) { |
39 |
my ($entity, $file_name) = @_; |
40 |
my $http_charset; |
41 |
if ($entity->{field}->{'content-type'}) { |
42 |
if ($entity->{field}->{'content-type'}->[0] =~ /charset\s*=\s*([^\s;]+)/i) { |
43 |
my $charset = $1; |
44 |
if ($charset =~ /euc/i) { |
45 |
$http_charset = 'euc-jp'; |
46 |
} elsif ($charset =~ /shift/i or $charset =~ /sjis/i or |
47 |
$charset =~ /ms932/i or $charset =~ /cp943/i or |
48 |
$charset =~ /windows-31j/i) { |
49 |
$http_charset = 'shift_jis'; |
50 |
} elsif ($charset =~ /none/i or $charset =~ /unknown/i or |
51 |
$charset =~ /jp/i or $charset eq '0') { |
52 |
$http_charset = 'shift_jis' if $target =~ /keitai/i; |
53 |
} else { |
54 |
$http_charset = $charset; |
55 |
} |
56 |
} else { |
57 |
$http_charset = 'shift_jis' if $target =~ /keitai/; |
58 |
} |
59 |
} |
60 |
my $has_error; |
61 |
$Error->{$file_name} = [[$http_charset]]; |
62 |
open my $s, '<', \($entity->{body}); |
63 |
$parser->dom_config->set_parameter ('error-handler' => sub { |
64 |
my ($self, $err) = @_; |
65 |
my $loc = $err->location; |
66 |
push @{$Error->{$file_name}}, |
67 |
[$loc->line_number, $loc->column_number, $err->type, |
68 |
$err->{'http://suika.fam.cx/~wakaba/archive/2004/dom/xml-parser#byte-sequence'}]; |
69 |
#warn join "\t", $file_name, $err->stringify; |
70 |
$has_error = 1; |
71 |
return 1; |
72 |
}); |
73 |
my $doc = $parser->parse |
74 |
({byte_stream => $s, encoding => $http_charset}); |
75 |
$Error->{$has_error ? '#error' : '#errorless'}++; |
76 |
} # parse_xml |
77 |
|
78 |
=head1 AUTHOR |
79 |
|
80 |
Wakaba <w@suika.fam.cx>. |
81 |
|
82 |
=head1 LICENSE |
83 |
|
84 |
Copyright 2007 Wakaba <w@suika.fam.cx> |
85 |
|
86 |
This library is free software; you can redistribute it |
87 |
and/or modify it under the same terms as Perl itself. |
88 |
|
89 |
=cut |
90 |
|
91 |
1; |
92 |
## $Date: 2007/06/03 13:52:14 $ |
93 |
|