/[suikacvs]/webroot/www/ja1200/stat/xmlwf.pl
Suika

Contents of /webroot/www/ja1200/stat/xmlwf.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (show annotations) (download)
Sat Jul 21 05:26:48 2007 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +9 -3 lines
File MIME type: text/plain
Whatpm::HTML content model flag syntax has been changed

1 #!/usr/bin/perl
2 use strict;
3 use lib qw[/home/wakaba/work/manakai/lib];
4
5 require Message::DOM::DOMCore;
6 require Message::DOM::XMLParser;
7
8 my $dom = $Message::DOM::DOMImplementationRegistry->get_dom_implementation;
9 my $parser = $dom->create_ls_parser (1);
10 my $Error;
11
12 our $target = shift;
13 our $code = sub {
14 my ($entity, $file_name) = @_;
15
16 if ($entity->{body} =~ /^((?:(?!<html).)*?<!DOCTYPE[^>]*>)/is) {
17 my $dt = $1;
18 if ($dt =~ /xhtml/i or $dt =~ /<\?xml/i) {
19 warn $file_name, "\n";
20 parse_xml ($entity, $file_name);
21 }
22 }
23 };
24
25 $SIG{INT} = \&result;
26
27 require 'foreach.pl';
28
29 result ();
30 sub result {
31 delete $SIG{INT};
32 use Data::Dumper;
33 $Data::Dumper::Sortkeys = 1;
34 print Dumper $Error;
35 exit;
36 }
37
38 sub parse_xml ($$) {
39 my ($entity, $file_name) = @_;
40 my $http_charset;
41 if ($entity->{field}->{'content-type'}) {
42 if ($entity->{field}->{'content-type'}->[0] =~ /charset\s*=\s*([^\s;]+)/i) {
43 my $charset = $1;
44 if ($charset =~ /euc/i) {
45 $http_charset = 'euc-jp';
46 } elsif ($charset =~ /shift/i or $charset =~ /sjis/i or
47 $charset =~ /ms932/i or $charset =~ /cp943/i or
48 $charset =~ /windows-31j/i) {
49 $http_charset = 'shift_jis';
50 } elsif ($charset =~ /none/i or $charset =~ /unknown/i or
51 $charset =~ /jp/i or $charset eq '0') {
52 $http_charset = 'shift_jis' if $target =~ /keitai/i;
53 } else {
54 $http_charset = $charset;
55 }
56 } else {
57 $http_charset = 'shift_jis' if $target =~ /keitai/;
58 }
59 }
60 my $has_error;
61 $Error->{$file_name} = [[$http_charset]];
62 open my $s, '<', \($entity->{body});
63 $parser->dom_config->set_parameter ('error-handler' => sub {
64 my ($self, $err) = @_;
65 my $loc = $err->location;
66 push @{$Error->{$file_name}},
67 [$loc->line_number, $loc->column_number, $err->type,
68 $err->{'http://suika.fam.cx/~wakaba/archive/2004/dom/xml-parser#byte-sequence'}];
69 #warn join "\t", $file_name, $err->stringify;
70 $has_error = 1;
71 return 1;
72 });
73 my $doc = $parser->parse
74 ({byte_stream => $s, encoding => $http_charset});
75 $Error->{$has_error ? '#error' : '#errorless'}++;
76 } # parse_xml
77
78 =head1 AUTHOR
79
80 Wakaba <w@suika.fam.cx>.
81
82 =head1 LICENSE
83
84 Copyright 2007 Wakaba <w@suika.fam.cx>
85
86 This library is free software; you can redistribute it
87 and/or modify it under the same terms as Perl itself.
88
89 =cut
90
91 1;
92 ## $Date: 2007/06/03 13:52:14 $
93

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24