/[suikacvs]/markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (hide annotations) (download)
Sun Sep 14 11:57:42 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.5: +70 -39 lines
++ whatpm/Whatpm/ChangeLog	14 Sep 2008 11:56:24 -0000
	* HTML.pm.src: Use |read| instead of |getc|.  |set_inner_html|
	would report character error from now.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/Charset/ChangeLog	14 Sep 2008 11:57:38 -0000
	* DecodeHandle.pm (CharString onerror): New method.

	* UnicodeString.pm (read): New.
	(getc): Removed.
	(manakai_read_until): Checking operation implemented.

2008-09-14  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::Charset::UnicodeChecker;
2     use strict;
3    
4     ## NOTE: For more information (including rationals of checks performed
5     ## in this module), see
6     ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7    
8     ## NOTE: Unicode's definition for character string conformance is
9     ## very, very vague so that it is difficult to determine what error
10     ## level is appropriate for each error. The Unicode Standard abuses
11     ## conformance-creteria-like terms such as "deprecated", "discouraged",
12     ## "preferred", "better", "not encouraged", "should", and so on with no
13     ## clear explanation of their difference (if any) or relationship to
14     ## the conformance. In fact, that specification does not define the
15     ## conformance class for character strings.
16    
17     sub new_handle ($$) {
18     my $self = bless {
19     queue => [],
20 wakaba 1.2 new_queue => [],
21 wakaba 1.1 onerror => sub {},
22     level => {
23     unicode_should => 'w',
24     unicode_deprecated => 'w', # = unicode_should
25     unicode_discouraged => 'w',
26     unicode_preferred => 'w',
27     ## NOTE: We do some "unification" of levels - for example,
28     ## "not encouraged" is unified with "discouraged" and
29     ## "better" is unified with "preferred".
30     },
31     }, shift;
32     $self->{handle} = shift; # char stream
33     return $self;
34     } # new_handle
35    
36 wakaba 1.6 my $check_char = sub ($$) {
37     my ($self, $char_code) = @_;
38 wakaba 1.1
39 wakaba 1.6 if ($char_code == 0x000D) {
40     $self->{line_diff}++;
41     $self->{column_diff} = 0;
42     $self->{set_column} = 1;
43     $self->{has_cr} = 1;
44     return;
45     } elsif ($char_code == 0x000A) {
46     if ($self->{has_cr}) {
47     delete $self->{has_cr};
48     } else {
49     $self->{line_diff}++;
50     $self->{column_diff} = 0;
51     $self->{set_column} = 1;
52     }
53     return;
54     } else {
55     $self->{column_diff}++;
56     delete $self->{has_cr};
57     }
58    
59 wakaba 1.1 if ({
60 wakaba 1.6 0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
61     0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
62     0x206E => 1, 0x206F => 1, 0xE0001 => 1,
63     }->{$char_code} or
64     (0xE0020 <= $char_code and $char_code <= 0xE007F)) {
65 wakaba 1.1 ## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated).
66     $self->{onerror}->(type => 'unicode deprecated',
67     text => (sprintf 'U+%04X', $char_code),
68     layer => 'charset',
69 wakaba 1.6 level => $self->{level}->{unicode_deprecated},
70     line_diff => $self->{line_diff},
71     column_diff => $self->{column_diff},
72     ($self->{set_column} ? (column => 1) : ()));
73 wakaba 1.1 } elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
74     {
75     0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
76     0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
77     0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
78     0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
79     0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
80     0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
81     0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
82     0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
83     0x10FFFE => 1, 0x10FFFF => 1,
84     }->{$char_code}) {
85     ## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point).
86     $self->{onerror}->(type => 'nonchar',
87     text => (sprintf 'U+%04X', $char_code),
88     layer => 'charset',
89 wakaba 1.6 level => $self->{level}->{unicode_should},
90     line_diff => $self->{line_diff},
91     column_diff => $self->{column_diff},
92     ($self->{set_column} ? (column => 1) : ()));
93 wakaba 1.1 } elsif ({
94     0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
95     0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
96     0x20A4 => 1, # LIRA SIGN
97    
98     0x2126 => 1, # OHM SIGN # also, discouraged
99     0x212A => 1, # KELVIN SIGN
100     0x212B => 1, # ANGSTROM SIGN
101     }->{$char_code} or
102     (0xFB50 <= $char_code and $char_code <= 0xFDFB) or
103     (0xFE70 <= $char_code and $char_code <= 0xFEFE) or
104     (0xFA30 <= $char_code and $char_code <= 0xFA6A) or
105     (0xFA70 <= $char_code and $char_code <= 0xFAD9) or
106     (0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
107     (0x239B <= $char_code and $char_code <= 0x23B3)) {
108     ## NOTE: This case must come AFTER noncharacter checking, due to
109     ## their range overwrap.
110     if ({
111     ## In the Arabic Presentation Forms-A block, but no character is
112     ## assigned in Unicode 5.1.
113     0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
114     0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
115     0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
116     0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
117     0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
118     0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
119     0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
120     0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
121     0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
122     0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
123     0xFD4F => 1,
124     0xFD90 => 1, 0xFD91 => 1,
125     0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
126     0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
127     # 0xFDD0-0xFDEF noncharacters
128    
129     ## In Arabic Presentation Forms-A block, but explicitly
130     ## allowed
131     0xFD3E => 1, 0xFD3F => 1,
132    
133     ## In Arabic Presentation Forms-B block, unassigned
134     0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
135     }->{$char_code}) {
136     #
137     } else {
138     $self->{onerror}->(type => 'unicode should',
139     text => (sprintf 'U+%04X', $char_code),
140     layer => 'charset',
141 wakaba 1.6 level => $self->{level}->{unicode_should},
142     line_diff => $self->{line_diff},
143     column_diff => $self->{column_diff},
144     ($self->{set_column} ? (column => 1) : ()));
145 wakaba 1.1 }
146     } elsif ({
147     ## Styled overlines/underlines in CJK Compatibility Forms
148     0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
149     0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
150    
151     0x037E => 1, 0x0387 => 1, # greek punctuations
152    
153     #0x17A3 => 1, # also, deprecated
154     0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,
155    
156     0x2121 => 1, # tel
157     0x213B => 1, # fax
158     #0x2120 => 1, # SM (superscript)
159     #0x2122 => 1, # TM (superscript)
160    
161     0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
162     }->{$char_code} or
163     (0x2153 <= $char_code and $char_code <= 0x217F)) {
164     $self->{onerror}->(type => 'unicode discouraged',
165     text => (sprintf 'U+%04X', $char_code),
166     layer => 'charset',
167 wakaba 1.6 level => $self->{level}->{unicode_discouraged},
168     line_diff => $self->{line_diff},
169     column_diff => $self->{column_diff},
170     ($self->{set_column} ? (column => 1) : ()));
171 wakaba 1.1 } elsif ({
172     0x055A => 1, 0x0559 =>1, # greek punctuations
173    
174     0x2103 => 1, 0x2109 => 1, # degree signs
175    
176     0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
177     }->{$char_code}) {
178     $self->{onerror}->(type => 'unicode not preferred',
179     text => (sprintf 'U+%04X', $char_code),
180     layer => 'charset',
181 wakaba 1.6 level => $self->{level}->{unicode_preferred},
182     line_diff => $self->{line_diff},
183     column_diff => $self->{column_diff},
184     ($self->{set_column} ? (column => 1) : ()));
185 wakaba 1.1 }
186    
187     ## TODO: "khanda ta" should be represented by U+09CE
188     ## rather than <U+09A4, U+09CD, U+200D>.
189    
190     ## TODO: IDS syntax
191    
192     ## TODO: langtag syntax
193 wakaba 1.6 }; # $check_char
194 wakaba 1.1
195 wakaba 1.6 sub read ($$$;$) {
196     my $self = shift;
197     my $offset = $_[2] || 0;
198     my $count = $self->{handle}->read (@_);
199     $self->{line_diff} = 0;
200     $self->{column_diff} = -1;
201     delete $self->{set_column};
202     delete $self->{has_cr};
203     for ($offset .. ($offset + $count - 1)) {
204     $check_char->($self, ord substr $_[0], $_, 1);
205     }
206     return $count;
207     } # read
208 wakaba 1.4
209 wakaba 1.5 sub manakai_read_until ($$$;$) {
210     #my ($self, $scalar, $pattern, $offset) = @_;
211     my $self = shift;
212 wakaba 1.6 my $offset = $_[2] || 0;
213     my $count = $self->{handle}->manakai_read_until (@_);
214     $self->{line_diff} = 0;
215     $self->{column_diff} = -1;
216     delete $self->{set_column};
217     delete $self->{has_cr};
218     for ($offset .. ($offset + $count - 1)) {
219     $check_char->($self, ord substr $_[0], $_, 1);
220 wakaba 1.4 }
221 wakaba 1.6 return $count;
222 wakaba 1.5 } # manakai_read_until
223 wakaba 1.1
224     sub ungetc ($$) {
225     unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
226     } # ungetc
227    
228     sub close ($) {
229     shift->{handle}->close;
230     } # close
231    
232     sub charset ($) {
233     shift->{handle}->charset;
234     } # charset
235    
236     sub has_bom ($) {
237     shift->{handle}->has_bom;
238     } # has_bom
239    
240     sub input_encoding ($) {
241     shift->{handle}->input_encoding;
242     } # input_encoding
243    
244     sub onerror ($;$) {
245     if (@_ > 1) {
246     if (defined $_[1]) {
247     $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
248     } else {
249     $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
250     }
251     }
252    
253     return $_[0]->{onerror};
254     } # onerror
255    
256     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24