/[suikacvs]/markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm
Suika

Contents of /markup/html/whatpm/Whatpm/Charset/UnicodeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (hide annotations) (download)
Mon Sep 15 00:49:09 2008 UTC (16 years, 10 months ago) by wakaba
Branch: MAIN
Changes since 1.6: +124 -121 lines
++ whatpm/Whatpm/Charset/ChangeLog	15 Sep 2008 00:48:59 -0000
2008-09-15  Wakaba  <wakaba@suika.fam.cx>

	* UnicodeChecker.pm: Use hash for better performance.

1 wakaba 1.1 package Whatpm::Charset::UnicodeChecker;
2     use strict;
3    
4     ## NOTE: For more information (including rationals of checks performed
5     ## in this module), see
6     ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7    
8     ## NOTE: Unicode's definition for character string conformance is
9     ## very, very vague so that it is difficult to determine what error
10     ## level is appropriate for each error. The Unicode Standard abuses
11     ## conformance-creteria-like terms such as "deprecated", "discouraged",
12     ## "preferred", "better", "not encouraged", "should", and so on with no
13     ## clear explanation of their difference (if any) or relationship to
14     ## the conformance. In fact, that specification does not define the
15     ## conformance class for character strings.
16    
17     sub new_handle ($$) {
18     my $self = bless {
19     queue => [],
20 wakaba 1.2 new_queue => [],
21 wakaba 1.1 onerror => sub {},
22     level => {
23     unicode_should => 'w',
24     unicode_deprecated => 'w', # = unicode_should
25     unicode_discouraged => 'w',
26     unicode_preferred => 'w',
27     ## NOTE: We do some "unification" of levels - for example,
28     ## "not encouraged" is unified with "discouraged" and
29     ## "better" is unified with "preferred".
30     },
31     }, shift;
32     $self->{handle} = shift; # char stream
33     return $self;
34     } # new_handle
35    
36 wakaba 1.7 my $etypes = {
37     0x0340 => 'unicode deprecated',
38     0x0341 => 'unicode deprecated',
39     0x17A3 => 'unicode deprecated',
40     0x17D3 => 'unicode deprecated',
41     0x206A => 'unicode deprecated',
42     0x206B => 'unicode deprecated',
43     0x206C => 'unicode deprecated',
44     0x206D => 'unicode deprecated',
45     0x206E => 'unicode deprecated',
46     0x206F => 'unicode deprecated',
47     0xE0001 => 'unicode deprecated',
48    
49     0xFFFE => 'nonchar',
50     0xFFFF => 'nonchar',
51     0x1FFFE => 'nonchar',
52     0x1FFFF => 'nonchar',
53     0x2FFFE => 'nonchar',
54     0x2FFFF => 'nonchar',
55     0x3FFFE => 'nonchar',
56     0x3FFFF => 'nonchar',
57     0x4FFFE => 'nonchar',
58     0x4FFFF => 'nonchar',
59     0x5FFFE => 'nonchar',
60     0x5FFFF => 'nonchar',
61     0x6FFFE => 'nonchar',
62     0x6FFFF => 'nonchar',
63     0x7FFFE => 'nonchar',
64     0x7FFFF => 'nonchar',
65     0x8FFFE => 'nonchar',
66     0x8FFFF => 'nonchar',
67     0x9FFFE => 'nonchar',
68     0x9FFFF => 'nonchar',
69     0xAFFFE => 'nonchar',
70     0xAFFFF => 'nonchar',
71     0xBFFFE => 'nonchar',
72     0xBFFFF => 'nonchar',
73     0xCFFFE => 'nonchar',
74     0xCFFFF => 'nonchar',
75     0xDFFFE => 'nonchar',
76     0xDFFFF => 'nonchar',
77     0xEFFFE => 'nonchar',
78     0xEFFFF => 'nonchar',
79     0xFFFFE => 'nonchar',
80     0xFFFFF => 'nonchar',
81     0x10FFFE => 'nonchar',
82     0x10FFFF => 'nonchar',
83    
84     0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
85     0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
86     0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
87     0x20A4 => 'unicode should', # LIRA SIGN
88    
89     0x2126 => 'unicode should', # OHM SIGN # also, discouraged
90     0x212A => 'unicode should', # KELVIN SIGN
91     0x212B => 'unicode should', # ANGSTROM SIGN
92    
93     ## Styled overlines/underlines in CJK Compatibility Forms
94     0xFE49 => 'unicode discouraged',
95     0xFE4A => 'unicode discouraged',
96     0xFE4B => 'unicode discouraged',
97     0xFE4C => 'unicode discouraged',
98     0xFE4D => 'unicode discouraged',
99     0xFE4E => 'unicode discouraged',
100     0xFE4F => 'unicode discouraged',
101    
102     0x037E => 'unicode discouraged', # greek punctuations
103     0x0387 => 'unicode discouraged', # greek punctuations
104    
105     #0x17A3 => 'unicode discouraged', # also, deprecated
106     0x17A4 => 'unicode discouraged',
107     0x17B4 => 'unicode discouraged',
108     0x17B5 => 'unicode discouraged',
109     0x17D8 => 'unicode discouraged',
110    
111     0x2121 => 'unicode discouraged', # tel
112     0x213B => 'unicode discouraged', # fax
113     #0x2120 => 'unicode discouraged', # SM (superscript)
114     #0x2122 => 'unicode discouraged', # TM (superscript)
115    
116     ## inline annotations
117     0xFFF9 => 'unicode discouraged',
118     0xFFFA => 'unicode discouraged',
119     0xFFFB => 'unicode discouraged',
120    
121     ## greek punctuations
122     0x055A => 'unicode not preferred',
123     0x0559 => 'unicode not preferred',
124    
125     ## degree signs
126     0x2103 => 'unicode not preferred',
127     0x2109 => 'unicode not preferred',
128    
129     ## strongly preferrs U+2060 WORD JOINTER
130     0xFEFE => 'unicode not preferred',
131     };
132    
133     $etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
134     $etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
135     $etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
136     $etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
137     $etypes->{$_} = 'unicode should'
138     for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
139     0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
140     0xFE76 .. 0xFEFC;
141     ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
142     ## no character is assigned, noncharacter code points, and
143     ## U+FD3E and U+FD3F, which are explicitly allowed.
144     $etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
145    
146     my $levels = {
147     'unicode deprecated' => 'unicode_deprecated',
148     'nonchar' => 'unicode_should',
149     'unicode should' => 'unicode_should',
150     'unicode discouraged' => 'unicode_discouraged',
151     'unicode not preferred' => 'unicode_preferred',
152     };
153    
154 wakaba 1.6 my $check_char = sub ($$) {
155     my ($self, $char_code) = @_;
156 wakaba 1.1
157 wakaba 1.6 if ($char_code == 0x000D) {
158     $self->{line_diff}++;
159     $self->{column_diff} = 0;
160     $self->{set_column} = 1;
161     $self->{has_cr} = 1;
162     return;
163     } elsif ($char_code == 0x000A) {
164     if ($self->{has_cr}) {
165     delete $self->{has_cr};
166     } else {
167     $self->{line_diff}++;
168     $self->{column_diff} = 0;
169     $self->{set_column} = 1;
170     }
171     return;
172     } else {
173     $self->{column_diff}++;
174     delete $self->{has_cr};
175     }
176 wakaba 1.7
177     ## TODO: $char_code > U+10FFFF
178 wakaba 1.6
179 wakaba 1.7 my $etype = $etypes->{$char_code};
180     if (defined $etype) {
181     $self->{onerror}->(type => $etype,
182 wakaba 1.1 text => (sprintf 'U+%04X', $char_code),
183     layer => 'charset',
184 wakaba 1.7 level => $self->{level}->{$levels->{$etype}},
185 wakaba 1.6 line_diff => $self->{line_diff},
186     column_diff => $self->{column_diff},
187     ($self->{set_column} ? (column => 1) : ()));
188 wakaba 1.1 }
189    
190     ## TODO: "khanda ta" should be represented by U+09CE
191     ## rather than <U+09A4, U+09CD, U+200D>.
192    
193     ## TODO: IDS syntax
194    
195     ## TODO: langtag syntax
196 wakaba 1.6 }; # $check_char
197 wakaba 1.1
198 wakaba 1.6 sub read ($$$;$) {
199     my $self = shift;
200     my $offset = $_[2] || 0;
201     my $count = $self->{handle}->read (@_);
202     $self->{line_diff} = 0;
203     $self->{column_diff} = -1;
204     delete $self->{set_column};
205     delete $self->{has_cr};
206     for ($offset .. ($offset + $count - 1)) {
207     $check_char->($self, ord substr $_[0], $_, 1);
208     }
209     return $count;
210     } # read
211 wakaba 1.4
212 wakaba 1.5 sub manakai_read_until ($$$;$) {
213     #my ($self, $scalar, $pattern, $offset) = @_;
214     my $self = shift;
215 wakaba 1.6 my $offset = $_[2] || 0;
216     my $count = $self->{handle}->manakai_read_until (@_);
217     $self->{line_diff} = 0;
218     $self->{column_diff} = -1;
219     delete $self->{set_column};
220     delete $self->{has_cr};
221     for ($offset .. ($offset + $count - 1)) {
222     $check_char->($self, ord substr $_[0], $_, 1);
223 wakaba 1.4 }
224 wakaba 1.6 return $count;
225 wakaba 1.5 } # manakai_read_until
226 wakaba 1.1
227     sub ungetc ($$) {
228     unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
229     } # ungetc
230    
231     sub close ($) {
232     shift->{handle}->close;
233     } # close
234    
235     sub charset ($) {
236     shift->{handle}->charset;
237     } # charset
238    
239     sub has_bom ($) {
240     shift->{handle}->has_bom;
241     } # has_bom
242    
243     sub input_encoding ($) {
244     shift->{handle}->input_encoding;
245     } # input_encoding
246    
247     sub onerror ($;$) {
248     if (@_ > 1) {
249     if (defined $_[1]) {
250     $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
251     } else {
252     $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
253     }
254     }
255    
256     return $_[0]->{onerror};
257     } # onerror
258    
259     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24