Whatpm/Charset/UnicodeChecker.pm

package Whatpm::Charset::UnicodeChecker;
use strict;

## NOTE: For more information (including rationals of checks performed
## in this module), see
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.

## NOTE: Unicode's definition for character string conformance is 
## very, very vague so that it is difficult to determine what error
## level is appropriate for each error.  The Unicode Standard abuses
## conformance-creteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

sub new_handle ($$) {
  my $self = bless {
    queue => [],
    new_queue => [],
    onerror => sub {},
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".
    },
  }, shift;
  $self->{handle} = shift; # char stream
  return $self;
} # new_handle

## TODO: We need to do some perf optimization

sub getc ($) {
  my $self = $_[0];
  return shift @{$self->{queue}} if @{$self->{queue}};

  my $char;
  unless (@{$self->{new_queue}}) {
    my $s = '';
    $self->{handle}->read ($s, 256) or return undef;
    push @{$self->{new_queue}}, split //, $s;
  }  
  $char = shift @{$self->{new_queue}};
  
  my $char_code = ord $char;

  if ({
    0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
    0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
    0x206E => 1, 0x206F => 1, 0xE0001 => 1,
  }->{$char_code} or
  (0xE0020 <= $char_code and $char_code <= 0xE007F)) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated).
    $self->{onerror}->(type => 'unicode deprecated',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_deprecated});
  } elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
           {
             0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
             0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
             0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
             0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
             0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
             0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
             0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
             0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
             0x10FFFE => 1, 0x10FFFF => 1,
           }->{$char_code}) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point).
    $self->{onerror}->(type => 'nonchar',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_should});
  } elsif ({
            0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
            0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
            0x20A4 => 1, # LIRA SIGN

            0x2126 => 1, # OHM SIGN # also, discouraged
            0x212A => 1, # KELVIN SIGN
            0x212B => 1, # ANGSTROM SIGN
           }->{$char_code} or
           (0xFB50 <= $char_code and $char_code <= 0xFDFB) or
           (0xFE70 <= $char_code and $char_code <= 0xFEFE) or
           (0xFA30 <= $char_code and $char_code <= 0xFA6A) or
           (0xFA70 <= $char_code and $char_code <= 0xFAD9) or
           (0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
           (0x239B <= $char_code and $char_code <= 0x23B3)) {
    ## NOTE: This case must come AFTER noncharacter checking, due to
    ## their range overwrap.
    if ({
         ## In the Arabic Presentation Forms-A block, but no character is
         ## assigned in Unicode 5.1.
         0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
         0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
         0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
         0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
         0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
         0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
         0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
         0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
         0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
         0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
         0xFD4F => 1,
         0xFD90 => 1, 0xFD91 => 1,
         0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
         0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
         # 0xFDD0-0xFDEF noncharacters

         ## In Arabic Presentation Forms-A block, but explicitly
         ## allowed
         0xFD3E => 1, 0xFD3F => 1,

         ## In Arabic Presentation Forms-B block, unassigned
         0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
        }->{$char_code}) {
      #
    } else {
      $self->{onerror}->(type => 'unicode should',
                         text => (sprintf 'U+%04X', $char_code),
                         layer => 'charset',
                         level => $self->{level}->{unicode_should});
    }
  } elsif ({
            ## Styled overlines/underlines in CJK Compatibility Forms
            0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
            0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
            
            0x037E => 1, 0x0387 => 1, # greek punctuations
            
            #0x17A3 => 1, # also, deprecated
            0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,

            0x2121 => 1, # tel
            0x213B => 1, # fax
            #0x2120 => 1, # SM (superscript)
            #0x2122 => 1, # TM (superscript)

            0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
           }->{$char_code} or
           (0x2153 <= $char_code and $char_code <= 0x217F)) {
    $self->{onerror}->(type => 'unicode discouraged',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_discouraged});
  } elsif ({
            0x055A => 1, 0x0559 =>1, # greek punctuations
            
            0x2103 => 1, 0x2109 => 1, # degree signs

            0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
           }->{$char_code}) {
    $self->{onerror}->(type => 'unicode not preferred',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_preferred});
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax

  return $char;
} # getc

sub ungetc ($$) {
  unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
} # ungetc

sub close ($) {
  shift->{handle}->close;
} # close

sub charset ($) {
  shift->{handle}->charset;
} # charset

sub has_bom ($) {
  shift->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  shift->{handle}->input_encoding;
} # input_encoding

sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
    }
  }

  return $_[0]->{onerror};
} # onerror

1;
1	wakaba	1.1	package Whatpm::Charset::UnicodeChecker;
2			use strict;
3
4			## NOTE: For more information (including rationals of checks performed
5			## in this module), see
6			## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8			## NOTE: Unicode's definition for character string conformance is
9			## very, very vague so that it is difficult to determine what error
10			## level is appropriate for each error. The Unicode Standard abuses
11			## conformance-creteria-like terms such as "deprecated", "discouraged",
12			## "preferred", "better", "not encouraged", "should", and so on with no
13			## clear explanation of their difference (if any) or relationship to
14			## the conformance. In fact, that specification does not define the
15			## conformance class for character strings.
16
17			sub new_handle ($$) {
18			my $self = bless {
19			queue => [],
20	wakaba	1.2	new_queue => [],
21	wakaba	1.1	onerror => sub {},
22			level => {
23			unicode_should => 'w',
24			unicode_deprecated => 'w', # = unicode_should
25			unicode_discouraged => 'w',
26			unicode_preferred => 'w',
27			## NOTE: We do some "unification" of levels - for example,
28			## "not encouraged" is unified with "discouraged" and
29			## "better" is unified with "preferred".
30			},
31			}, shift;
32			$self->{handle} = shift; # char stream
33			return $self;
34			} # new_handle
35
36			## TODO: We need to do some perf optimization
37
38			sub getc ($) {
39			my $self = $_[0];
40			return shift @{$self->{queue}} if @{$self->{queue}};
41	wakaba	1.2
42			my $char;
43			unless (@{$self->{new_queue}}) {
44			my $s = '';
45			$self->{handle}->read ($s, 256) or return undef;
46			push @{$self->{new_queue}}, split //, $s;
47			}
48			$char = shift @{$self->{new_queue}};
49
50	wakaba	1.1	my $char_code = ord $char;
51
52			if ({
53			0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
54			0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
55			0x206E => 1, 0x206F => 1, 0xE0001 => 1,
56			}->{$char_code} or
57			(0xE0020 <= $char_code and $char_code <= 0xE007F)) {
58			## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Deprecated).
59			$self->{onerror}->(type => 'unicode deprecated',
60			text => (sprintf 'U+%04X', $char_code),
61			layer => 'charset',
62			level => $self->{level}->{unicode_deprecated});
63			} elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
64			{
65			0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
66			0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
67			0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
68			0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
69			0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
70			0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
71			0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
72			0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
73			0x10FFFE => 1, 0x10FFFF => 1,
74			}->{$char_code}) {
75			## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Noncharacter_Code_Point).
76			$self->{onerror}->(type => 'nonchar',
77			text => (sprintf 'U+%04X', $char_code),
78			layer => 'charset',
79			level => $self->{level}->{unicode_should});
80			} elsif ({
81			0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
82			0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
83			0x20A4 => 1, # LIRA SIGN
84
85			0x2126 => 1, # OHM SIGN # also, discouraged
86			0x212A => 1, # KELVIN SIGN
87			0x212B => 1, # ANGSTROM SIGN
88			}->{$char_code} or
89			(0xFB50 <= $char_code and $char_code <= 0xFDFB) or
90			(0xFE70 <= $char_code and $char_code <= 0xFEFE) or
91			(0xFA30 <= $char_code and $char_code <= 0xFA6A) or
92			(0xFA70 <= $char_code and $char_code <= 0xFAD9) or
93			(0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
94			(0x239B <= $char_code and $char_code <= 0x23B3)) {
95			## NOTE: This case must come AFTER noncharacter checking, due to
96			## their range overwrap.
97			if ({
98			## In the Arabic Presentation Forms-A block, but no character is
99			## assigned in Unicode 5.1.
100			0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
101			0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
102			0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
103			0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
104			0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
105			0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
106			0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
107			0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
108			0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
109			0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
110			0xFD4F => 1,
111			0xFD90 => 1, 0xFD91 => 1,
112			0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
113			0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
114			# 0xFDD0-0xFDEF noncharacters
115
116			## In Arabic Presentation Forms-A block, but explicitly
117			## allowed
118			0xFD3E => 1, 0xFD3F => 1,
119
120			## In Arabic Presentation Forms-B block, unassigned
121			0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
122			}->{$char_code}) {
123			#
124			} else {
125			$self->{onerror}->(type => 'unicode should',
126			text => (sprintf 'U+%04X', $char_code),
127			layer => 'charset',
128			level => $self->{level}->{unicode_should});
129			}
130			} elsif ({
131			## Styled overlines/underlines in CJK Compatibility Forms
132			0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
133			0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
134
135			0x037E => 1, 0x0387 => 1, # greek punctuations
136
137			#0x17A3 => 1, # also, deprecated
138			0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,
139
140			0x2121 => 1, # tel
141			0x213B => 1, # fax
142			#0x2120 => 1, # SM (superscript)
143			#0x2122 => 1, # TM (superscript)
144
145			0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
146			}->{$char_code} or
147			(0x2153 <= $char_code and $char_code <= 0x217F)) {
148			$self->{onerror}->(type => 'unicode discouraged',
149			text => (sprintf 'U+%04X', $char_code),
150			layer => 'charset',
151			level => $self->{level}->{unicode_discouraged});
152			} elsif ({
153			0x055A => 1, 0x0559 =>1, # greek punctuations
154
155			0x2103 => 1, 0x2109 => 1, # degree signs
156
157			0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
158			}->{$char_code}) {
159			$self->{onerror}->(type => 'unicode not preferred',
160			text => (sprintf 'U+%04X', $char_code),
161			layer => 'charset',
162			level => $self->{level}->{unicode_preferred});
163			}
164
165			## TODO: "khanda ta" should be represented by U+09CE
166			## rather than <U+09A4, U+09CD, U+200D>.
167
168			## TODO: IDS syntax
169
170			## TODO: langtag syntax
171
172			return $char;
173			} # getc
174
175			sub ungetc ($$) {
176			unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
177			} # ungetc
178
179			sub close ($) {
180			shift->{handle}->close;
181			} # close
182
183			sub charset ($) {
184			shift->{handle}->charset;
185			} # charset
186
187			sub has_bom ($) {
188			shift->{handle}->has_bom;
189			} # has_bom
190
191			sub input_encoding ($) {
192			shift->{handle}->input_encoding;
193			} # input_encoding
194
195			sub onerror ($;$) {
196			if (@_ > 1) {
197			if (defined $_[1]) {
198			$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
199			} else {
200			$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
201			}
202			}
203
204			return $_[0]->{onerror};
205			} # onerror
206
207			1;