Whatpm/Charset/UnicodeChecker.pm

package Whatpm::Charset::UnicodeChecker;
use strict;

## NOTE: For more information (including rationals of checks performed
## in this module), see
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.

## NOTE: Unicode's definition for character string conformance is 
## very, very vague so that it is difficult to determine what error
## level is appropriate for each error.  The Unicode Standard abuses
## conformance-creteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

sub new_handle ($$) {
  my $self = bless {
    queue => [],
    onerror => sub {},
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".
    },
  }, shift;
  $self->{handle} = shift; # char stream
  return $self;
} # new_handle

## TODO: We need to do some perf optimization

sub getc ($) {
  my $self = $_[0];
  return shift @{$self->{queue}} if @{$self->{queue}};
  my $char = $self->{handle}->getc;
  my $char_code = ord $char;

  if ({
    0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
    0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
    0x206E => 1, 0x206F => 1, 0xE0001 => 1,
  }->{$char_code} or
  (0xE0020 <= $char_code and $char_code <= 0xE007F)) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated).
    $self->{onerror}->(type => 'unicode deprecated',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_deprecated});
  } elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
           {
             0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
             0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
             0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
             0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
             0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
             0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
             0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
             0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
             0x10FFFE => 1, 0x10FFFF => 1,
           }->{$char_code}) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point).
    $self->{onerror}->(type => 'nonchar',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_should});
  } elsif ({
            0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
            0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
            0x20A4 => 1, # LIRA SIGN

            0x2126 => 1, # OHM SIGN # also, discouraged
            0x212A => 1, # KELVIN SIGN
            0x212B => 1, # ANGSTROM SIGN
           }->{$char_code} or
           (0xFB50 <= $char_code and $char_code <= 0xFDFB) or
           (0xFE70 <= $char_code and $char_code <= 0xFEFE) or
           (0xFA30 <= $char_code and $char_code <= 0xFA6A) or
           (0xFA70 <= $char_code and $char_code <= 0xFAD9) or
           (0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
           (0x239B <= $char_code and $char_code <= 0x23B3)) {
    ## NOTE: This case must come AFTER noncharacter checking, due to
    ## their range overwrap.
    if ({
         ## In the Arabic Presentation Forms-A block, but no character is
         ## assigned in Unicode 5.1.
         0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
         0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
         0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
         0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
         0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
         0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
         0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
         0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
         0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
         0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
         0xFD4F => 1,
         0xFD90 => 1, 0xFD91 => 1,
         0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
         0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
         # 0xFDD0-0xFDEF noncharacters

         ## In Arabic Presentation Forms-A block, but explicitly
         ## allowed
         0xFD3E => 1, 0xFD3F => 1,

         ## In Arabic Presentation Forms-B block, unassigned
         0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
        }->{$char_code}) {
      #
    } else {
      $self->{onerror}->(type => 'unicode should',
                         text => (sprintf 'U+%04X', $char_code),
                         layer => 'charset',
                         level => $self->{level}->{unicode_should});
    }
  } elsif ({
            ## Styled overlines/underlines in CJK Compatibility Forms
            0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
            0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
            
            0x037E => 1, 0x0387 => 1, # greek punctuations
            
            #0x17A3 => 1, # also, deprecated
            0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,

            0x2121 => 1, # tel
            0x213B => 1, # fax
            #0x2120 => 1, # SM (superscript)
            #0x2122 => 1, # TM (superscript)

            0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
           }->{$char_code} or
           (0x2153 <= $char_code and $char_code <= 0x217F)) {
    $self->{onerror}->(type => 'unicode discouraged',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_discouraged});
  } elsif ({
            0x055A => 1, 0x0559 =>1, # greek punctuations
            
            0x2103 => 1, 0x2109 => 1, # degree signs

            0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
           }->{$char_code}) {
    $self->{onerror}->(type => 'unicode not preferred',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_preferred});
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax

  return $char;
} # getc

sub ungetc ($$) {
  unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
} # ungetc

sub close ($) {
  shift->{handle}->close;
} # close

sub charset ($) {
  shift->{handle}->charset;
} # charset

sub has_bom ($) {
  shift->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  shift->{handle}->input_encoding;
} # input_encoding

sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
    }
  }

  return $_[0]->{onerror};
} # onerror

1;
1	wakaba	1.1	package Whatpm::Charset::UnicodeChecker;
2			use strict;
3
4			## NOTE: For more information (including rationals of checks performed
5			## in this module), see
6			## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8			## NOTE: Unicode's definition for character string conformance is
9			## very, very vague so that it is difficult to determine what error
10			## level is appropriate for each error. The Unicode Standard abuses
11			## conformance-creteria-like terms such as "deprecated", "discouraged",
12			## "preferred", "better", "not encouraged", "should", and so on with no
13			## clear explanation of their difference (if any) or relationship to
14			## the conformance. In fact, that specification does not define the
15			## conformance class for character strings.
16
17			sub new_handle ($$) {
18			my $self = bless {
19			queue => [],
20			onerror => sub {},
21			level => {
22			unicode_should => 'w',
23			unicode_deprecated => 'w', # = unicode_should
24			unicode_discouraged => 'w',
25			unicode_preferred => 'w',
26			## NOTE: We do some "unification" of levels - for example,
27			## "not encouraged" is unified with "discouraged" and
28			## "better" is unified with "preferred".
29			},
30			}, shift;
31			$self->{handle} = shift; # char stream
32			return $self;
33			} # new_handle
34
35			## TODO: We need to do some perf optimization
36
37			sub getc ($) {
38			my $self = $_[0];
39			return shift @{$self->{queue}} if @{$self->{queue}};
40			my $char = $self->{handle}->getc;
41			my $char_code = ord $char;
42
43			if ({
44			0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
45			0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
46			0x206E => 1, 0x206F => 1, 0xE0001 => 1,
47			}->{$char_code} or
48			(0xE0020 <= $char_code and $char_code <= 0xE007F)) {
49			## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Deprecated).
50			$self->{onerror}->(type => 'unicode deprecated',
51			text => (sprintf 'U+%04X', $char_code),
52			layer => 'charset',
53			level => $self->{level}->{unicode_deprecated});
54			} elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
55			{
56			0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
57			0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
58			0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
59			0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
60			0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
61			0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
62			0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
63			0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
64			0x10FFFE => 1, 0x10FFFF => 1,
65			}->{$char_code}) {
66			## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Noncharacter_Code_Point).
67			$self->{onerror}->(type => 'nonchar',
68			text => (sprintf 'U+%04X', $char_code),
69			layer => 'charset',
70			level => $self->{level}->{unicode_should});
71			} elsif ({
72			0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
73			0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
74			0x20A4 => 1, # LIRA SIGN
75
76			0x2126 => 1, # OHM SIGN # also, discouraged
77			0x212A => 1, # KELVIN SIGN
78			0x212B => 1, # ANGSTROM SIGN
79			}->{$char_code} or
80			(0xFB50 <= $char_code and $char_code <= 0xFDFB) or
81			(0xFE70 <= $char_code and $char_code <= 0xFEFE) or
82			(0xFA30 <= $char_code and $char_code <= 0xFA6A) or
83			(0xFA70 <= $char_code and $char_code <= 0xFAD9) or
84			(0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
85			(0x239B <= $char_code and $char_code <= 0x23B3)) {
86			## NOTE: This case must come AFTER noncharacter checking, due to
87			## their range overwrap.
88			if ({
89			## In the Arabic Presentation Forms-A block, but no character is
90			## assigned in Unicode 5.1.
91			0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
92			0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
93			0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
94			0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
95			0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
96			0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
97			0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
98			0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
99			0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
100			0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
101			0xFD4F => 1,
102			0xFD90 => 1, 0xFD91 => 1,
103			0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
104			0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
105			# 0xFDD0-0xFDEF noncharacters
106
107			## In Arabic Presentation Forms-A block, but explicitly
108			## allowed
109			0xFD3E => 1, 0xFD3F => 1,
110
111			## In Arabic Presentation Forms-B block, unassigned
112			0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
113			}->{$char_code}) {
114			#
115			} else {
116			$self->{onerror}->(type => 'unicode should',
117			text => (sprintf 'U+%04X', $char_code),
118			layer => 'charset',
119			level => $self->{level}->{unicode_should});
120			}
121			} elsif ({
122			## Styled overlines/underlines in CJK Compatibility Forms
123			0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
124			0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
125
126			0x037E => 1, 0x0387 => 1, # greek punctuations
127
128			#0x17A3 => 1, # also, deprecated
129			0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,
130
131			0x2121 => 1, # tel
132			0x213B => 1, # fax
133			#0x2120 => 1, # SM (superscript)
134			#0x2122 => 1, # TM (superscript)
135
136			0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
137			}->{$char_code} or
138			(0x2153 <= $char_code and $char_code <= 0x217F)) {
139			$self->{onerror}->(type => 'unicode discouraged',
140			text => (sprintf 'U+%04X', $char_code),
141			layer => 'charset',
142			level => $self->{level}->{unicode_discouraged});
143			} elsif ({
144			0x055A => 1, 0x0559 =>1, # greek punctuations
145
146			0x2103 => 1, 0x2109 => 1, # degree signs
147
148			0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
149			}->{$char_code}) {
150			$self->{onerror}->(type => 'unicode not preferred',
151			text => (sprintf 'U+%04X', $char_code),
152			layer => 'charset',
153			level => $self->{level}->{unicode_preferred});
154			}
155
156			## TODO: "khanda ta" should be represented by U+09CE
157			## rather than <U+09A4, U+09CD, U+200D>.
158
159			## TODO: IDS syntax
160
161			## TODO: langtag syntax
162
163			return $char;
164			} # getc
165
166			sub ungetc ($$) {
167			unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
168			} # ungetc
169
170			sub close ($) {
171			shift->{handle}->close;
172			} # close
173
174			sub charset ($) {
175			shift->{handle}->charset;
176			} # charset
177
178			sub has_bom ($) {
179			shift->{handle}->has_bom;
180			} # has_bom
181
182			sub input_encoding ($) {
183			shift->{handle}->input_encoding;
184			} # input_encoding
185
186			sub onerror ($;$) {
187			if (@_ > 1) {
188			if (defined $_[1]) {
189			$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
190			} else {
191			$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
192			}
193			}
194
195			return $_[0]->{onerror};
196			} # onerror
197
198			1;