Whatpm/Charset/UnicodeChecker.pm

package Whatpm::Charset::UnicodeChecker;
use strict;

## NOTE: For more information (including rationals of checks performed
## in this module), see
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.

## NOTE: Unicode's definition for character string conformance is 
## very, very vague so that it is difficult to determine what error
## level is appropriate for each error.  The Unicode Standard abuses
## conformance-creteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

sub new_handle ($$) {
  my $self = bless {
    queue => [],
    new_queue => [],
    onerror => sub {},
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".
    },
  }, shift;
  $self->{handle} = shift; # char stream
  return $self;
} # new_handle

my $check_char = sub ($$) {
  my ($self, $char_code) = @_;

  if ($char_code == 0x000D) {
    $self->{line_diff}++;
    $self->{column_diff} = 0;
    $self->{set_column} = 1;
    $self->{has_cr} = 1;
    return;
  } elsif ($char_code == 0x000A) {
    if ($self->{has_cr}) {
      delete $self->{has_cr};
    } else {
      $self->{line_diff}++;
      $self->{column_diff} = 0;
      $self->{set_column} = 1;
    }
    return;
  } else {
    $self->{column_diff}++;
    delete $self->{has_cr};
  }
  
  if ({
       0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
       0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
       0x206E => 1, 0x206F => 1, 0xE0001 => 1,
      }->{$char_code} or
      (0xE0020 <= $char_code and $char_code <= 0xE007F)) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Deprecated).
    $self->{onerror}->(type => 'unicode deprecated',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_deprecated},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  } elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
           {
             0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
             0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
             0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
             0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
             0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
             0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
             0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
             0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
             0x10FFFE => 1, 0x10FFFF => 1,
           }->{$char_code}) {
    ## NOTE: From Unicode 5.1.0 |PropList.txt| (Noncharacter_Code_Point).
    $self->{onerror}->(type => 'nonchar',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_should},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  } elsif ({
            0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
            0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
            0x20A4 => 1, # LIRA SIGN

            0x2126 => 1, # OHM SIGN # also, discouraged
            0x212A => 1, # KELVIN SIGN
            0x212B => 1, # ANGSTROM SIGN
           }->{$char_code} or
           (0xFB50 <= $char_code and $char_code <= 0xFDFB) or
           (0xFE70 <= $char_code and $char_code <= 0xFEFE) or
           (0xFA30 <= $char_code and $char_code <= 0xFA6A) or
           (0xFA70 <= $char_code and $char_code <= 0xFAD9) or
           (0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
           (0x239B <= $char_code and $char_code <= 0x23B3)) {
    ## NOTE: This case must come AFTER noncharacter checking, due to
    ## their range overwrap.
    if ({
         ## In the Arabic Presentation Forms-A block, but no character is
         ## assigned in Unicode 5.1.
         0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
         0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
         0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
         0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
         0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
         0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
         0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
         0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
         0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
         0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
         0xFD4F => 1,
         0xFD90 => 1, 0xFD91 => 1,
         0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
         0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
         # 0xFDD0-0xFDEF noncharacters

         ## In Arabic Presentation Forms-A block, but explicitly
         ## allowed
         0xFD3E => 1, 0xFD3F => 1,

         ## In Arabic Presentation Forms-B block, unassigned
         0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
        }->{$char_code}) {
      #
    } else {
      $self->{onerror}->(type => 'unicode should',
                         text => (sprintf 'U+%04X', $char_code),
                         layer => 'charset',
                         level => $self->{level}->{unicode_should},
                         line_diff => $self->{line_diff},
                         column_diff => $self->{column_diff},
                         ($self->{set_column} ? (column => 1) : ()));
    }
  } elsif ({
            ## Styled overlines/underlines in CJK Compatibility Forms
            0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
            0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
            
            0x037E => 1, 0x0387 => 1, # greek punctuations
            
            #0x17A3 => 1, # also, deprecated
            0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,

            0x2121 => 1, # tel
            0x213B => 1, # fax
            #0x2120 => 1, # SM (superscript)
            #0x2122 => 1, # TM (superscript)

            0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
           }->{$char_code} or
           (0x2153 <= $char_code and $char_code <= 0x217F)) {
    $self->{onerror}->(type => 'unicode discouraged',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_discouraged},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  } elsif ({
            0x055A => 1, 0x0559 =>1, # greek punctuations
            
            0x2103 => 1, 0x2109 => 1, # degree signs

            0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
           }->{$char_code}) {
    $self->{onerror}->(type => 'unicode not preferred',
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{unicode_preferred},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax
}; # $check_char

sub read ($$$;$) {
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->read (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    $check_char->($self, ord substr $_[0], $_, 1);
  }
  return $count;
} # read

sub manakai_read_until ($$$;$) {
  #my ($self, $scalar, $pattern, $offset) = @_;
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->manakai_read_until (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    $check_char->($self, ord substr $_[0], $_, 1);
  }
  return $count;
} # manakai_read_until

sub ungetc ($$) {
  unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
} # ungetc

sub close ($) {
  shift->{handle}->close;
} # close

sub charset ($) {
  shift->{handle}->charset;
} # charset

sub has_bom ($) {
  shift->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  shift->{handle}->input_encoding;
} # input_encoding

sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
    }
  }

  return $_[0]->{onerror};
} # onerror

1;
1	wakaba	1.1	package Whatpm::Charset::UnicodeChecker;
2			use strict;
3
4			## NOTE: For more information (including rationals of checks performed
5			## in this module), see
6			## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8			## NOTE: Unicode's definition for character string conformance is
9			## very, very vague so that it is difficult to determine what error
10			## level is appropriate for each error. The Unicode Standard abuses
11			## conformance-creteria-like terms such as "deprecated", "discouraged",
12			## "preferred", "better", "not encouraged", "should", and so on with no
13			## clear explanation of their difference (if any) or relationship to
14			## the conformance. In fact, that specification does not define the
15			## conformance class for character strings.
16
17			sub new_handle ($$) {
18			my $self = bless {
19			queue => [],
20	wakaba	1.2	new_queue => [],
21	wakaba	1.1	onerror => sub {},
22			level => {
23			unicode_should => 'w',
24			unicode_deprecated => 'w', # = unicode_should
25			unicode_discouraged => 'w',
26			unicode_preferred => 'w',
27			## NOTE: We do some "unification" of levels - for example,
28			## "not encouraged" is unified with "discouraged" and
29			## "better" is unified with "preferred".
30			},
31			}, shift;
32			$self->{handle} = shift; # char stream
33			return $self;
34			} # new_handle
35
36	wakaba	1.6	my $check_char = sub ($$) {
37			my ($self, $char_code) = @_;
38	wakaba	1.1
39	wakaba	1.6	if ($char_code == 0x000D) {
40			$self->{line_diff}++;
41			$self->{column_diff} = 0;
42			$self->{set_column} = 1;
43			$self->{has_cr} = 1;
44			return;
45			} elsif ($char_code == 0x000A) {
46			if ($self->{has_cr}) {
47			delete $self->{has_cr};
48			} else {
49			$self->{line_diff}++;
50			$self->{column_diff} = 0;
51			$self->{set_column} = 1;
52			}
53			return;
54			} else {
55			$self->{column_diff}++;
56			delete $self->{has_cr};
57			}
58
59	wakaba	1.1	if ({
60	wakaba	1.6	0x0340 => 1, 0x0341 => 1, 0x17A3 => 1, 0x17D3 => 1,
61			0x206A => 1, 0x206B => 1, 0x206C => 1, 0x206D => 1,
62			0x206E => 1, 0x206F => 1, 0xE0001 => 1,
63			}->{$char_code} or
64			(0xE0020 <= $char_code and $char_code <= 0xE007F)) {
65	wakaba	1.1	## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Deprecated).
66			$self->{onerror}->(type => 'unicode deprecated',
67			text => (sprintf 'U+%04X', $char_code),
68			layer => 'charset',
69	wakaba	1.6	level => $self->{level}->{unicode_deprecated},
70			line_diff => $self->{line_diff},
71			column_diff => $self->{column_diff},
72			($self->{set_column} ? (column => 1) : ()));
73	wakaba	1.1	} elsif ((0xFDD0 <= $char_code and $char_code <= 0xFDDF) or
74			{
75			0xFFFE => 1, 0xFFFF => 1, 0x1FFFE => 1, 0x1FFFF => 1,
76			0x2FFFE => 1, 0x2FFFF => 1, 0x3FFFE => 1, 0x3FFFF => 1,
77			0x4FFFE => 1, 0x4FFFF => 1, 0x5FFFE => 1, 0x5FFFF => 1,
78			0x6FFFE => 1, 0x6FFFF => 1, 0x7FFFE => 1, 0x7FFFF => 1,
79			0x8FFFE => 1, 0x8FFFF => 1, 0x9FFFE => 1, 0x9FFFF => 1,
80			0xAFFFE => 1, 0xAFFFF => 1, 0xBFFFE => 1, 0xBFFFF => 1,
81			0xCFFFE => 1, 0xCFFFF => 1, 0xDFFFE => 1, 0xDFFFF => 1,
82			0xEFFFE => 1, 0xEFFFF => 1, 0xFFFFE => 1, 0xFFFFF => 1,
83			0x10FFFE => 1, 0x10FFFF => 1,
84			}->{$char_code}) {
85			## NOTE: From Unicode 5.1.0 \|PropList.txt\| (Noncharacter_Code_Point).
86			$self->{onerror}->(type => 'nonchar',
87			text => (sprintf 'U+%04X', $char_code),
88			layer => 'charset',
89	wakaba	1.6	level => $self->{level}->{unicode_should},
90			line_diff => $self->{line_diff},
91			column_diff => $self->{column_diff},
92			($self->{set_column} ? (column => 1) : ()));
93	wakaba	1.1	} elsif ({
94			0x0344 => 1, # COMBINING GREEK DIALYTIKA TONOS
95			0x03D3 => 1, 0x03D4 => 1, # GREEK UPSILON WITH ...
96			0x20A4 => 1, # LIRA SIGN
97
98			0x2126 => 1, # OHM SIGN # also, discouraged
99			0x212A => 1, # KELVIN SIGN
100			0x212B => 1, # ANGSTROM SIGN
101			}->{$char_code} or
102			(0xFB50 <= $char_code and $char_code <= 0xFDFB) or
103			(0xFE70 <= $char_code and $char_code <= 0xFEFE) or
104			(0xFA30 <= $char_code and $char_code <= 0xFA6A) or
105			(0xFA70 <= $char_code and $char_code <= 0xFAD9) or
106			(0x2F800 <= $char_code and $char_code <= 0x2FA1D) or
107			(0x239B <= $char_code and $char_code <= 0x23B3)) {
108			## NOTE: This case must come AFTER noncharacter checking, due to
109			## their range overwrap.
110			if ({
111			## In the Arabic Presentation Forms-A block, but no character is
112			## assigned in Unicode 5.1.
113			0xFBB2 => 1, 0xFBB3 => 1, 0xFBB4 => 1, 0xFBB5 => 1, 0xFBB6 => 1,
114			0xFBB7 => 1, 0xFBB8 => 1, 0xFBB9 => 1, 0xFBBA => 1, 0xFBBB => 1,
115			0xFBBC => 1, 0xFBBD => 1, 0xFBBE => 1, 0xFBBF => 1, 0xFBC0 => 1,
116			0xFBC1 => 1, 0xFBC2 => 1, 0xFBC3 => 1, 0xFBC4 => 1, 0xFBC5 => 1,
117			0xFBC6 => 1, 0xFBC7 => 1, 0xFBC8 => 1, 0xFBC9 => 1, 0xFBCA => 1,
118			0xFBCB => 1, 0xFBCC => 1, 0xFBCD => 1, 0xFBCE => 1, 0xFBCF => 1,
119			0xFBD0 => 1, 0xFBD1 => 1, 0xFBD2 => 1,
120			0xFD40 => 1, 0xFD41 => 1, 0xFD42 => 1, 0xFD43 => 1, 0xFD44 => 1,
121			0xFD45 => 1, 0xFD46 => 1, 0xFD47 => 1, 0xFD48 => 1, 0xFD49 => 1,
122			0xFD4A => 1, 0xFD4B => 1, 0xFD4C => 1, 0xFD4D => 1, 0xFD4E => 1,
123			0xFD4F => 1,
124			0xFD90 => 1, 0xFD91 => 1,
125			0xFDC8 => 1, 0xFDC9 => 1, 0xFDCA => 1, 0xFDCB => 1, 0xFDCC => 1,
126			0xFDCD => 1, 0xFDCE => 1, 0xFDCF => 1,
127			# 0xFDD0-0xFDEF noncharacters
128
129			## In Arabic Presentation Forms-A block, but explicitly
130			## allowed
131			0xFD3E => 1, 0xFD3F => 1,
132
133			## In Arabic Presentation Forms-B block, unassigned
134			0xFE75 => 1, 0xFEFD => 1, 0xFEFE => 1,
135			}->{$char_code}) {
136			#
137			} else {
138			$self->{onerror}->(type => 'unicode should',
139			text => (sprintf 'U+%04X', $char_code),
140			layer => 'charset',
141	wakaba	1.6	level => $self->{level}->{unicode_should},
142			line_diff => $self->{line_diff},
143			column_diff => $self->{column_diff},
144			($self->{set_column} ? (column => 1) : ()));
145	wakaba	1.1	}
146			} elsif ({
147			## Styled overlines/underlines in CJK Compatibility Forms
148			0xFE49 => 1, 0xFE4A => 1, 0xFE4B => 1, 0xFE4C => 1,
149			0xFE4D => 1, 0xFE4E => 1, 0xFE4F => 1,
150
151			0x037E => 1, 0x0387 => 1, # greek punctuations
152
153			#0x17A3 => 1, # also, deprecated
154			0x17A4 => 1, 0x17B4 => 1, 0x17B5 => 1, 0x17D8 => 1,
155
156			0x2121 => 1, # tel
157			0x213B => 1, # fax
158			#0x2120 => 1, # SM (superscript)
159			#0x2122 => 1, # TM (superscript)
160
161			0xFFF9 => 1, 0xFFFA => 1, 0xFFFB => 1, # inline annotations
162			}->{$char_code} or
163			(0x2153 <= $char_code and $char_code <= 0x217F)) {
164			$self->{onerror}->(type => 'unicode discouraged',
165			text => (sprintf 'U+%04X', $char_code),
166			layer => 'charset',
167	wakaba	1.6	level => $self->{level}->{unicode_discouraged},
168			line_diff => $self->{line_diff},
169			column_diff => $self->{column_diff},
170			($self->{set_column} ? (column => 1) : ()));
171	wakaba	1.1	} elsif ({
172			0x055A => 1, 0x0559 =>1, # greek punctuations
173
174			0x2103 => 1, 0x2109 => 1, # degree signs
175
176			0xFEFE => 1, # strongly preferrs U+2060 WORD JOINTER
177			}->{$char_code}) {
178			$self->{onerror}->(type => 'unicode not preferred',
179			text => (sprintf 'U+%04X', $char_code),
180			layer => 'charset',
181	wakaba	1.6	level => $self->{level}->{unicode_preferred},
182			line_diff => $self->{line_diff},
183			column_diff => $self->{column_diff},
184			($self->{set_column} ? (column => 1) : ()));
185	wakaba	1.1	}
186
187			## TODO: "khanda ta" should be represented by U+09CE
188			## rather than <U+09A4, U+09CD, U+200D>.
189
190			## TODO: IDS syntax
191
192			## TODO: langtag syntax
193	wakaba	1.6	}; # $check_char
194	wakaba	1.1
195	wakaba	1.6	sub read ($$$;$) {
196			my $self = shift;
197			my $offset = $_[2] \|\| 0;
198			my $count = $self->{handle}->read (@_);
199			$self->{line_diff} = 0;
200			$self->{column_diff} = -1;
201			delete $self->{set_column};
202			delete $self->{has_cr};
203			for ($offset .. ($offset + $count - 1)) {
204			$check_char->($self, ord substr $_[0], $_, 1);
205			}
206			return $count;
207			} # read
208	wakaba	1.4
209	wakaba	1.5	sub manakai_read_until ($$$;$) {
210			#my ($self, $scalar, $pattern, $offset) = @_;
211			my $self = shift;
212	wakaba	1.6	my $offset = $_[2] \|\| 0;
213			my $count = $self->{handle}->manakai_read_until (@_);
214			$self->{line_diff} = 0;
215			$self->{column_diff} = -1;
216			delete $self->{set_column};
217			delete $self->{has_cr};
218			for ($offset .. ($offset + $count - 1)) {
219			$check_char->($self, ord substr $_[0], $_, 1);
220	wakaba	1.4	}
221	wakaba	1.6	return $count;
222	wakaba	1.5	} # manakai_read_until
223	wakaba	1.1
224			sub ungetc ($$) {
225			unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
226			} # ungetc
227
228			sub close ($) {
229			shift->{handle}->close;
230			} # close
231
232			sub charset ($) {
233			shift->{handle}->charset;
234			} # charset
235
236			sub has_bom ($) {
237			shift->{handle}->has_bom;
238			} # has_bom
239
240			sub input_encoding ($) {
241			shift->{handle}->input_encoding;
242			} # input_encoding
243
244			sub onerror ($;$) {
245			if (@_ > 1) {
246			if (defined $_[1]) {
247			$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
248			} else {
249			$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
250			}
251			}
252
253			return $_[0]->{onerror};
254			} # onerror
255
256			1;