Whatpm/Charset/UnicodeChecker.pm

package Whatpm::Charset::UnicodeChecker;
use strict;

## NOTE: For more information (including rationals of checks performed
## in this module), see
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.

## NOTE: Unicode's definition for character string conformance is 
## very, very vague so that it is difficult to determine what error
## level is appropriate for each error.  The Unicode Standard abuses
## conformance-creteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

sub new_handle ($$) {
  my $self = bless {
    queue => [],
    new_queue => [],
    onerror => sub {},
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".
    },
  }, shift;
  $self->{handle} = shift; # char stream
  return $self;
} # new_handle

my $etypes = {
              0x0340 => 'unicode deprecated',
              0x0341 => 'unicode deprecated',
              0x17A3 => 'unicode deprecated',
              0x17D3 => 'unicode deprecated',
              0x206A => 'unicode deprecated',
              0x206B => 'unicode deprecated',
              0x206C => 'unicode deprecated',
              0x206D => 'unicode deprecated',
              0x206E => 'unicode deprecated',
              0x206F => 'unicode deprecated',
              0xE0001 => 'unicode deprecated',
              
              0xFFFE => 'nonchar',
              0xFFFF => 'nonchar',
              0x1FFFE => 'nonchar',
              0x1FFFF => 'nonchar',
              0x2FFFE => 'nonchar',
              0x2FFFF => 'nonchar',
              0x3FFFE => 'nonchar',
              0x3FFFF => 'nonchar',
              0x4FFFE => 'nonchar',
              0x4FFFF => 'nonchar',
              0x5FFFE => 'nonchar',
              0x5FFFF => 'nonchar',
              0x6FFFE => 'nonchar',
              0x6FFFF => 'nonchar',
              0x7FFFE => 'nonchar',
              0x7FFFF => 'nonchar',
              0x8FFFE => 'nonchar',
              0x8FFFF => 'nonchar',
              0x9FFFE => 'nonchar',
              0x9FFFF => 'nonchar',
              0xAFFFE => 'nonchar',
              0xAFFFF => 'nonchar',
              0xBFFFE => 'nonchar',
              0xBFFFF => 'nonchar',
              0xCFFFE => 'nonchar',
              0xCFFFF => 'nonchar',
              0xDFFFE => 'nonchar',
              0xDFFFF => 'nonchar',
              0xEFFFE => 'nonchar',
              0xEFFFF => 'nonchar',
              0xFFFFE => 'nonchar',
              0xFFFFF => 'nonchar',
              0x10FFFE => 'nonchar',
              0x10FFFF => 'nonchar',

              0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
              0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
              0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
              0x20A4 => 'unicode should', # LIRA SIGN
              
              0x2126 => 'unicode should', # OHM SIGN # also, discouraged
              0x212A => 'unicode should', # KELVIN SIGN
              0x212B => 'unicode should', # ANGSTROM SIGN
              
              ## Styled overlines/underlines in CJK Compatibility Forms
              0xFE49 => 'unicode discouraged',
              0xFE4A => 'unicode discouraged',
              0xFE4B => 'unicode discouraged',
              0xFE4C => 'unicode discouraged',
              0xFE4D => 'unicode discouraged',
              0xFE4E => 'unicode discouraged',
              0xFE4F => 'unicode discouraged',
              
              0x037E => 'unicode discouraged', # greek punctuations
              0x0387 => 'unicode discouraged', # greek punctuations
              
              #0x17A3 => 'unicode discouraged', # also, deprecated
              0x17A4 => 'unicode discouraged',
              0x17B4 => 'unicode discouraged',
              0x17B5 => 'unicode discouraged',
              0x17D8 => 'unicode discouraged',

              0x2121 => 'unicode discouraged', # tel
              0x213B => 'unicode discouraged', # fax
              #0x2120 => 'unicode discouraged', # SM (superscript)
              #0x2122 => 'unicode discouraged', # TM (superscript)

              ## inline annotations
              0xFFF9 => 'unicode discouraged',
              0xFFFA => 'unicode discouraged',
              0xFFFB => 'unicode discouraged',

              ## greek punctuations
              0x055A => 'unicode not preferred',
              0x0559 => 'unicode not preferred',
            
              ## degree signs
              0x2103 => 'unicode not preferred',
              0x2109 => 'unicode not preferred',
              
              ## strongly preferrs U+2060 WORD JOINTER
              0xFEFE => 'unicode not preferred',
             };

$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
$etypes->{$_} = 'unicode should'
    for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
        0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
        0xFE76 .. 0xFEFC;
    ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
    ## no character is assigned, noncharacter code points, and 
    ## U+FD3E and U+FD3F, which are explicitly allowed.
$etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;

my $levels = {
  'unicode deprecated' => 'unicode_deprecated',
  'nonchar' => 'unicode_should',
  'unicode should' => 'unicode_should',
  'unicode discouraged' => 'unicode_discouraged',
  'unicode not preferred' => 'unicode_preferred',
};

my $check_char = sub ($$) {
  my ($self, $char_code) = @_;

  if ($char_code == 0x000D) {
    $self->{line_diff}++;
    $self->{column_diff} = 0;
    $self->{set_column} = 1;
    $self->{has_cr} = 1;
    return;
  } elsif ($char_code == 0x000A) {
    if ($self->{has_cr}) {
      delete $self->{has_cr};
    } else {
      $self->{line_diff}++;
      $self->{column_diff} = 0;
      $self->{set_column} = 1;
    }
    return;
  } else {
    $self->{column_diff}++;
    delete $self->{has_cr};
  }

  ## TODO: $char_code > U+10FFFF
  
  my $etype = $etypes->{$char_code};
  if (defined $etype) {
    $self->{onerror}->(type => $etype,
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{$levels->{$etype}},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax
}; # $check_char

sub read ($$$;$) {
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->read (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    $check_char->($self, ord substr $_[0], $_, 1);
  }
  return $count;
} # read

sub manakai_read_until ($$$;$) {
  #my ($self, $scalar, $pattern, $offset) = @_;
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->manakai_read_until (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    $check_char->($self, ord substr $_[0], $_, 1);
  }
  return $count;
} # manakai_read_until

sub ungetc ($$) {
  unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
} # ungetc

sub close ($) {
  shift->{handle}->close;
} # close

sub charset ($) {
  shift->{handle}->charset;
} # charset

sub has_bom ($) {
  shift->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  shift->{handle}->input_encoding;
} # input_encoding

sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
    }
  }

  return $_[0]->{onerror};
} # onerror

1;
1	wakaba	1.1	package Whatpm::Charset::UnicodeChecker;
2			use strict;
3
4			## NOTE: For more information (including rationals of checks performed
5			## in this module), see
6			## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8			## NOTE: Unicode's definition for character string conformance is
9			## very, very vague so that it is difficult to determine what error
10			## level is appropriate for each error. The Unicode Standard abuses
11			## conformance-creteria-like terms such as "deprecated", "discouraged",
12			## "preferred", "better", "not encouraged", "should", and so on with no
13			## clear explanation of their difference (if any) or relationship to
14			## the conformance. In fact, that specification does not define the
15			## conformance class for character strings.
16
17			sub new_handle ($$) {
18			my $self = bless {
19			queue => [],
20	wakaba	1.2	new_queue => [],
21	wakaba	1.1	onerror => sub {},
22			level => {
23			unicode_should => 'w',
24			unicode_deprecated => 'w', # = unicode_should
25			unicode_discouraged => 'w',
26			unicode_preferred => 'w',
27			## NOTE: We do some "unification" of levels - for example,
28			## "not encouraged" is unified with "discouraged" and
29			## "better" is unified with "preferred".
30			},
31			}, shift;
32			$self->{handle} = shift; # char stream
33			return $self;
34			} # new_handle
35
36	wakaba	1.7	my $etypes = {
37			0x0340 => 'unicode deprecated',
38			0x0341 => 'unicode deprecated',
39			0x17A3 => 'unicode deprecated',
40			0x17D3 => 'unicode deprecated',
41			0x206A => 'unicode deprecated',
42			0x206B => 'unicode deprecated',
43			0x206C => 'unicode deprecated',
44			0x206D => 'unicode deprecated',
45			0x206E => 'unicode deprecated',
46			0x206F => 'unicode deprecated',
47			0xE0001 => 'unicode deprecated',
48
49			0xFFFE => 'nonchar',
50			0xFFFF => 'nonchar',
51			0x1FFFE => 'nonchar',
52			0x1FFFF => 'nonchar',
53			0x2FFFE => 'nonchar',
54			0x2FFFF => 'nonchar',
55			0x3FFFE => 'nonchar',
56			0x3FFFF => 'nonchar',
57			0x4FFFE => 'nonchar',
58			0x4FFFF => 'nonchar',
59			0x5FFFE => 'nonchar',
60			0x5FFFF => 'nonchar',
61			0x6FFFE => 'nonchar',
62			0x6FFFF => 'nonchar',
63			0x7FFFE => 'nonchar',
64			0x7FFFF => 'nonchar',
65			0x8FFFE => 'nonchar',
66			0x8FFFF => 'nonchar',
67			0x9FFFE => 'nonchar',
68			0x9FFFF => 'nonchar',
69			0xAFFFE => 'nonchar',
70			0xAFFFF => 'nonchar',
71			0xBFFFE => 'nonchar',
72			0xBFFFF => 'nonchar',
73			0xCFFFE => 'nonchar',
74			0xCFFFF => 'nonchar',
75			0xDFFFE => 'nonchar',
76			0xDFFFF => 'nonchar',
77			0xEFFFE => 'nonchar',
78			0xEFFFF => 'nonchar',
79			0xFFFFE => 'nonchar',
80			0xFFFFF => 'nonchar',
81			0x10FFFE => 'nonchar',
82			0x10FFFF => 'nonchar',
83
84			0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
85			0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
86			0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
87			0x20A4 => 'unicode should', # LIRA SIGN
88
89			0x2126 => 'unicode should', # OHM SIGN # also, discouraged
90			0x212A => 'unicode should', # KELVIN SIGN
91			0x212B => 'unicode should', # ANGSTROM SIGN
92
93			## Styled overlines/underlines in CJK Compatibility Forms
94			0xFE49 => 'unicode discouraged',
95			0xFE4A => 'unicode discouraged',
96			0xFE4B => 'unicode discouraged',
97			0xFE4C => 'unicode discouraged',
98			0xFE4D => 'unicode discouraged',
99			0xFE4E => 'unicode discouraged',
100			0xFE4F => 'unicode discouraged',
101
102			0x037E => 'unicode discouraged', # greek punctuations
103			0x0387 => 'unicode discouraged', # greek punctuations
104
105			#0x17A3 => 'unicode discouraged', # also, deprecated
106			0x17A4 => 'unicode discouraged',
107			0x17B4 => 'unicode discouraged',
108			0x17B5 => 'unicode discouraged',
109			0x17D8 => 'unicode discouraged',
110
111			0x2121 => 'unicode discouraged', # tel
112			0x213B => 'unicode discouraged', # fax
113			#0x2120 => 'unicode discouraged', # SM (superscript)
114			#0x2122 => 'unicode discouraged', # TM (superscript)
115
116			## inline annotations
117			0xFFF9 => 'unicode discouraged',
118			0xFFFA => 'unicode discouraged',
119			0xFFFB => 'unicode discouraged',
120
121			## greek punctuations
122			0x055A => 'unicode not preferred',
123			0x0559 => 'unicode not preferred',
124
125			## degree signs
126			0x2103 => 'unicode not preferred',
127			0x2109 => 'unicode not preferred',
128
129			## strongly preferrs U+2060 WORD JOINTER
130			0xFEFE => 'unicode not preferred',
131			};
132
133			$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
134			$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
135			$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
136			$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
137			$etypes->{$_} = 'unicode should'
138			for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
139			0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
140			0xFE76 .. 0xFEFC;
141			## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
142			## no character is assigned, noncharacter code points, and
143			## U+FD3E and U+FD3F, which are explicitly allowed.
144			$etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
145
146			my $levels = {
147			'unicode deprecated' => 'unicode_deprecated',
148			'nonchar' => 'unicode_should',
149			'unicode should' => 'unicode_should',
150			'unicode discouraged' => 'unicode_discouraged',
151			'unicode not preferred' => 'unicode_preferred',
152			};
153
154	wakaba	1.6	my $check_char = sub ($$) {
155			my ($self, $char_code) = @_;
156	wakaba	1.1
157	wakaba	1.6	if ($char_code == 0x000D) {
158			$self->{line_diff}++;
159			$self->{column_diff} = 0;
160			$self->{set_column} = 1;
161			$self->{has_cr} = 1;
162			return;
163			} elsif ($char_code == 0x000A) {
164			if ($self->{has_cr}) {
165			delete $self->{has_cr};
166			} else {
167			$self->{line_diff}++;
168			$self->{column_diff} = 0;
169			$self->{set_column} = 1;
170			}
171			return;
172			} else {
173			$self->{column_diff}++;
174			delete $self->{has_cr};
175			}
176	wakaba	1.7
177			## TODO: $char_code > U+10FFFF
178	wakaba	1.6
179	wakaba	1.7	my $etype = $etypes->{$char_code};
180			if (defined $etype) {
181			$self->{onerror}->(type => $etype,
182	wakaba	1.1	text => (sprintf 'U+%04X', $char_code),
183			layer => 'charset',
184	wakaba	1.7	level => $self->{level}->{$levels->{$etype}},
185	wakaba	1.6	line_diff => $self->{line_diff},
186			column_diff => $self->{column_diff},
187			($self->{set_column} ? (column => 1) : ()));
188	wakaba	1.1	}
189
190			## TODO: "khanda ta" should be represented by U+09CE
191			## rather than <U+09A4, U+09CD, U+200D>.
192
193			## TODO: IDS syntax
194
195			## TODO: langtag syntax
196	wakaba	1.6	}; # $check_char
197	wakaba	1.1
198	wakaba	1.6	sub read ($$$;$) {
199			my $self = shift;
200			my $offset = $_[2] \|\| 0;
201			my $count = $self->{handle}->read (@_);
202			$self->{line_diff} = 0;
203			$self->{column_diff} = -1;
204			delete $self->{set_column};
205			delete $self->{has_cr};
206			for ($offset .. ($offset + $count - 1)) {
207			$check_char->($self, ord substr $_[0], $_, 1);
208			}
209			return $count;
210			} # read
211	wakaba	1.4
212	wakaba	1.5	sub manakai_read_until ($$$;$) {
213			#my ($self, $scalar, $pattern, $offset) = @_;
214			my $self = shift;
215	wakaba	1.6	my $offset = $_[2] \|\| 0;
216			my $count = $self->{handle}->manakai_read_until (@_);
217			$self->{line_diff} = 0;
218			$self->{column_diff} = -1;
219			delete $self->{set_column};
220			delete $self->{has_cr};
221			for ($offset .. ($offset + $count - 1)) {
222			$check_char->($self, ord substr $_[0], $_, 1);
223	wakaba	1.4	}
224	wakaba	1.6	return $count;
225	wakaba	1.5	} # manakai_read_until
226	wakaba	1.1
227			sub ungetc ($$) {
228			unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
229			} # ungetc
230
231			sub close ($) {
232			shift->{handle}->close;
233			} # close
234
235			sub charset ($) {
236			shift->{handle}->charset;
237			} # charset
238
239			sub has_bom ($) {
240			shift->{handle}->has_bom;
241			} # has_bom
242
243			sub input_encoding ($) {
244			shift->{handle}->input_encoding;
245			} # input_encoding
246
247			sub onerror ($;$) {
248			if (@_ > 1) {
249			if (defined $_[1]) {
250			$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
251			} else {
252			$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
253			}
254			}
255
256			return $_[0]->{onerror};
257			} # onerror
258
259			1;