Whatpm/Charset/UnicodeChecker.pm

package Whatpm::Charset::UnicodeChecker;
use strict;

## NOTE: For more information (including rationals of checks performed
## in this module), see
## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.

## NOTE: Unicode's definition for character string conformance is 
## very, very vague so that it is difficult to determine what error
## level is appropriate for each error.  The Unicode Standard abuses
## conformance-creteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

sub new_handle ($$;$) {
  my $self = bless {
    queue => [],
    new_queue => [],
    onerror => sub {},
    #onerror_set
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".

      must => 'm',
      warn => 'w',
    },
  }, $_[0];
  $self->{handle} = $_[1]; # char stream
  my $mode = $_[2] || 'default'; # or 'html5'
  $self->{level_map} = {
    ## Unicode errors
    'unicode deprecated' => 'unicode_deprecated',
    'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should',
        ## NOTE: HTML5 parse error.
    'unicode should' => 'unicode_should',
    'unicode discouraged' => 'unicode_discouraged',
    'unicode not preferred' => 'unicode_preferred',

    ## HTML5 errors (See "text" definition of the spec)
    'control char' => $mode eq 'html5' ? 'must' : 'warn',
        ## NOTE: HTML5 parse error.
    'non unicode' => $mode eq 'html5' ? 'must' : 'warn',
        ## NOTE: In HTML5, replaced by U+FFFD (not a parse error).
  };
  $self->{replace_non_unicode} = ($mode eq 'html5');
  return $self;
} # new_handle

my $etypes = {
              0x0340 => 'unicode deprecated',
              0x0341 => 'unicode deprecated',
              0x17A3 => 'unicode deprecated',
              0x17D3 => 'unicode deprecated',
              0x206A => 'unicode deprecated',
              0x206B => 'unicode deprecated',
              0x206C => 'unicode deprecated',
              0x206D => 'unicode deprecated',
              0x206E => 'unicode deprecated',
              0x206F => 'unicode deprecated',
              0xE0001 => 'unicode deprecated',
              
              0xFFFE => 'nonchar',
              0xFFFF => 'nonchar',
              0x1FFFE => 'nonchar',
              0x1FFFF => 'nonchar',
              0x2FFFE => 'nonchar',
              0x2FFFF => 'nonchar',
              0x3FFFE => 'nonchar',
              0x3FFFF => 'nonchar',
              0x4FFFE => 'nonchar',
              0x4FFFF => 'nonchar',
              0x5FFFE => 'nonchar',
              0x5FFFF => 'nonchar',
              0x6FFFE => 'nonchar',
              0x6FFFF => 'nonchar',
              0x7FFFE => 'nonchar',
              0x7FFFF => 'nonchar',
              0x8FFFE => 'nonchar',
              0x8FFFF => 'nonchar',
              0x9FFFE => 'nonchar',
              0x9FFFF => 'nonchar',
              0xAFFFE => 'nonchar',
              0xAFFFF => 'nonchar',
              0xBFFFE => 'nonchar',
              0xBFFFF => 'nonchar',
              0xCFFFE => 'nonchar',
              0xCFFFF => 'nonchar',
              0xDFFFE => 'nonchar',
              0xDFFFF => 'nonchar',
              0xEFFFE => 'nonchar',
              0xEFFFF => 'nonchar',
              0xFFFFE => 'nonchar',
              0xFFFFF => 'nonchar',
              0x10FFFE => 'nonchar',
              0x10FFFF => 'nonchar',

              0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
              0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
              0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
              0x20A4 => 'unicode should', # LIRA SIGN
              
              0x2126 => 'unicode should', # OHM SIGN # also, discouraged
              0x212A => 'unicode should', # KELVIN SIGN
              0x212B => 'unicode should', # ANGSTROM SIGN
              
              ## Styled overlines/underlines in CJK Compatibility Forms
              0xFE49 => 'unicode discouraged',
              0xFE4A => 'unicode discouraged',
              0xFE4B => 'unicode discouraged',
              0xFE4C => 'unicode discouraged',
              0xFE4D => 'unicode discouraged',
              0xFE4E => 'unicode discouraged',
              0xFE4F => 'unicode discouraged',
              
              0x037E => 'unicode discouraged', # greek punctuations
              0x0387 => 'unicode discouraged', # greek punctuations
              
              #0x17A3 => 'unicode discouraged', # also, deprecated
              0x17A4 => 'unicode discouraged',
              0x17B4 => 'unicode discouraged',
              0x17B5 => 'unicode discouraged',
              0x17D8 => 'unicode discouraged',

              0x2121 => 'unicode discouraged', # tel
              0x213B => 'unicode discouraged', # fax
              #0x2120 => 'unicode discouraged', # SM (superscript)
              #0x2122 => 'unicode discouraged', # TM (superscript)

              ## inline annotations
              0xFFF9 => 'unicode discouraged',
              0xFFFA => 'unicode discouraged',
              0xFFFB => 'unicode discouraged',

              ## greek punctuations
              0x055A => 'unicode not preferred',
              0x0559 => 'unicode not preferred',
            
              ## degree signs
              0x2103 => 'unicode not preferred',
              0x2109 => 'unicode not preferred',
              
              ## strongly preferrs U+2060 WORD JOINTER
              0xFEFE => 'unicode not preferred',
             };

$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
    ## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5.
$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
$etypes->{$_} = 'unicode should'
    for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
        0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
        0xFE76 .. 0xFEFC;
    ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
    ## no character is assigned, noncharacter code points, and 
    ## U+FD3E and U+FD3F, which are explicitly allowed.
$etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
$etypes->{$_} = 'control char'
    for 0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F;
$etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF;

my $check_char = sub ($$) {
  my ($self, $char_code) = @_;

  ## NOTE: Negative $char_code is not supported.

  if ($char_code == 0x000D) {
    $self->{line_diff}++;
    $self->{column_diff} = 0;
    $self->{set_column} = 1;
    $self->{has_cr} = 1;
    return;
  } elsif ($char_code == 0x000A) {
    if ($self->{has_cr}) {
      delete $self->{has_cr};
    } else {
      $self->{line_diff}++;
      $self->{column_diff} = 0;
      $self->{set_column} = 1;
    }
    return;
  } else {
    $self->{column_diff}++;
    delete $self->{has_cr};
  }

  if ($char_code > 0x10FFFF) {
    $self->{onerror}->(type => 'non unicode',
                       text => (sprintf 'U-%08X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{$self->{level_map}->{'non unicode'}},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
    if ($self->{replace_non_unicode}) {
      return "\x{FFFD}";
    } else {
      return;
    }
  }
  
  my $etype = $etypes->{$char_code};
  if (defined $etype) {
    $self->{onerror}->(type => $etype,
                       text => (sprintf 'U+%04X', $char_code),
                       layer => 'charset',
                       level => $self->{level}->{$self->{level_map}->{$etype}},
                       line_diff => $self->{line_diff},
                       column_diff => $self->{column_diff},
                       ($self->{set_column} ? (column => 1) : ()));
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax

  return;
}; # $check_char

sub read ($$$;$) {
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->read (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    my $c = $check_char->($self, ord substr $_[0], $_, 1);
    if (defined $c) {
      substr ($_[0], $_, 1) = $c;
    }
  }
  return $count;
} # read

sub manakai_read_until ($$$;$) {
  #my ($self, $scalar, $pattern, $offset) = @_;
  my $self = shift;
  my $offset = $_[2] || 0;
  my $count = $self->{handle}->manakai_read_until (@_);
  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};
  for ($offset .. ($offset + $count - 1)) {
    my $c = $check_char->($self, ord substr $_[0], $_, 1);
    if (defined $c) {
      substr ($_[0], $_, 1) = $c;
    }
  }
  return $count;
} # manakai_read_until

sub ungetc ($$) {
  unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
} # ungetc

sub close ($) {
  shift->{handle}->close;
} # close

sub charset ($) {
  shift->{handle}->charset;
} # charset

sub has_bom ($) {
  shift->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  shift->{handle}->input_encoding;
} # input_encoding

sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
      $_[0]->{onerror_set} = 1;
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
      delete $_[0]->{onerror_set};
    }
  }

  return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef;
} # onerror

1;
1	wakaba	1.1	package Whatpm::Charset::UnicodeChecker;
2			use strict;
3
4			## NOTE: For more information (including rationals of checks performed
5			## in this module), see
6			## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>.
7
8			## NOTE: Unicode's definition for character string conformance is
9			## very, very vague so that it is difficult to determine what error
10			## level is appropriate for each error. The Unicode Standard abuses
11			## conformance-creteria-like terms such as "deprecated", "discouraged",
12			## "preferred", "better", "not encouraged", "should", and so on with no
13			## clear explanation of their difference (if any) or relationship to
14			## the conformance. In fact, that specification does not define the
15			## conformance class for character strings.
16
17	wakaba	1.8	sub new_handle ($$;$) {
18	wakaba	1.1	my $self = bless {
19			queue => [],
20	wakaba	1.2	new_queue => [],
21	wakaba	1.1	onerror => sub {},
22	wakaba	1.8	#onerror_set
23	wakaba	1.1	level => {
24			unicode_should => 'w',
25			unicode_deprecated => 'w', # = unicode_should
26			unicode_discouraged => 'w',
27			unicode_preferred => 'w',
28			## NOTE: We do some "unification" of levels - for example,
29			## "not encouraged" is unified with "discouraged" and
30			## "better" is unified with "preferred".
31	wakaba	1.8
32			must => 'm',
33			warn => 'w',
34	wakaba	1.1	},
35	wakaba	1.8	}, $_[0];
36			$self->{handle} = $_[1]; # char stream
37			my $mode = $_[2] \|\| 'default'; # or 'html5'
38			$self->{level_map} = {
39			## Unicode errors
40			'unicode deprecated' => 'unicode_deprecated',
41			'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should',
42			## NOTE: HTML5 parse error.
43			'unicode should' => 'unicode_should',
44			'unicode discouraged' => 'unicode_discouraged',
45			'unicode not preferred' => 'unicode_preferred',
46
47			## HTML5 errors (See "text" definition of the spec)
48			'control char' => $mode eq 'html5' ? 'must' : 'warn',
49			## NOTE: HTML5 parse error.
50			'non unicode' => $mode eq 'html5' ? 'must' : 'warn',
51			## NOTE: In HTML5, replaced by U+FFFD (not a parse error).
52			};
53			$self->{replace_non_unicode} = ($mode eq 'html5');
54	wakaba	1.1	return $self;
55			} # new_handle
56
57	wakaba	1.7	my $etypes = {
58			0x0340 => 'unicode deprecated',
59			0x0341 => 'unicode deprecated',
60			0x17A3 => 'unicode deprecated',
61			0x17D3 => 'unicode deprecated',
62			0x206A => 'unicode deprecated',
63			0x206B => 'unicode deprecated',
64			0x206C => 'unicode deprecated',
65			0x206D => 'unicode deprecated',
66			0x206E => 'unicode deprecated',
67			0x206F => 'unicode deprecated',
68			0xE0001 => 'unicode deprecated',
69
70			0xFFFE => 'nonchar',
71			0xFFFF => 'nonchar',
72			0x1FFFE => 'nonchar',
73			0x1FFFF => 'nonchar',
74			0x2FFFE => 'nonchar',
75			0x2FFFF => 'nonchar',
76			0x3FFFE => 'nonchar',
77			0x3FFFF => 'nonchar',
78			0x4FFFE => 'nonchar',
79			0x4FFFF => 'nonchar',
80			0x5FFFE => 'nonchar',
81			0x5FFFF => 'nonchar',
82			0x6FFFE => 'nonchar',
83			0x6FFFF => 'nonchar',
84			0x7FFFE => 'nonchar',
85			0x7FFFF => 'nonchar',
86			0x8FFFE => 'nonchar',
87			0x8FFFF => 'nonchar',
88			0x9FFFE => 'nonchar',
89			0x9FFFF => 'nonchar',
90			0xAFFFE => 'nonchar',
91			0xAFFFF => 'nonchar',
92			0xBFFFE => 'nonchar',
93			0xBFFFF => 'nonchar',
94			0xCFFFE => 'nonchar',
95			0xCFFFF => 'nonchar',
96			0xDFFFE => 'nonchar',
97			0xDFFFF => 'nonchar',
98			0xEFFFE => 'nonchar',
99			0xEFFFF => 'nonchar',
100			0xFFFFE => 'nonchar',
101			0xFFFFF => 'nonchar',
102			0x10FFFE => 'nonchar',
103			0x10FFFF => 'nonchar',
104
105			0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
106			0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
107			0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
108			0x20A4 => 'unicode should', # LIRA SIGN
109
110			0x2126 => 'unicode should', # OHM SIGN # also, discouraged
111			0x212A => 'unicode should', # KELVIN SIGN
112			0x212B => 'unicode should', # ANGSTROM SIGN
113
114			## Styled overlines/underlines in CJK Compatibility Forms
115			0xFE49 => 'unicode discouraged',
116			0xFE4A => 'unicode discouraged',
117			0xFE4B => 'unicode discouraged',
118			0xFE4C => 'unicode discouraged',
119			0xFE4D => 'unicode discouraged',
120			0xFE4E => 'unicode discouraged',
121			0xFE4F => 'unicode discouraged',
122
123			0x037E => 'unicode discouraged', # greek punctuations
124			0x0387 => 'unicode discouraged', # greek punctuations
125
126			#0x17A3 => 'unicode discouraged', # also, deprecated
127			0x17A4 => 'unicode discouraged',
128			0x17B4 => 'unicode discouraged',
129			0x17B5 => 'unicode discouraged',
130			0x17D8 => 'unicode discouraged',
131
132			0x2121 => 'unicode discouraged', # tel
133			0x213B => 'unicode discouraged', # fax
134			#0x2120 => 'unicode discouraged', # SM (superscript)
135			#0x2122 => 'unicode discouraged', # TM (superscript)
136
137			## inline annotations
138			0xFFF9 => 'unicode discouraged',
139			0xFFFA => 'unicode discouraged',
140			0xFFFB => 'unicode discouraged',
141
142			## greek punctuations
143			0x055A => 'unicode not preferred',
144			0x0559 => 'unicode not preferred',
145
146			## degree signs
147			0x2103 => 'unicode not preferred',
148			0x2109 => 'unicode not preferred',
149
150			## strongly preferrs U+2060 WORD JOINTER
151			0xFEFE => 'unicode not preferred',
152			};
153
154			$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;
155			$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
156	wakaba	1.8	## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5.
157	wakaba	1.7	$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
158			$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
159			$etypes->{$_} = 'unicode should'
160			for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
161			0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
162			0xFE76 .. 0xFEFC;
163			## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
164			## no character is assigned, noncharacter code points, and
165			## U+FD3E and U+FD3F, which are explicitly allowed.
166			$etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F;
167	wakaba	1.8	$etypes->{$_} = 'control char'
168	wakaba	1.9	for 0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F;
169	wakaba	1.8	$etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF;
170	wakaba	1.7
171	wakaba	1.6	my $check_char = sub ($$) {
172			my ($self, $char_code) = @_;
173	wakaba	1.1
174	wakaba	1.8	## NOTE: Negative $char_code is not supported.
175
176	wakaba	1.6	if ($char_code == 0x000D) {
177			$self->{line_diff}++;
178			$self->{column_diff} = 0;
179			$self->{set_column} = 1;
180			$self->{has_cr} = 1;
181			return;
182			} elsif ($char_code == 0x000A) {
183			if ($self->{has_cr}) {
184			delete $self->{has_cr};
185			} else {
186			$self->{line_diff}++;
187			$self->{column_diff} = 0;
188			$self->{set_column} = 1;
189			}
190			return;
191			} else {
192			$self->{column_diff}++;
193			delete $self->{has_cr};
194			}
195	wakaba	1.7
196	wakaba	1.8	if ($char_code > 0x10FFFF) {
197			$self->{onerror}->(type => 'non unicode',
198			text => (sprintf 'U-%08X', $char_code),
199			layer => 'charset',
200			level => $self->{level}->{$self->{level_map}->{'non unicode'}},
201			line_diff => $self->{line_diff},
202			column_diff => $self->{column_diff},
203			($self->{set_column} ? (column => 1) : ()));
204			if ($self->{replace_non_unicode}) {
205			return "\x{FFFD}";
206			} else {
207			return;
208			}
209			}
210	wakaba	1.6
211	wakaba	1.7	my $etype = $etypes->{$char_code};
212			if (defined $etype) {
213			$self->{onerror}->(type => $etype,
214	wakaba	1.1	text => (sprintf 'U+%04X', $char_code),
215			layer => 'charset',
216	wakaba	1.8	level => $self->{level}->{$self->{level_map}->{$etype}},
217	wakaba	1.6	line_diff => $self->{line_diff},
218			column_diff => $self->{column_diff},
219			($self->{set_column} ? (column => 1) : ()));
220	wakaba	1.1	}
221
222			## TODO: "khanda ta" should be represented by U+09CE
223			## rather than <U+09A4, U+09CD, U+200D>.
224
225			## TODO: IDS syntax
226
227			## TODO: langtag syntax
228	wakaba	1.8
229			return;
230	wakaba	1.6	}; # $check_char
231	wakaba	1.1
232	wakaba	1.6	sub read ($$$;$) {
233			my $self = shift;
234			my $offset = $_[2] \|\| 0;
235			my $count = $self->{handle}->read (@_);
236			$self->{line_diff} = 0;
237			$self->{column_diff} = -1;
238			delete $self->{set_column};
239			delete $self->{has_cr};
240			for ($offset .. ($offset + $count - 1)) {
241	wakaba	1.8	my $c = $check_char->($self, ord substr $_[0], $_, 1);
242			if (defined $c) {
243			substr ($_[0], $_, 1) = $c;
244			}
245	wakaba	1.6	}
246			return $count;
247			} # read
248	wakaba	1.4
249	wakaba	1.5	sub manakai_read_until ($$$;$) {
250			#my ($self, $scalar, $pattern, $offset) = @_;
251			my $self = shift;
252	wakaba	1.6	my $offset = $_[2] \|\| 0;
253			my $count = $self->{handle}->manakai_read_until (@_);
254			$self->{line_diff} = 0;
255			$self->{column_diff} = -1;
256			delete $self->{set_column};
257			delete $self->{has_cr};
258			for ($offset .. ($offset + $count - 1)) {
259	wakaba	1.8	my $c = $check_char->($self, ord substr $_[0], $_, 1);
260			if (defined $c) {
261			substr ($_[0], $_, 1) = $c;
262			}
263	wakaba	1.4	}
264	wakaba	1.6	return $count;
265	wakaba	1.5	} # manakai_read_until
266	wakaba	1.1
267			sub ungetc ($$) {
268			unshift @{$_[0]->{queue}}, chr int ($_[1] or 0);
269			} # ungetc
270
271			sub close ($) {
272			shift->{handle}->close;
273			} # close
274
275			sub charset ($) {
276			shift->{handle}->charset;
277			} # charset
278
279			sub has_bom ($) {
280			shift->{handle}->has_bom;
281			} # has_bom
282
283			sub input_encoding ($) {
284			shift->{handle}->input_encoding;
285			} # input_encoding
286
287			sub onerror ($;$) {
288			if (@_ > 1) {
289			if (defined $_[1]) {
290			$_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
291	wakaba	1.8	$_[0]->{onerror_set} = 1;
292	wakaba	1.1	} else {
293			$_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
294	wakaba	1.8	delete $_[0]->{onerror_set};
295	wakaba	1.1	}
296			}
297
298	wakaba	1.8	return $_[0]->{onerror_set} ? $_[0]->{onerror} : undef;
299	wakaba	1.1	} # onerror
300
301			1;