package Whatpm::Charset::UnicodeChecker; use strict; ## NOTE: For more information (including rationals of checks performed ## in this module), see ## . ## NOTE: Unicode's definition for character string conformance is ## very, very vague so that it is difficult to determine what error ## level is appropriate for each error. The Unicode Standard abuses ## conformance-creteria-like terms such as "deprecated", "discouraged", ## "preferred", "better", "not encouraged", "should", and so on with no ## clear explanation of their difference (if any) or relationship to ## the conformance. In fact, that specification does not define the ## conformance class for character strings. sub new_handle ($$) { my $self = bless { queue => [], new_queue => [], onerror => sub {}, level => { unicode_should => 'w', unicode_deprecated => 'w', # = unicode_should unicode_discouraged => 'w', unicode_preferred => 'w', ## NOTE: We do some "unification" of levels - for example, ## "not encouraged" is unified with "discouraged" and ## "better" is unified with "preferred". }, }, shift; $self->{handle} = shift; # char stream return $self; } # new_handle my $etypes = { 0x0340 => 'unicode deprecated', 0x0341 => 'unicode deprecated', 0x17A3 => 'unicode deprecated', 0x17D3 => 'unicode deprecated', 0x206A => 'unicode deprecated', 0x206B => 'unicode deprecated', 0x206C => 'unicode deprecated', 0x206D => 'unicode deprecated', 0x206E => 'unicode deprecated', 0x206F => 'unicode deprecated', 0xE0001 => 'unicode deprecated', 0xFFFE => 'nonchar', 0xFFFF => 'nonchar', 0x1FFFE => 'nonchar', 0x1FFFF => 'nonchar', 0x2FFFE => 'nonchar', 0x2FFFF => 'nonchar', 0x3FFFE => 'nonchar', 0x3FFFF => 'nonchar', 0x4FFFE => 'nonchar', 0x4FFFF => 'nonchar', 0x5FFFE => 'nonchar', 0x5FFFF => 'nonchar', 0x6FFFE => 'nonchar', 0x6FFFF => 'nonchar', 0x7FFFE => 'nonchar', 0x7FFFF => 'nonchar', 0x8FFFE => 'nonchar', 0x8FFFF => 'nonchar', 0x9FFFE => 'nonchar', 0x9FFFF => 'nonchar', 0xAFFFE => 'nonchar', 0xAFFFF => 'nonchar', 0xBFFFE => 'nonchar', 0xBFFFF => 'nonchar', 0xCFFFE => 'nonchar', 0xCFFFF => 'nonchar', 0xDFFFE => 'nonchar', 0xDFFFF => 'nonchar', 0xEFFFE => 'nonchar', 0xEFFFF => 'nonchar', 0xFFFFE => 'nonchar', 0xFFFFF => 'nonchar', 0x10FFFE => 'nonchar', 0x10FFFF => 'nonchar', 0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS 0x03D3 => 'unicode should', # GREEK UPSILON WITH ... 0x03D4 => 'unicode should', # GREEK UPSILON WITH ... 0x20A4 => 'unicode should', # LIRA SIGN 0x2126 => 'unicode should', # OHM SIGN # also, discouraged 0x212A => 'unicode should', # KELVIN SIGN 0x212B => 'unicode should', # ANGSTROM SIGN ## Styled overlines/underlines in CJK Compatibility Forms 0xFE49 => 'unicode discouraged', 0xFE4A => 'unicode discouraged', 0xFE4B => 'unicode discouraged', 0xFE4C => 'unicode discouraged', 0xFE4D => 'unicode discouraged', 0xFE4E => 'unicode discouraged', 0xFE4F => 'unicode discouraged', 0x037E => 'unicode discouraged', # greek punctuations 0x0387 => 'unicode discouraged', # greek punctuations #0x17A3 => 'unicode discouraged', # also, deprecated 0x17A4 => 'unicode discouraged', 0x17B4 => 'unicode discouraged', 0x17B5 => 'unicode discouraged', 0x17D8 => 'unicode discouraged', 0x2121 => 'unicode discouraged', # tel 0x213B => 'unicode discouraged', # fax #0x2120 => 'unicode discouraged', # SM (superscript) #0x2122 => 'unicode discouraged', # TM (superscript) ## inline annotations 0xFFF9 => 'unicode discouraged', 0xFFFA => 'unicode discouraged', 0xFFFB => 'unicode discouraged', ## greek punctuations 0x055A => 'unicode not preferred', 0x0559 => 'unicode not preferred', ## degree signs 0x2103 => 'unicode not preferred', 0x2109 => 'unicode not preferred', ## strongly preferrs U+2060 WORD JOINTER 0xFEFE => 'unicode not preferred', }; $etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F; $etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF; $etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9; $etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3; $etypes->{$_} = 'unicode should' for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F, 0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74, 0xFE76 .. 0xFEFC; ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where ## no character is assigned, noncharacter code points, and ## U+FD3E and U+FD3F, which are explicitly allowed. $etypes->{$_} = 'unicode discouraged' for 0x2153 .. 0x217F; my $levels = { 'unicode deprecated' => 'unicode_deprecated', 'nonchar' => 'unicode_should', 'unicode should' => 'unicode_should', 'unicode discouraged' => 'unicode_discouraged', 'unicode not preferred' => 'unicode_preferred', }; my $check_char = sub ($$) { my ($self, $char_code) = @_; if ($char_code == 0x000D) { $self->{line_diff}++; $self->{column_diff} = 0; $self->{set_column} = 1; $self->{has_cr} = 1; return; } elsif ($char_code == 0x000A) { if ($self->{has_cr}) { delete $self->{has_cr}; } else { $self->{line_diff}++; $self->{column_diff} = 0; $self->{set_column} = 1; } return; } else { $self->{column_diff}++; delete $self->{has_cr}; } ## TODO: $char_code > U+10FFFF my $etype = $etypes->{$char_code}; if (defined $etype) { $self->{onerror}->(type => $etype, text => (sprintf 'U+%04X', $char_code), layer => 'charset', level => $self->{level}->{$levels->{$etype}}, line_diff => $self->{line_diff}, column_diff => $self->{column_diff}, ($self->{set_column} ? (column => 1) : ())); } ## TODO: "khanda ta" should be represented by U+09CE ## rather than . ## TODO: IDS syntax ## TODO: langtag syntax }; # $check_char sub read ($$$;$) { my $self = shift; my $offset = $_[2] || 0; my $count = $self->{handle}->read (@_); $self->{line_diff} = 0; $self->{column_diff} = -1; delete $self->{set_column}; delete $self->{has_cr}; for ($offset .. ($offset + $count - 1)) { $check_char->($self, ord substr $_[0], $_, 1); } return $count; } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = shift; my $offset = $_[2] || 0; my $count = $self->{handle}->manakai_read_until (@_); $self->{line_diff} = 0; $self->{column_diff} = -1; delete $self->{set_column}; delete $self->{has_cr}; for ($offset .. ($offset + $count - 1)) { $check_char->($self, ord substr $_[0], $_, 1); } return $count; } # manakai_read_until sub ungetc ($$) { unshift @{$_[0]->{queue}}, chr int ($_[1] or 0); } # ungetc sub close ($) { shift->{handle}->close; } # close sub charset ($) { shift->{handle}->charset; } # charset sub has_bom ($) { shift->{handle}->has_bom; } # has_bom sub input_encoding ($) { shift->{handle}->input_encoding; } # input_encoding sub onerror ($;$) { if (@_ > 1) { if (defined $_[1]) { $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]); } else { $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {}); } } return $_[0]->{onerror}; } # onerror 1;