package Whatpm::HTML; use strict; our $VERSION=do{my @r=(q$Revision: 1.181 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; use Error qw(:try); ## NOTE: This module don't check all HTML5 parse errors; character ## encoding related parse errors are expected to be handled by relevant ## modules. ## Parse errors for control characters that are not allowed in HTML5 ## documents, for surrogate code points, and for noncharacter code ## points, as well as U+FFFD substitions for characters whose code points ## is higher than U+10FFFF may be detected by combining the parser with ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its ## usage example, see |t/HTML-tree.t| in the Whatpm package or the ## WebHACC::Language::HTML module in the WebHACC package). ## ISSUE: ## var doc = implementation.createDocument (null, null, null); ## doc.write (''); ## alert (doc.compatMode); require IO::Handle; my $HTML_NS = q; my $MML_NS = q; my $SVG_NS = q; my $XLINK_NS = q; my $XML_NS = q; my $XMLNS_NS = q; sub A_EL () { 0b1 } sub ADDRESS_EL () { 0b10 } sub BODY_EL () { 0b100 } sub BUTTON_EL () { 0b1000 } sub CAPTION_EL () { 0b10000 } sub DD_EL () { 0b100000 } sub DIV_EL () { 0b1000000 } sub DT_EL () { 0b10000000 } sub FORM_EL () { 0b100000000 } sub FORMATTING_EL () { 0b1000000000 } sub FRAMESET_EL () { 0b10000000000 } sub HEADING_EL () { 0b100000000000 } sub HTML_EL () { 0b1000000000000 } sub LI_EL () { 0b10000000000000 } sub NOBR_EL () { 0b100000000000000 } sub OPTION_EL () { 0b1000000000000000 } sub OPTGROUP_EL () { 0b10000000000000000 } sub P_EL () { 0b100000000000000000 } sub SELECT_EL () { 0b1000000000000000000 } sub TABLE_EL () { 0b10000000000000000000 } sub TABLE_CELL_EL () { 0b100000000000000000000 } sub TABLE_ROW_EL () { 0b1000000000000000000000 } sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 } sub MISC_SCOPING_EL () { 0b100000000000000000000000 } sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 } sub FOREIGN_EL () { 0b10000000000000000000000000 } sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 } sub MML_AXML_EL () { 0b1000000000000000000000000000 } sub RUBY_EL () { 0b10000000000000000000000000000 } sub RUBY_COMPONENT_EL () { 0b100000000000000000000000000000 } sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL } ## NOTE: Used in "generate implied end tags" algorithm. ## NOTE: There is a code where a modified version of END_TAG_OPTIONAL_EL ## is used in "generate implied end tags" implementation (search for the ## function mae). sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL | RUBY_COMPONENT_EL } ## NOTE: Used in and EOF algorithms. sub ALL_END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL | BODY_EL | HTML_EL | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL } sub SCOPING_EL () { BUTTON_EL | CAPTION_EL | HTML_EL | TABLE_EL | TABLE_CELL_EL | MISC_SCOPING_EL } sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL } sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL } sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL } sub SPECIAL_EL () { ADDRESS_EL | BODY_EL | DIV_EL | DD_EL | DT_EL | LI_EL | P_EL | FORM_EL | FRAMESET_EL | HEADING_EL | OPTION_EL | OPTGROUP_EL | SELECT_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL } my $el_category = { a => A_EL | FORMATTING_EL, address => ADDRESS_EL, applet => MISC_SCOPING_EL, area => MISC_SPECIAL_EL, b => FORMATTING_EL, base => MISC_SPECIAL_EL, basefont => MISC_SPECIAL_EL, bgsound => MISC_SPECIAL_EL, big => FORMATTING_EL, blockquote => MISC_SPECIAL_EL, body => BODY_EL, br => MISC_SPECIAL_EL, button => BUTTON_EL, caption => CAPTION_EL, center => MISC_SPECIAL_EL, col => MISC_SPECIAL_EL, colgroup => MISC_SPECIAL_EL, dd => DD_EL, dir => MISC_SPECIAL_EL, div => DIV_EL, dl => MISC_SPECIAL_EL, dt => DT_EL, em => FORMATTING_EL, embed => MISC_SPECIAL_EL, fieldset => MISC_SPECIAL_EL, font => FORMATTING_EL, form => FORM_EL, frame => MISC_SPECIAL_EL, frameset => FRAMESET_EL, h1 => HEADING_EL, h2 => HEADING_EL, h3 => HEADING_EL, h4 => HEADING_EL, h5 => HEADING_EL, h6 => HEADING_EL, head => MISC_SPECIAL_EL, hr => MISC_SPECIAL_EL, html => HTML_EL, i => FORMATTING_EL, iframe => MISC_SPECIAL_EL, img => MISC_SPECIAL_EL, input => MISC_SPECIAL_EL, isindex => MISC_SPECIAL_EL, li => LI_EL, link => MISC_SPECIAL_EL, listing => MISC_SPECIAL_EL, marquee => MISC_SCOPING_EL, menu => MISC_SPECIAL_EL, meta => MISC_SPECIAL_EL, nobr => NOBR_EL | FORMATTING_EL, noembed => MISC_SPECIAL_EL, noframes => MISC_SPECIAL_EL, noscript => MISC_SPECIAL_EL, object => MISC_SCOPING_EL, ol => MISC_SPECIAL_EL, optgroup => OPTGROUP_EL, option => OPTION_EL, p => P_EL, param => MISC_SPECIAL_EL, plaintext => MISC_SPECIAL_EL, pre => MISC_SPECIAL_EL, rp => RUBY_COMPONENT_EL, rt => RUBY_COMPONENT_EL, ruby => RUBY_EL, s => FORMATTING_EL, script => MISC_SPECIAL_EL, select => SELECT_EL, small => FORMATTING_EL, spacer => MISC_SPECIAL_EL, strike => FORMATTING_EL, strong => FORMATTING_EL, style => MISC_SPECIAL_EL, table => TABLE_EL, tbody => TABLE_ROW_GROUP_EL, td => TABLE_CELL_EL, textarea => MISC_SPECIAL_EL, tfoot => TABLE_ROW_GROUP_EL, th => TABLE_CELL_EL, thead => TABLE_ROW_GROUP_EL, title => MISC_SPECIAL_EL, tr => TABLE_ROW_EL, tt => FORMATTING_EL, u => FORMATTING_EL, ul => MISC_SPECIAL_EL, wbr => MISC_SPECIAL_EL, }; my $el_category_f = { $MML_NS => { 'annotation-xml' => MML_AXML_EL, mi => FOREIGN_FLOW_CONTENT_EL, mo => FOREIGN_FLOW_CONTENT_EL, mn => FOREIGN_FLOW_CONTENT_EL, ms => FOREIGN_FLOW_CONTENT_EL, mtext => FOREIGN_FLOW_CONTENT_EL, }, $SVG_NS => { foreignObject => FOREIGN_FLOW_CONTENT_EL, desc => FOREIGN_FLOW_CONTENT_EL, title => FOREIGN_FLOW_CONTENT_EL, }, ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements. }; my $svg_attr_name = { attributename => 'attributeName', attributetype => 'attributeType', basefrequency => 'baseFrequency', baseprofile => 'baseProfile', calcmode => 'calcMode', clippathunits => 'clipPathUnits', contentscripttype => 'contentScriptType', contentstyletype => 'contentStyleType', diffuseconstant => 'diffuseConstant', edgemode => 'edgeMode', externalresourcesrequired => 'externalResourcesRequired', filterres => 'filterRes', filterunits => 'filterUnits', glyphref => 'glyphRef', gradienttransform => 'gradientTransform', gradientunits => 'gradientUnits', kernelmatrix => 'kernelMatrix', kernelunitlength => 'kernelUnitLength', keypoints => 'keyPoints', keysplines => 'keySplines', keytimes => 'keyTimes', lengthadjust => 'lengthAdjust', limitingconeangle => 'limitingConeAngle', markerheight => 'markerHeight', markerunits => 'markerUnits', markerwidth => 'markerWidth', maskcontentunits => 'maskContentUnits', maskunits => 'maskUnits', numoctaves => 'numOctaves', pathlength => 'pathLength', patterncontentunits => 'patternContentUnits', patterntransform => 'patternTransform', patternunits => 'patternUnits', pointsatx => 'pointsAtX', pointsaty => 'pointsAtY', pointsatz => 'pointsAtZ', preservealpha => 'preserveAlpha', preserveaspectratio => 'preserveAspectRatio', primitiveunits => 'primitiveUnits', refx => 'refX', refy => 'refY', repeatcount => 'repeatCount', repeatdur => 'repeatDur', requiredextensions => 'requiredExtensions', requiredfeatures => 'requiredFeatures', specularconstant => 'specularConstant', specularexponent => 'specularExponent', spreadmethod => 'spreadMethod', startoffset => 'startOffset', stddeviation => 'stdDeviation', stitchtiles => 'stitchTiles', surfacescale => 'surfaceScale', systemlanguage => 'systemLanguage', tablevalues => 'tableValues', targetx => 'targetX', targety => 'targetY', textlength => 'textLength', viewbox => 'viewBox', viewtarget => 'viewTarget', xchannelselector => 'xChannelSelector', ychannelselector => 'yChannelSelector', zoomandpan => 'zoomAndPan', }; my $foreign_attr_xname = { 'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']], 'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']], 'xlink:href' => [$XLINK_NS, ['xlink', 'href']], 'xlink:role' => [$XLINK_NS, ['xlink', 'role']], 'xlink:show' => [$XLINK_NS, ['xlink', 'show']], 'xlink:title' => [$XLINK_NS, ['xlink', 'title']], 'xlink:type' => [$XLINK_NS, ['xlink', 'type']], 'xml:base' => [$XML_NS, ['xml', 'base']], 'xml:lang' => [$XML_NS, ['xml', 'lang']], 'xml:space' => [$XML_NS, ['xml', 'space']], 'xmlns' => [$XMLNS_NS, [undef, 'xmlns']], 'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']], }; ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error. my $c1_entity_char = { 0x80 => 0x20AC, 0x81 => 0xFFFD, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152, 0x8D => 0xFFFD, 0x8E => 0x017D, 0x8F => 0xFFFD, 0x90 => 0xFFFD, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C, 0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014, 0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A, 0x9C => 0x0153, 0x9D => 0xFFFD, 0x9E => 0x017E, 0x9F => 0x0178, }; # $c1_entity_char sub parse_byte_string ($$$$;$) { my $self = shift; my $charset_name = shift; open my $input, '<', ref $_[0] ? $_[0] : \($_[0]); return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]); } # parse_byte_string sub parse_byte_stream ($$$$;$$) { # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_; my $self = ref $_[0] ? shift : shift->new; my $charset_name = shift; my $byte_stream = $_[0]; my $onerror = $_[2] || sub { my (%opt) = @_; warn "Parse error ($opt{type})\n"; }; $self->{parse_error} = $onerror; # updated later by parse_char_string my $get_wrapper = $_[3] || sub ($) { return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle }; ## HTML5 encoding sniffing algorithm require Message::Charset::Info; my $charset; my $buffer; my ($char_stream, $e_status); SNIFFING: { ## NOTE: By setting |allow_fallback| option true when the ## |get_decode_handle| method is invoked, we ignore what the HTML5 ## spec requires, i.e. unsupported encoding should be ignored. ## TODO: We should not do this unless the parser is invoked ## in the conformance checking mode, in which this behavior ## would be useful. ## Step 1 if (defined $charset_name) { $charset = Message::Charset::Info->get_by_html_name ($charset_name); ## TODO: Is this ok? Transfer protocol's parameter should be ## interpreted in its semantics? ## ISSUE: Unsupported encoding is not ignored according to the spec. ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1); if ($char_stream) { $self->{confident} = 1; last SNIFFING; } else { ## TODO: unsupported error } } ## Step 2 my $byte_buffer = ''; for (1..1024) { my $char = $byte_stream->getc; last unless defined $char; $byte_buffer .= $char; } ## TODO: timeout ## Step 3 if ($byte_buffer =~ /^\xFE\xFF/) { $charset = Message::Charset::Info->get_by_html_name ('utf-16be'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } elsif ($byte_buffer =~ /^\xFF\xFE/) { $charset = Message::Charset::Info->get_by_html_name ('utf-16le'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { $charset = Message::Charset::Info->get_by_html_name ('utf-8'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } ## Step 4 ## TODO: ## Step 5 ## TODO: from history ## Step 6 require Whatpm::Charset::UniversalCharDet; $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string ($byte_buffer); if (defined $charset_name) { $charset = Message::Charset::Info->get_by_html_name ($charset_name); ## ISSUE: Unsupported encoding is not ignored according to the spec. require Whatpm::Charset::DecodeHandle; $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new ($byte_stream); ($char_stream, $e_status) = $charset->get_decode_handle ($buffer, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); if ($char_stream) { $buffer->{buffer} = $byte_buffer; $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:chardet', text => $charset_name, level => $self->{level}->{info}, layer => 'encode', line => 1, column => 1); $self->{confident} = 0; last SNIFFING; } } ## Step 7: default ## TODO: Make this configurable. $charset = Message::Charset::Info->get_by_html_name ('windows-1252'); ## NOTE: We choose |windows-1252| here, since |utf-8| should be ## detectable in the step 6. require Whatpm::Charset::DecodeHandle; $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new ($byte_stream); ($char_stream, $e_status) = $charset->get_decode_handle ($buffer, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $buffer->{buffer} = $byte_buffer; $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:default', text => 'windows-1252', level => $self->{level}->{info}, line => 1, column => 1, layer => 'encode'); $self->{confident} = 0; } # SNIFFING if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name? $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback', #text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } elsif (not ($e_status & Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) { $self->{input_encoding} = $charset->get_iana_name; $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error', text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } else { $self->{input_encoding} = $charset->get_iana_name; } $self->{change_encoding} = sub { my $self = shift; $charset_name = shift; my $token = shift; $charset = Message::Charset::Info->get_by_html_name ($charset_name); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \ $buffer->{buffer}); if ($char_stream) { # if supported ## "Change the encoding" algorithm: ## Step 1 if ($charset->{category} & Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) { $charset = Message::Charset::Info->get_by_html_name ('utf-8'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, byte_buffer => \ $buffer->{buffer}); } $charset_name = $charset->get_iana_name; ## Step 2 if (defined $self->{input_encoding} and $self->{input_encoding} eq $charset_name) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label:matching', text => $charset_name, level => $self->{level}->{info}); $self->{confident} = 1; return; } $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label detected', text => $self->{input_encoding}, value => $charset_name, level => $self->{level}->{warn}, token => $token); ## Step 3 # if (can) { ## change the encoding on the fly. #$self->{confident} = 1; #return; # } ## Step 4 throw Whatpm::HTML::RestartParser (); } }; # $self->{change_encoding} my $char_onerror = sub { my (undef, $type, %opt) = @_; $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode', line => $self->{line}, column => $self->{column} + 1, %opt, type => $type); if ($opt{octets}) { ${$opt{octets}} = "\x{FFFD}"; # relacement character } }; my $wrapped_char_stream = $get_wrapper->($char_stream); $wrapped_char_stream->onerror ($char_onerror); my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef; my $return; try { $return = $self->parse_char_stream ($wrapped_char_stream, @args); } catch Whatpm::HTML::RestartParser with { ## NOTE: Invoked after {change_encoding}. if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name? $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback', level => $self->{level}->{uncertain}, #text => $self->{input_encoding}, line => 1, column => 1, layer => 'encode'); } elsif (not ($e_status & Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) { $self->{input_encoding} = $charset->get_iana_name; $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error', text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } else { $self->{input_encoding} = $charset->get_iana_name; } $self->{confident} = 1; $wrapped_char_stream = $get_wrapper->($char_stream); $wrapped_char_stream->onerror ($char_onerror); $return = $self->parse_char_stream ($wrapped_char_stream, @args); }; return $return; } # parse_byte_stream ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM ## and the HTML layer MUST ignore it. However, we does strip BOM in ## the encoding layer and the HTML layer does not ignore any U+FEFF, ## because the core part of our HTML parser expects a string of character, ## not a string of bytes or code units or anything which might contain a BOM. ## Therefore, any parser interface that accepts a string of bytes, ## such as |parse_byte_string| in this module, must ensure that it does ## strip the BOM and never strip any ZWNBSP. sub parse_char_string ($$$;$$) { #my ($self, $s, $doc, $onerror, $get_wrapper) = @_; my $self = shift; my $s = ref $_[0] ? $_[0] : \($_[0]); require Whatpm::Charset::DecodeHandle; my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s); return $self->parse_char_stream ($input, @_[1..$#_]); } # parse_char_string *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility. sub parse_char_stream ($$$;$$) { my $self = ref $_[0] ? shift : shift->new; my $input = $_[0]; $self->{document} = $_[1]; @{$self->{document}->child_nodes} = (); ## NOTE: |set_inner_html| copies most of this method's code $self->{confident} = 1 unless exists $self->{confident}; $self->{document}->input_encoding ($self->{input_encoding}) if defined $self->{input_encoding}; ## TODO: |{input_encoding}| is needless? $self->{line_prev} = $self->{line} = 1; $self->{column_prev} = -1; $self->{column} = 0; $self->{set_nc} = sub { my $self = shift; my $char = ''; if (defined $self->{next_nc}) { $char = $self->{next_nc}; delete $self->{next_nc}; $self->{nc} = ord $char; } else { $self->{char_buffer} = ''; $self->{char_buffer_pos} = 0; my $count = $input->manakai_read_until ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos}); if ($count) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); return; } if ($input->read ($char, 1)) { $self->{nc} = ord $char; } else { $self->{nc} = -1; return; } } ($self->{line_prev}, $self->{column_prev}) = ($self->{line}, $self->{column}); $self->{column}++; if ($self->{nc} == 0x000A) { # LF $self->{line}++; $self->{column} = 0; } elsif ($self->{nc} == 0x000D) { # CR ## TODO: support for abort/streaming my $next = ''; if ($input->read ($next, 1) and $next ne "\x0A") { $self->{next_nc} = $next; } $self->{nc} = 0x000A; # LF # MUST $self->{line}++; $self->{column} = 0; } elsif ($self->{nc} == 0x0000) { # NULL $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST } }; $self->{read_until} = sub { #my ($scalar, $specials_range, $offset) = @_; return 0 if defined $self->{next_nc}; my $pattern = qr/[^$_[1]\x00\x0A\x0D]/; my $offset = $_[2] || 0; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { pos ($self->{char_buffer}) = $self->{char_buffer_pos}; if ($self->{char_buffer} =~ /\G(?>$pattern)+/) { substr ($_[0], $offset) = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]); my $count = $+[0] - $-[0]; if ($count) { $self->{column} += $count; $self->{char_buffer_pos} += $count; $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column} - 1; $self->{nc} = -1; } return $count; } else { return 0; } } else { my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]); if ($count) { $self->{column} += $count; $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column} - 1; $self->{nc} = -1; } return $count; } }; # $self->{read_until} my $onerror = $_[2] || sub { my (%opt) = @_; my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; my $column = $opt{token} ? $opt{token}->{column} : $opt{column}; warn "Parse error ($opt{type}) at line $line column $column\n"; }; $self->{parse_error} = sub { $onerror->(line => $self->{line}, column => $self->{column}, @_); }; my $char_onerror = sub { my (undef, $type, %opt) = @_; $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode', line => $self->{line}, column => $self->{column} + 1, %opt, type => $type); }; # $char_onerror if ($_[3]) { $input = $_[3]->($input); $input->onerror ($char_onerror); } else { $input->onerror ($char_onerror) unless defined $input->onerror; } $self->_initialize_tokenizer; $self->_initialize_tree_constructor; $self->_construct_tree; $self->_terminate_tree_constructor; delete $self->{parse_error}; # remove loop return $self->{document}; } # parse_char_stream sub new ($) { my $class = shift; my $self = bless { level => {must => 'm', should => 's', warn => 'w', info => 'i', uncertain => 'u'}, }, $class; $self->{set_nc} = sub { $self->{nc} = -1; }; $self->{parse_error} = sub { # }; $self->{change_encoding} = sub { # if ($_[0] is a supported encoding) { # run "change the encoding" algorithm; # throw Whatpm::HTML::RestartParser (charset => $new_encoding); # } }; $self->{application_cache_selection} = sub { # }; return $self; } # new sub CM_ENTITY () { 0b001 } # & markup in data sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited) sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any) sub PLAINTEXT_CONTENT_MODEL () { 0 } sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } sub DATA_STATE () { 0 } #sub ENTITY_DATA_STATE () { 1 } sub TAG_OPEN_STATE () { 2 } sub CLOSE_TAG_OPEN_STATE () { 3 } sub TAG_NAME_STATE () { 4 } sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 } sub ATTRIBUTE_NAME_STATE () { 6 } sub AFTER_ATTRIBUTE_NAME_STATE () { 7 } sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 } sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } sub MARKUP_DECLARATION_OPEN_STATE () { 13 } sub COMMENT_START_STATE () { 14 } sub COMMENT_START_DASH_STATE () { 15 } sub COMMENT_STATE () { 16 } sub COMMENT_END_STATE () { 17 } sub COMMENT_END_DASH_STATE () { 18 } sub BOGUS_COMMENT_STATE () { 19 } sub DOCTYPE_STATE () { 20 } sub BEFORE_DOCTYPE_NAME_STATE () { 21 } sub DOCTYPE_NAME_STATE () { 22 } sub AFTER_DOCTYPE_NAME_STATE () { 23 } sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 } sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 } sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 } sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 } sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 } sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 } sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } sub BOGUS_DOCTYPE_STATE () { 32 } sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } sub SELF_CLOSING_START_TAG_STATE () { 34 } sub CDATA_SECTION_STATE () { 35 } sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec ## NOTE: "Entity data state", "entity in attribute value state", and ## "consume a character reference" algorithm are jointly implemented ## using the following six states: sub ENTITY_STATE () { 44 } sub ENTITY_HASH_STATE () { 45 } sub NCR_NUM_STATE () { 46 } sub HEXREF_X_STATE () { 47 } sub HEXREF_HEX_STATE () { 48 } sub ENTITY_NAME_STATE () { 49 } sub PCDATA_STATE () { 50 } # "data state" in the spec sub DOCTYPE_TOKEN () { 1 } sub COMMENT_TOKEN () { 2 } sub START_TAG_TOKEN () { 3 } sub END_TAG_TOKEN () { 4 } sub END_OF_FILE_TOKEN () { 5 } sub CHARACTER_TOKEN () { 6 } sub AFTER_HTML_IMS () { 0b100 } sub HEAD_IMS () { 0b1000 } sub BODY_IMS () { 0b10000 } sub BODY_TABLE_IMS () { 0b100000 } sub TABLE_IMS () { 0b1000000 } sub ROW_IMS () { 0b10000000 } sub BODY_AFTER_IMS () { 0b100000000 } sub FRAME_IMS () { 0b1000000000 } sub SELECT_IMS () { 0b10000000000 } sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } ## NOTE: "in foreign content" insertion mode is special; it is combined ## with the secondary insertion mode. In this parser, they are stored ## together in the bit-or'ed form. ## NOTE: "initial" and "before html" insertion modes have no constants. ## NOTE: "after after body" insertion mode. sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS } ## NOTE: "after after frameset" insertion mode. sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS } sub IN_HEAD_IM () { HEAD_IMS | 0b00 } sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 } sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 } sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 } sub IN_BODY_IM () { BODY_IMS } sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 } sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 } sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 } sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 } sub IN_TABLE_IM () { TABLE_IMS } sub AFTER_BODY_IM () { BODY_AFTER_IMS } sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 } sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 } sub IN_SELECT_IM () { SELECT_IMS | 0b01 } sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 } sub IN_COLUMN_GROUP_IM () { 0b10 } ## Implementations MUST act as if state machine in the spec sub _initialize_tokenizer ($) { my $self = shift; $self->{state} = DATA_STATE; # MUST #$self->{s_kwd}; # state keyword - initialized when used #$self->{entity__value}; # initialized when used #$self->{entity__match}; # initialized when used $self->{content_model} = PCDATA_CONTENT_MODEL; # be undef $self->{ct}; # current token undef $self->{ca}; # current attribute undef $self->{last_stag_name}; # last emitted start tag name #$self->{prev_state}; # initialized when used delete $self->{self_closing}; $self->{char_buffer} = ''; $self->{char_buffer_pos} = 0; $self->{nc} = -1; # next input character #$self->{next_nc} if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } $self->{token} = []; # $self->{escape} } # _initialize_tokenizer ## A token has: ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN ## ->{name} (DOCTYPE_TOKEN) ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) ## ->{pubid} (DOCTYPE_TOKEN) ## ->{sysid} (DOCTYPE_TOKEN) ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) ## ->{name} ## ->{value} ## ->{has_reference} == 1 or 0 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. ## |->{self_closing}| is used to save the value of |$self->{self_closing}| ## while the token is pushed back to the stack. ## Emitted token MUST immediately be handled by the tree construction state. ## Before each step, UA MAY check to see if either one of the scripts in ## "list of scripts that will execute as soon as possible" or the first ## script in the "list of scripts that will execute asynchronously", ## has completed loading. If one has, then it MUST be executed ## and removed from the list. ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## (This requirement was dropped from HTML5 spec, unfortunately.) sub _get_next_token ($) { my $self = shift; if ($self->{self_closing}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); ## NOTE: The |self_closing| flag is only set by start tag token. ## In addition, when a start tag token is emitted, it is always set to ## |ct|. delete $self->{self_closing}; } if (@{$self->{token}}) { $self->{self_closing} = $self->{token}->[0]->{self_closing}; return shift @{$self->{token}}; } A: { if ($self->{state} == PCDATA_STATE) { ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model. if ($self->{nc} == 0x0026) { # & ## NOTE: In the spec, the tokenizer is switched to the ## "entity data state". In this implementation, the tokenizer ## is switched to the |ENTITY_STATE|, which is an implementation ## of the "consume a character reference" algorithm. $self->{entity_add} = -1; $self->{prev_state} = DATA_STATE; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{nc} == 0x003C) { # < $self->{state} = TAG_OPEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{nc} == -1) { return ({type => END_OF_FILE_TOKEN, line => $self->{line}, column => $self->{column}}); last A; ## TODO: ok? } else { # } # Anything else my $token = {type => CHARACTER_TOKEN, data => chr $self->{nc}, line => $self->{line}, column => $self->{column}, }; $self->{read_until}->($token->{data}, q[<&], length $token->{data}); ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($token); redo A; } elsif ($self->{state} == DATA_STATE) { $self->{s_kwd} = '' unless defined $self->{s_kwd}; if ($self->{nc} == 0x0026) { # & $self->{s_kwd} = ''; if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA not $self->{escape}) { ## NOTE: In the spec, the tokenizer is switched to the ## "entity data state". In this implementation, the tokenizer ## is switched to the |ENTITY_STATE|, which is an implementation ## of the "consume a character reference" algorithm. $self->{entity_add} = -1; $self->{prev_state} = DATA_STATE; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { # } } elsif ($self->{nc} == 0x002D) { # - if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA $self->{s_kwd} .= '-'; if ($self->{s_kwd} eq '