package Whatpm::HTML; use strict; our $VERSION=do{my @r=(q$Revision: 1.198 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r}; use Error qw(:try); ## NOTE: This module don't check all HTML5 parse errors; character ## encoding related parse errors are expected to be handled by relevant ## modules. ## Parse errors for control characters that are not allowed in HTML5 ## documents, for surrogate code points, and for noncharacter code ## points, as well as U+FFFD substitions for characters whose code points ## is higher than U+10FFFF may be detected by combining the parser with ## the checker implemented by Whatpm::Charset::UnicodeChecker (for its ## usage example, see |t/HTML-tree.t| in the Whatpm package or the ## WebHACC::Language::HTML module in the WebHACC package). ## ISSUE: ## var doc = implementation.createDocument (null, null, null); ## doc.write (''); ## alert (doc.compatMode); require IO::Handle; my $HTML_NS = q; my $MML_NS = q; my $SVG_NS = q; my $XLINK_NS = q; my $XML_NS = q; my $XMLNS_NS = q; sub A_EL () { 0b1 } sub ADDRESS_EL () { 0b10 } sub BODY_EL () { 0b100 } sub BUTTON_EL () { 0b1000 } sub CAPTION_EL () { 0b10000 } sub DD_EL () { 0b100000 } sub DIV_EL () { 0b1000000 } sub DT_EL () { 0b10000000 } sub FORM_EL () { 0b100000000 } sub FORMATTING_EL () { 0b1000000000 } sub FRAMESET_EL () { 0b10000000000 } sub HEADING_EL () { 0b100000000000 } sub HTML_EL () { 0b1000000000000 } sub LI_EL () { 0b10000000000000 } sub NOBR_EL () { 0b100000000000000 } sub OPTION_EL () { 0b1000000000000000 } sub OPTGROUP_EL () { 0b10000000000000000 } sub P_EL () { 0b100000000000000000 } sub SELECT_EL () { 0b1000000000000000000 } sub TABLE_EL () { 0b10000000000000000000 } sub TABLE_CELL_EL () { 0b100000000000000000000 } sub TABLE_ROW_EL () { 0b1000000000000000000000 } sub TABLE_ROW_GROUP_EL () { 0b10000000000000000000000 } sub MISC_SCOPING_EL () { 0b100000000000000000000000 } sub MISC_SPECIAL_EL () { 0b1000000000000000000000000 } sub FOREIGN_EL () { 0b10000000000000000000000000 } sub FOREIGN_FLOW_CONTENT_EL () { 0b100000000000000000000000000 } sub MML_AXML_EL () { 0b1000000000000000000000000000 } sub RUBY_EL () { 0b10000000000000000000000000000 } sub RUBY_COMPONENT_EL () { 0b100000000000000000000000000000 } sub TABLE_ROWS_EL () { TABLE_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL } ## NOTE: Used in "generate implied end tags" algorithm. ## NOTE: There is a code where a modified version of ## END_TAG_OPTIONAL_EL is used in "generate implied end tags" ## implementation (search for the algorithm name). sub END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | OPTION_EL | OPTGROUP_EL | P_EL | RUBY_COMPONENT_EL } ## NOTE: Used in and EOF algorithms. sub ALL_END_TAG_OPTIONAL_EL () { DD_EL | DT_EL | LI_EL | P_EL | ## ISSUE: option, optgroup, rt, rp? BODY_EL | HTML_EL | TABLE_CELL_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL } sub SCOPING_EL () { BUTTON_EL | CAPTION_EL | HTML_EL | TABLE_EL | TABLE_CELL_EL | MISC_SCOPING_EL } sub TABLE_SCOPING_EL () { HTML_EL | TABLE_EL } sub TABLE_ROWS_SCOPING_EL () { HTML_EL | TABLE_ROW_GROUP_EL } sub TABLE_ROW_SCOPING_EL () { HTML_EL | TABLE_ROW_EL } sub SPECIAL_EL () { ADDRESS_EL | BODY_EL | DIV_EL | DD_EL | DT_EL | LI_EL | P_EL | FORM_EL | FRAMESET_EL | HEADING_EL | SELECT_EL | TABLE_ROW_EL | TABLE_ROW_GROUP_EL | MISC_SPECIAL_EL } my $el_category = { a => A_EL | FORMATTING_EL, address => ADDRESS_EL, applet => MISC_SCOPING_EL, area => MISC_SPECIAL_EL, article => MISC_SPECIAL_EL, aside => MISC_SPECIAL_EL, b => FORMATTING_EL, base => MISC_SPECIAL_EL, basefont => MISC_SPECIAL_EL, bgsound => MISC_SPECIAL_EL, big => FORMATTING_EL, blockquote => MISC_SPECIAL_EL, body => BODY_EL, br => MISC_SPECIAL_EL, button => BUTTON_EL, caption => CAPTION_EL, center => MISC_SPECIAL_EL, col => MISC_SPECIAL_EL, colgroup => MISC_SPECIAL_EL, command => MISC_SPECIAL_EL, datagrid => MISC_SPECIAL_EL, dd => DD_EL, details => MISC_SPECIAL_EL, dialog => MISC_SPECIAL_EL, dir => MISC_SPECIAL_EL, div => DIV_EL, dl => MISC_SPECIAL_EL, dt => DT_EL, em => FORMATTING_EL, embed => MISC_SPECIAL_EL, eventsource => MISC_SPECIAL_EL, fieldset => MISC_SPECIAL_EL, figure => MISC_SPECIAL_EL, font => FORMATTING_EL, footer => MISC_SPECIAL_EL, form => FORM_EL, frame => MISC_SPECIAL_EL, frameset => FRAMESET_EL, h1 => HEADING_EL, h2 => HEADING_EL, h3 => HEADING_EL, h4 => HEADING_EL, h5 => HEADING_EL, h6 => HEADING_EL, head => MISC_SPECIAL_EL, header => MISC_SPECIAL_EL, hr => MISC_SPECIAL_EL, html => HTML_EL, i => FORMATTING_EL, iframe => MISC_SPECIAL_EL, img => MISC_SPECIAL_EL, #image => MISC_SPECIAL_EL, ## NOTE: Commented out in the spec. input => MISC_SPECIAL_EL, isindex => MISC_SPECIAL_EL, li => LI_EL, link => MISC_SPECIAL_EL, listing => MISC_SPECIAL_EL, marquee => MISC_SCOPING_EL, menu => MISC_SPECIAL_EL, meta => MISC_SPECIAL_EL, nav => MISC_SPECIAL_EL, nobr => NOBR_EL | FORMATTING_EL, noembed => MISC_SPECIAL_EL, noframes => MISC_SPECIAL_EL, noscript => MISC_SPECIAL_EL, object => MISC_SCOPING_EL, ol => MISC_SPECIAL_EL, optgroup => OPTGROUP_EL, option => OPTION_EL, p => P_EL, param => MISC_SPECIAL_EL, plaintext => MISC_SPECIAL_EL, pre => MISC_SPECIAL_EL, rp => RUBY_COMPONENT_EL, rt => RUBY_COMPONENT_EL, ruby => RUBY_EL, s => FORMATTING_EL, script => MISC_SPECIAL_EL, select => SELECT_EL, section => MISC_SPECIAL_EL, small => FORMATTING_EL, spacer => MISC_SPECIAL_EL, strike => FORMATTING_EL, strong => FORMATTING_EL, style => MISC_SPECIAL_EL, table => TABLE_EL, tbody => TABLE_ROW_GROUP_EL, td => TABLE_CELL_EL, textarea => MISC_SPECIAL_EL, tfoot => TABLE_ROW_GROUP_EL, th => TABLE_CELL_EL, thead => TABLE_ROW_GROUP_EL, title => MISC_SPECIAL_EL, tr => TABLE_ROW_EL, tt => FORMATTING_EL, u => FORMATTING_EL, ul => MISC_SPECIAL_EL, wbr => MISC_SPECIAL_EL, }; my $el_category_f = { $MML_NS => { 'annotation-xml' => MML_AXML_EL, mi => FOREIGN_FLOW_CONTENT_EL, mo => FOREIGN_FLOW_CONTENT_EL, mn => FOREIGN_FLOW_CONTENT_EL, ms => FOREIGN_FLOW_CONTENT_EL, mtext => FOREIGN_FLOW_CONTENT_EL, }, $SVG_NS => { foreignObject => FOREIGN_FLOW_CONTENT_EL | MISC_SCOPING_EL, desc => FOREIGN_FLOW_CONTENT_EL, title => FOREIGN_FLOW_CONTENT_EL, }, ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements. }; my $svg_attr_name = { attributename => 'attributeName', attributetype => 'attributeType', basefrequency => 'baseFrequency', baseprofile => 'baseProfile', calcmode => 'calcMode', clippathunits => 'clipPathUnits', contentscripttype => 'contentScriptType', contentstyletype => 'contentStyleType', diffuseconstant => 'diffuseConstant', edgemode => 'edgeMode', externalresourcesrequired => 'externalResourcesRequired', filterres => 'filterRes', filterunits => 'filterUnits', glyphref => 'glyphRef', gradienttransform => 'gradientTransform', gradientunits => 'gradientUnits', kernelmatrix => 'kernelMatrix', kernelunitlength => 'kernelUnitLength', keypoints => 'keyPoints', keysplines => 'keySplines', keytimes => 'keyTimes', lengthadjust => 'lengthAdjust', limitingconeangle => 'limitingConeAngle', markerheight => 'markerHeight', markerunits => 'markerUnits', markerwidth => 'markerWidth', maskcontentunits => 'maskContentUnits', maskunits => 'maskUnits', numoctaves => 'numOctaves', pathlength => 'pathLength', patterncontentunits => 'patternContentUnits', patterntransform => 'patternTransform', patternunits => 'patternUnits', pointsatx => 'pointsAtX', pointsaty => 'pointsAtY', pointsatz => 'pointsAtZ', preservealpha => 'preserveAlpha', preserveaspectratio => 'preserveAspectRatio', primitiveunits => 'primitiveUnits', refx => 'refX', refy => 'refY', repeatcount => 'repeatCount', repeatdur => 'repeatDur', requiredextensions => 'requiredExtensions', requiredfeatures => 'requiredFeatures', specularconstant => 'specularConstant', specularexponent => 'specularExponent', spreadmethod => 'spreadMethod', startoffset => 'startOffset', stddeviation => 'stdDeviation', stitchtiles => 'stitchTiles', surfacescale => 'surfaceScale', systemlanguage => 'systemLanguage', tablevalues => 'tableValues', targetx => 'targetX', targety => 'targetY', textlength => 'textLength', viewbox => 'viewBox', viewtarget => 'viewTarget', xchannelselector => 'xChannelSelector', ychannelselector => 'yChannelSelector', zoomandpan => 'zoomAndPan', }; my $foreign_attr_xname = { 'xlink:actuate' => [$XLINK_NS, ['xlink', 'actuate']], 'xlink:arcrole' => [$XLINK_NS, ['xlink', 'arcrole']], 'xlink:href' => [$XLINK_NS, ['xlink', 'href']], 'xlink:role' => [$XLINK_NS, ['xlink', 'role']], 'xlink:show' => [$XLINK_NS, ['xlink', 'show']], 'xlink:title' => [$XLINK_NS, ['xlink', 'title']], 'xlink:type' => [$XLINK_NS, ['xlink', 'type']], 'xml:base' => [$XML_NS, ['xml', 'base']], 'xml:lang' => [$XML_NS, ['xml', 'lang']], 'xml:space' => [$XML_NS, ['xml', 'space']], 'xmlns' => [$XMLNS_NS, [undef, 'xmlns']], 'xmlns:xlink' => [$XMLNS_NS, ['xmlns', 'xlink']], }; ## ISSUE: xmlns:xlink="non-xlink-ns" is not an error. my $charref_map = { 0x0D => 0x000A, 0x80 => 0x20AC, 0x81 => 0xFFFD, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152, 0x8D => 0xFFFD, 0x8E => 0x017D, 0x8F => 0xFFFD, 0x90 => 0xFFFD, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C, 0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014, 0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A, 0x9C => 0x0153, 0x9D => 0xFFFD, 0x9E => 0x017E, 0x9F => 0x0178, }; # $charref_map $charref_map->{$_} = 0xFFFD for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F, 0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF; ## TODO: Invoke the reset algorithm when a resettable element is ## created (cf. HTML5 revision 2259). sub parse_byte_string ($$$$;$) { my $self = shift; my $charset_name = shift; open my $input, '<', ref $_[0] ? $_[0] : \($_[0]); return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]); } # parse_byte_string sub parse_byte_stream ($$$$;$$) { # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_; my $self = ref $_[0] ? shift : shift->new; my $charset_name = shift; my $byte_stream = $_[0]; my $onerror = $_[2] || sub { my (%opt) = @_; warn "Parse error ($opt{type})\n"; }; $self->{parse_error} = $onerror; # updated later by parse_char_string my $get_wrapper = $_[3] || sub ($) { return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle }; ## HTML5 encoding sniffing algorithm require Message::Charset::Info; my $charset; my $buffer; my ($char_stream, $e_status); SNIFFING: { ## NOTE: By setting |allow_fallback| option true when the ## |get_decode_handle| method is invoked, we ignore what the HTML5 ## spec requires, i.e. unsupported encoding should be ignored. ## TODO: We should not do this unless the parser is invoked ## in the conformance checking mode, in which this behavior ## would be useful. ## Step 1 if (defined $charset_name) { $charset = Message::Charset::Info->get_by_html_name ($charset_name); ## TODO: Is this ok? Transfer protocol's parameter should be ## interpreted in its semantics? ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1); if ($char_stream) { $self->{confident} = 1; last SNIFFING; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset:not supported', layer => 'encode', line => 1, column => 1, value => $charset_name, level => $self->{level}->{uncertain}); } } ## Step 2 my $byte_buffer = ''; for (1..1024) { my $char = $byte_stream->getc; last unless defined $char; $byte_buffer .= $char; } ## TODO: timeout ## Step 3 if ($byte_buffer =~ /^\xFE\xFF/) { $charset = Message::Charset::Info->get_by_html_name ('utf-16be'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } elsif ($byte_buffer =~ /^\xFF\xFE/) { $charset = Message::Charset::Info->get_by_html_name ('utf-16le'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) { $charset = Message::Charset::Info->get_by_html_name ('utf-8'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $self->{confident} = 1; last SNIFFING; } ## Step 4 ## TODO: ## Step 5 ## TODO: from history ## Step 6 require Whatpm::Charset::UniversalCharDet; $charset_name = Whatpm::Charset::UniversalCharDet->detect_byte_string ($byte_buffer); if (defined $charset_name) { $charset = Message::Charset::Info->get_by_html_name ($charset_name); require Whatpm::Charset::DecodeHandle; $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new ($byte_stream); ($char_stream, $e_status) = $charset->get_decode_handle ($buffer, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); if ($char_stream) { $buffer->{buffer} = $byte_buffer; $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:chardet', text => $charset_name, level => $self->{level}->{info}, layer => 'encode', line => 1, column => 1); $self->{confident} = 0; last SNIFFING; } } ## Step 7: default ## TODO: Make this configurable. $charset = Message::Charset::Info->get_by_html_name ('windows-1252'); ## NOTE: We choose |windows-1252| here, since |utf-8| should be ## detectable in the step 6. require Whatpm::Charset::DecodeHandle; $buffer = Whatpm::Charset::DecodeHandle::ByteBuffer->new ($byte_stream); ($char_stream, $e_status) = $charset->get_decode_handle ($buffer, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \$byte_buffer); $buffer->{buffer} = $byte_buffer; $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:default', text => 'windows-1252', level => $self->{level}->{info}, line => 1, column => 1, layer => 'encode'); $self->{confident} = 0; } # SNIFFING if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name? $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback', #text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } elsif (not ($e_status & Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) { $self->{input_encoding} = $charset->get_iana_name; $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error', text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } else { $self->{input_encoding} = $charset->get_iana_name; } $self->{change_encoding} = sub { my $self = shift; $charset_name = shift; my $token = shift; $charset = Message::Charset::Info->get_by_html_name ($charset_name); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, allow_error_reporting => 1, allow_fallback => 1, byte_buffer => \ $buffer->{buffer}); if ($char_stream) { # if supported ## "Change the encoding" algorithm: ## Step 1 if ($charset->{category} & Message::Charset::Info::CHARSET_CATEGORY_UTF16 ()) { $charset = Message::Charset::Info->get_by_html_name ('utf-8'); ($char_stream, $e_status) = $charset->get_decode_handle ($byte_stream, byte_buffer => \ $buffer->{buffer}); } $charset_name = $charset->get_iana_name; ## Step 2 if (defined $self->{input_encoding} and $self->{input_encoding} eq $charset_name) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label:matching', text => $charset_name, level => $self->{level}->{info}); $self->{confident} = 1; return; } $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label detected', text => $self->{input_encoding}, value => $charset_name, level => $self->{level}->{warn}, token => $token); ## Step 3 # if (can) { ## change the encoding on the fly. #$self->{confident} = 1; #return; # } ## Step 4 throw Whatpm::HTML::RestartParser (); } }; # $self->{change_encoding} my $char_onerror = sub { my (undef, $type, %opt) = @_; $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode', line => $self->{line}, column => $self->{column} + 1, %opt, type => $type); if ($opt{octets}) { ${$opt{octets}} = "\x{FFFD}"; # relacement character } }; my $wrapped_char_stream = $get_wrapper->($char_stream); $wrapped_char_stream->onerror ($char_onerror); my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef; my $return; try { $return = $self->parse_char_stream ($wrapped_char_stream, @args); } catch Whatpm::HTML::RestartParser with { ## NOTE: Invoked after {change_encoding}. if ($e_status & Message::Charset::Info::FALLBACK_ENCODING_IMPL ()) { $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name? $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback', level => $self->{level}->{uncertain}, #text => $self->{input_encoding}, line => 1, column => 1, layer => 'encode'); } elsif (not ($e_status & Message::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) { $self->{input_encoding} = $charset->get_iana_name; $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error', text => $self->{input_encoding}, level => $self->{level}->{uncertain}, line => 1, column => 1, layer => 'encode'); } else { $self->{input_encoding} = $charset->get_iana_name; } $self->{confident} = 1; $wrapped_char_stream = $get_wrapper->($char_stream); $wrapped_char_stream->onerror ($char_onerror); $return = $self->parse_char_stream ($wrapped_char_stream, @args); }; return $return; } # parse_byte_stream ## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM ## and the HTML layer MUST ignore it. However, we does strip BOM in ## the encoding layer and the HTML layer does not ignore any U+FEFF, ## because the core part of our HTML parser expects a string of character, ## not a string of bytes or code units or anything which might contain a BOM. ## Therefore, any parser interface that accepts a string of bytes, ## such as |parse_byte_string| in this module, must ensure that it does ## strip the BOM and never strip any ZWNBSP. sub parse_char_string ($$$;$$) { #my ($self, $s, $doc, $onerror, $get_wrapper) = @_; my $self = shift; my $s = ref $_[0] ? $_[0] : \($_[0]); require Whatpm::Charset::DecodeHandle; my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s); return $self->parse_char_stream ($input, @_[1..$#_]); } # parse_char_string *parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility. sub parse_char_stream ($$$;$$) { my $self = ref $_[0] ? shift : shift->new; my $input = $_[0]; $self->{document} = $_[1]; @{$self->{document}->child_nodes} = (); ## NOTE: |set_inner_html| copies most of this method's code $self->{confident} = 1 unless exists $self->{confident}; $self->{document}->input_encoding ($self->{input_encoding}) if defined $self->{input_encoding}; ## TODO: |{input_encoding}| is needless? $self->{line_prev} = $self->{line} = 1; $self->{column_prev} = -1; $self->{column} = 0; $self->{set_nc} = sub { my $self = shift; my $char = ''; if (defined $self->{next_nc}) { $char = $self->{next_nc}; delete $self->{next_nc}; $self->{nc} = ord $char; } else { $self->{char_buffer} = ''; $self->{char_buffer_pos} = 0; my $count = $input->manakai_read_until ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos}); if ($count) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); return; } if ($input->read ($char, 1)) { $self->{nc} = ord $char; } else { $self->{nc} = -1; return; } } ($self->{line_prev}, $self->{column_prev}) = ($self->{line}, $self->{column}); $self->{column}++; if ($self->{nc} == 0x000A) { # LF $self->{line}++; $self->{column} = 0; } elsif ($self->{nc} == 0x000D) { # CR ## TODO: support for abort/streaming my $next = ''; if ($input->read ($next, 1) and $next ne "\x0A") { $self->{next_nc} = $next; } $self->{nc} = 0x000A; # LF # MUST $self->{line}++; $self->{column} = 0; } elsif ($self->{nc} == 0x0000) { # NULL $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST } }; $self->{read_until} = sub { #my ($scalar, $specials_range, $offset) = @_; return 0 if defined $self->{next_nc}; my $pattern = qr/[^$_[1]\x00\x0A\x0D]/; my $offset = $_[2] || 0; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { pos ($self->{char_buffer}) = $self->{char_buffer_pos}; if ($self->{char_buffer} =~ /\G(?>$pattern)+/) { substr ($_[0], $offset) = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]); my $count = $+[0] - $-[0]; if ($count) { $self->{column} += $count; $self->{char_buffer_pos} += $count; $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column} - 1; $self->{nc} = -1; } return $count; } else { return 0; } } else { my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]); if ($count) { $self->{column} += $count; $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column} - 1; $self->{nc} = -1; } return $count; } }; # $self->{read_until} my $onerror = $_[2] || sub { my (%opt) = @_; my $line = $opt{token} ? $opt{token}->{line} : $opt{line}; my $column = $opt{token} ? $opt{token}->{column} : $opt{column}; warn "Parse error ($opt{type}) at line $line column $column\n"; }; $self->{parse_error} = sub { $onerror->(line => $self->{line}, column => $self->{column}, @_); }; my $char_onerror = sub { my (undef, $type, %opt) = @_; $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode', line => $self->{line}, column => $self->{column} + 1, %opt, type => $type); }; # $char_onerror if ($_[3]) { $input = $_[3]->($input); $input->onerror ($char_onerror); } else { $input->onerror ($char_onerror) unless defined $input->onerror; } $self->_initialize_tokenizer; $self->_initialize_tree_constructor; $self->_construct_tree; $self->_terminate_tree_constructor; delete $self->{parse_error}; # remove loop return $self->{document}; } # parse_char_stream sub new ($) { my $class = shift; my $self = bless { level => {must => 'm', should => 's', warn => 'w', info => 'i', uncertain => 'u'}, }, $class; $self->{set_nc} = sub { $self->{nc} = -1; }; $self->{parse_error} = sub { # }; $self->{change_encoding} = sub { # if ($_[0] is a supported encoding) { # run "change the encoding" algorithm; # throw Whatpm::HTML::RestartParser (charset => $new_encoding); # } }; $self->{application_cache_selection} = sub { # }; return $self; } # new sub CM_ENTITY () { 0b001 } # & markup in data sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited) sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any) sub PLAINTEXT_CONTENT_MODEL () { 0 } sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP } sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP } sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP } sub DATA_STATE () { 0 } #sub ENTITY_DATA_STATE () { 1 } sub TAG_OPEN_STATE () { 2 } sub CLOSE_TAG_OPEN_STATE () { 3 } sub TAG_NAME_STATE () { 4 } sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 } sub ATTRIBUTE_NAME_STATE () { 6 } sub AFTER_ATTRIBUTE_NAME_STATE () { 7 } sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 } sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 } sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 } sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 } #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 } sub MARKUP_DECLARATION_OPEN_STATE () { 13 } sub COMMENT_START_STATE () { 14 } sub COMMENT_START_DASH_STATE () { 15 } sub COMMENT_STATE () { 16 } sub COMMENT_END_STATE () { 17 } sub COMMENT_END_DASH_STATE () { 18 } sub BOGUS_COMMENT_STATE () { 19 } sub DOCTYPE_STATE () { 20 } sub BEFORE_DOCTYPE_NAME_STATE () { 21 } sub DOCTYPE_NAME_STATE () { 22 } sub AFTER_DOCTYPE_NAME_STATE () { 23 } sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 } sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 } sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 } sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 } sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 } sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 } sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 } sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 } sub BOGUS_DOCTYPE_STATE () { 32 } sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 } sub SELF_CLOSING_START_TAG_STATE () { 34 } sub CDATA_SECTION_STATE () { 35 } sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec ## NOTE: "Entity data state", "entity in attribute value state", and ## "consume a character reference" algorithm are jointly implemented ## using the following six states: sub ENTITY_STATE () { 44 } sub ENTITY_HASH_STATE () { 45 } sub NCR_NUM_STATE () { 46 } sub HEXREF_X_STATE () { 47 } sub HEXREF_HEX_STATE () { 48 } sub ENTITY_NAME_STATE () { 49 } sub PCDATA_STATE () { 50 } # "data state" in the spec sub DOCTYPE_TOKEN () { 1 } sub COMMENT_TOKEN () { 2 } sub START_TAG_TOKEN () { 3 } sub END_TAG_TOKEN () { 4 } sub END_OF_FILE_TOKEN () { 5 } sub CHARACTER_TOKEN () { 6 } sub AFTER_HTML_IMS () { 0b100 } sub HEAD_IMS () { 0b1000 } sub BODY_IMS () { 0b10000 } sub BODY_TABLE_IMS () { 0b100000 } sub TABLE_IMS () { 0b1000000 } sub ROW_IMS () { 0b10000000 } sub BODY_AFTER_IMS () { 0b100000000 } sub FRAME_IMS () { 0b1000000000 } sub SELECT_IMS () { 0b10000000000 } sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 } ## NOTE: "in foreign content" insertion mode is special; it is combined ## with the secondary insertion mode. In this parser, they are stored ## together in the bit-or'ed form. ## NOTE: "initial" and "before html" insertion modes have no constants. ## NOTE: "after after body" insertion mode. sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS } ## NOTE: "after after frameset" insertion mode. sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS } sub IN_HEAD_IM () { HEAD_IMS | 0b00 } sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 } sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 } sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 } sub IN_BODY_IM () { BODY_IMS } sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 } sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 } sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 } sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 } sub IN_TABLE_IM () { TABLE_IMS } sub AFTER_BODY_IM () { BODY_AFTER_IMS } sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 } sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 } sub IN_SELECT_IM () { SELECT_IMS | 0b01 } sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 } sub IN_COLUMN_GROUP_IM () { 0b10 } ## Implementations MUST act as if state machine in the spec sub _initialize_tokenizer ($) { my $self = shift; $self->{state} = DATA_STATE; # MUST #$self->{s_kwd}; # state keyword - initialized when used #$self->{entity__value}; # initialized when used #$self->{entity__match}; # initialized when used $self->{content_model} = PCDATA_CONTENT_MODEL; # be undef $self->{ct}; # current token undef $self->{ca}; # current attribute undef $self->{last_stag_name}; # last emitted start tag name #$self->{prev_state}; # initialized when used delete $self->{self_closing}; $self->{char_buffer} = ''; $self->{char_buffer_pos} = 0; $self->{nc} = -1; # next input character #$self->{next_nc} if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } $self->{token} = []; # $self->{escape} } # _initialize_tokenizer ## A token has: ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN, ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN ## ->{name} (DOCTYPE_TOKEN) ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN) ## ->{pubid} (DOCTYPE_TOKEN) ## ->{sysid} (DOCTYPE_TOKEN) ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN) ## ->{name} ## ->{value} ## ->{has_reference} == 1 or 0 ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN) ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|. ## |->{self_closing}| is used to save the value of |$self->{self_closing}| ## while the token is pushed back to the stack. ## Emitted token MUST immediately be handled by the tree construction state. ## Before each step, UA MAY check to see if either one of the scripts in ## "list of scripts that will execute as soon as possible" or the first ## script in the "list of scripts that will execute asynchronously", ## has completed loading. If one has, then it MUST be executed ## and removed from the list. ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.) ## (This requirement was dropped from HTML5 spec, unfortunately.) my $is_space = { 0x0009 => 1, # CHARACTER TABULATION (HT) 0x000A => 1, # LINE FEED (LF) #0x000B => 0, # LINE TABULATION (VT) 0x000C => 1, # FORM FEED (FF) #0x000D => 1, # CARRIAGE RETURN (CR) 0x0020 => 1, # SPACE (SP) }; sub _get_next_token ($) { my $self = shift; if ($self->{self_closing}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); ## NOTE: The |self_closing| flag is only set by start tag token. ## In addition, when a start tag token is emitted, it is always set to ## |ct|. delete $self->{self_closing}; } if (@{$self->{token}}) { $self->{self_closing} = $self->{token}->[0]->{self_closing}; return shift @{$self->{token}}; } A: { if ($self->{state} == PCDATA_STATE) { ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model. if ($self->{nc} == 0x0026) { # & ## NOTE: In the spec, the tokenizer is switched to the ## "entity data state". In this implementation, the tokenizer ## is switched to the |ENTITY_STATE|, which is an implementation ## of the "consume a character reference" algorithm. $self->{entity_add} = -1; $self->{prev_state} = DATA_STATE; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{nc} == 0x003C) { # < $self->{state} = TAG_OPEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{nc} == -1) { return ({type => END_OF_FILE_TOKEN, line => $self->{line}, column => $self->{column}}); last A; ## TODO: ok? } else { # } # Anything else my $token = {type => CHARACTER_TOKEN, data => chr $self->{nc}, line => $self->{line}, column => $self->{column}, }; $self->{read_until}->($token->{data}, q[<&], length $token->{data}); ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($token); redo A; } elsif ($self->{state} == DATA_STATE) { $self->{s_kwd} = '' unless defined $self->{s_kwd}; if ($self->{nc} == 0x0026) { # & $self->{s_kwd} = ''; if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA not $self->{escape}) { ## NOTE: In the spec, the tokenizer is switched to the ## "entity data state". In this implementation, the tokenizer ## is switched to the |ENTITY_STATE|, which is an implementation ## of the "consume a character reference" algorithm. $self->{entity_add} = -1; $self->{prev_state} = DATA_STATE; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { # } } elsif ($self->{nc} == 0x002D) { # - if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA $self->{s_kwd} .= '-'; if ($self->{s_kwd} eq '