/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.111 by wakaba, Tue Mar 11 14:02:08 2008 UTC | revision 1.112 by wakaba, Sun Mar 16 06:39:57 2008 UTC
# Line 177  sub parse_string ($$$;$) { Line 177  sub parse_string ($$$;$) {
177        if defined $self->{input_encoding};        if defined $self->{input_encoding};
178    
179    my $i = 0;    my $i = 0;
180    my $line = 1;    $self->{line_prev} = $self->{line} = 1;
181    my $column = 0;    $self->{column_prev} = $self->{column} = 0;
182    $self->{set_next_char} = sub {    $self->{set_next_char} = sub {
183      my $self = shift;      my $self = shift;
184    
# Line 187  sub parse_string ($$$;$) { Line 187  sub parse_string ($$$;$) {
187    
188      $self->{next_char} = -1 and return if $i >= length $$s;      $self->{next_char} = -1 and return if $i >= length $$s;
189      $self->{next_char} = ord substr $$s, $i++, 1;      $self->{next_char} = ord substr $$s, $i++, 1;
190      $column++;  
191        ($self->{line_prev}, $self->{column_prev})
192            = ($self->{line}, $self->{column});
193        $self->{column}++;
194            
195      if ($self->{next_char} == 0x000A) { # LF      if ($self->{next_char} == 0x000A) { # LF
196        $line++;        $self->{line}++;
197        $column = 0;        $self->{column} = 0;
198      } elsif ($self->{next_char} == 0x000D) { # CR      } elsif ($self->{next_char} == 0x000D) { # CR
199        $i++ if substr ($$s, $i, 1) eq "\x0A";        $i++ if substr ($$s, $i, 1) eq "\x0A";
200        $self->{next_char} = 0x000A; # LF # MUST        $self->{next_char} = 0x000A; # LF # MUST
201        $line++;        $self->{line}++;
202        $column = 0;        $self->{column} = 0;
203      } elsif ($self->{next_char} > 0x10FFFF) {      } elsif ($self->{next_char} > 0x10FFFF) {
204        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST        $self->{next_char} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
205      } elsif ($self->{next_char} == 0x0000) { # NULL      } elsif ($self->{next_char} == 0x0000) { # NULL
# Line 209  sub parse_string ($$$;$) { Line 212  sub parse_string ($$$;$) {
212    
213    my $onerror = $_[2] || sub {    my $onerror = $_[2] || sub {
214      my (%opt) = @_;      my (%opt) = @_;
215      warn "Parse error ($opt{type}) at line $opt{line} column $opt{column}\n";      my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
216        my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
217        warn "Parse error ($opt{type}) at line $line column $column\n";
218    };    };
219    $self->{parse_error} = sub {    $self->{parse_error} = sub {
220      $onerror->(@_, line => $line, column => $column);      $onerror->(line => $self->{line}, column => $self->{column}, @_);
221    };    };
222    
223    $self->_initialize_tokenizer;    $self->_initialize_tokenizer;
# Line 220  sub parse_string ($$$;$) { Line 225  sub parse_string ($$$;$) {
225    $self->_construct_tree;    $self->_construct_tree;
226    $self->_terminate_tree_constructor;    $self->_terminate_tree_constructor;
227    
228      delete $self->{parse_error}; # remove loop
229    
230    return $self->{document};    return $self->{document};
231  } # parse_string  } # parse_string
232    
# Line 449  sub _get_next_token ($) { Line 456  sub _get_next_token ($) {
456          #          #
457        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
458          !!!cp (11);          !!!cp (11);
459          !!!emit ({type => END_OF_FILE_TOKEN});          !!!emit ({type => END_OF_FILE_TOKEN,
460                      line => $self->{line}, column => $self->{column}});
461          last A; ## TODO: ok?          last A; ## TODO: ok?
462        } else {        } else {
463          !!!cp (12);          !!!cp (12);
464        }        }
465        # Anything else        # Anything else
466        my $token = {type => CHARACTER_TOKEN,        my $token = {type => CHARACTER_TOKEN,
467                     data => chr $self->{next_char}};                     data => chr $self->{next_char},
468                       line => $self->{line}, column => $self->{column}};
469        ## Stay in the data state        ## Stay in the data state
470        !!!next-input-character;        !!!next-input-character;
471    
# Line 465  sub _get_next_token ($) { Line 474  sub _get_next_token ($) {
474        redo A;        redo A;
475      } elsif ($self->{state} == ENTITY_DATA_STATE) {      } elsif ($self->{state} == ENTITY_DATA_STATE) {
476        ## (cannot happen in CDATA state)        ## (cannot happen in CDATA state)
477    
478          my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
479                
480        my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);        my $token = $self->_tokenize_attempt_to_consume_an_entity (0, -1);
481    
# Line 473  sub _get_next_token ($) { Line 484  sub _get_next_token ($) {
484    
485        unless (defined $token) {        unless (defined $token) {
486          !!!cp (13);          !!!cp (13);
487          !!!emit ({type => CHARACTER_TOKEN, data => '&'});          !!!emit ({type => CHARACTER_TOKEN, data => '&',
488                      line => $l, column => $c});
489        } else {        } else {
490          !!!cp (14);          !!!cp (14);
491          !!!emit ($token);          !!!emit ($token);
# Line 492  sub _get_next_token ($) { Line 504  sub _get_next_token ($) {
504            ## reconsume            ## reconsume
505            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
506    
507            !!!emit ({type => CHARACTER_TOKEN, data => '<'});            !!!emit ({type => CHARACTER_TOKEN, data => '<',
508                        line => $self->{line_prev},
509                        column => $self->{column_prev}});
510    
511            redo A;            redo A;
512          }          }
# Line 512  sub _get_next_token ($) { Line 526  sub _get_next_token ($) {
526            !!!cp (19);            !!!cp (19);
527            $self->{current_token}            $self->{current_token}
528              = {type => START_TAG_TOKEN,              = {type => START_TAG_TOKEN,
529                 tag_name => chr ($self->{next_char} + 0x0020)};                 tag_name => chr ($self->{next_char} + 0x0020),
530                   line => $self->{line_prev},
531                   column => $self->{column_prev}};
532            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
533            !!!next-input-character;            !!!next-input-character;
534            redo A;            redo A;
# Line 520  sub _get_next_token ($) { Line 536  sub _get_next_token ($) {
536                   $self->{next_char} <= 0x007A) { # a..z                   $self->{next_char} <= 0x007A) { # a..z
537            !!!cp (20);            !!!cp (20);
538            $self->{current_token} = {type => START_TAG_TOKEN,            $self->{current_token} = {type => START_TAG_TOKEN,
539                              tag_name => chr ($self->{next_char})};                                      tag_name => chr ($self->{next_char}),
540                                        line => $self->{line_prev},
541                                        column => $self->{column_prev}};
542            $self->{state} = TAG_NAME_STATE;            $self->{state} = TAG_NAME_STATE;
543            !!!next-input-character;            !!!next-input-character;
544            redo A;            redo A;
# Line 530  sub _get_next_token ($) { Line 548  sub _get_next_token ($) {
548            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
549            !!!next-input-character;            !!!next-input-character;
550    
551            !!!emit ({type => CHARACTER_TOKEN, data => '<>'});            !!!emit ({type => CHARACTER_TOKEN, data => '<>',
552                        line => $self->{line_prev},
553                        column => $self->{column_prev}});
554    
555            redo A;            redo A;
556          } elsif ($self->{next_char} == 0x003F) { # ?          } elsif ($self->{next_char} == 0x003F) { # ?
557            !!!cp (22);            !!!cp (22);
558            !!!parse-error (type => 'pio');            !!!parse-error (type => 'pio');
559            $self->{state} = BOGUS_COMMENT_STATE;            $self->{state} = BOGUS_COMMENT_STATE;
560              $self->{current_token} = {type => COMMENT_TOKEN, data => '',
561                                        line => $self->{line_prev},
562                                        column => $self->{column_prev}};
563            ## $self->{next_char} is intentionally left as is            ## $self->{next_char} is intentionally left as is
564            redo A;            redo A;
565          } else {          } else {
# Line 545  sub _get_next_token ($) { Line 568  sub _get_next_token ($) {
568            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
569            ## reconsume            ## reconsume
570    
571            !!!emit ({type => CHARACTER_TOKEN, data => '<'});            !!!emit ({type => CHARACTER_TOKEN, data => '<',
572                        line => $self->{line_prev},
573                        column => $self->{column_prev}});
574    
575            redo A;            redo A;
576          }          }
# Line 553  sub _get_next_token ($) { Line 578  sub _get_next_token ($) {
578          die "$0: $self->{content_model} in tag open";          die "$0: $self->{content_model} in tag open";
579        }        }
580      } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {      } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
581          my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
582        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA        if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
583          if (defined $self->{last_emitted_start_tag_name}) {          if (defined $self->{last_emitted_start_tag_name}) {
584    
585            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>            ## NOTE: <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>
586            my @next_char;            my @next_char;
587            TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {            TAGNAME: for (my $i = 0; $i < length $self->{last_emitted_start_tag_name}; $i++) {
# Line 571  sub _get_next_token ($) { Line 598  sub _get_next_token ($) {
598                !!!back-next-input-character (@next_char);                !!!back-next-input-character (@next_char);
599                $self->{state} = DATA_STATE;                $self->{state} = DATA_STATE;
600    
601                !!!emit ({type => CHARACTER_TOKEN, data => '</'});                !!!emit ({type => CHARACTER_TOKEN, data => '</',
602                            line => $l, column => $c});
603        
604                redo A;                redo A;
605              }              }
# Line 590  sub _get_next_token ($) { Line 618  sub _get_next_token ($) {
618              $self->{next_char} = shift @next_char; # reconsume              $self->{next_char} = shift @next_char; # reconsume
619              !!!back-next-input-character (@next_char);              !!!back-next-input-character (@next_char);
620              $self->{state} = DATA_STATE;              $self->{state} = DATA_STATE;
621              !!!emit ({type => CHARACTER_TOKEN, data => '</'});              !!!emit ({type => CHARACTER_TOKEN, data => '</',
622                          line => $l, column => $c});
623              redo A;              redo A;
624            } else {            } else {
625              !!!cp (27);              !!!cp (27);
# Line 603  sub _get_next_token ($) { Line 632  sub _get_next_token ($) {
632            !!!cp (28);            !!!cp (28);
633            # next-input-character is already done            # next-input-character is already done
634            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
635            !!!emit ({type => CHARACTER_TOKEN, data => '</'});            !!!emit ({type => CHARACTER_TOKEN, data => '</',
636                        line => $l, column => $c});
637            redo A;            redo A;
638          }          }
639        }        }
# Line 611  sub _get_next_token ($) { Line 641  sub _get_next_token ($) {
641        if (0x0041 <= $self->{next_char} and        if (0x0041 <= $self->{next_char} and
642            $self->{next_char} <= 0x005A) { # A..Z            $self->{next_char} <= 0x005A) { # A..Z
643          !!!cp (29);          !!!cp (29);
644          $self->{current_token} = {type => END_TAG_TOKEN,          $self->{current_token}
645                            tag_name => chr ($self->{next_char} + 0x0020)};              = {type => END_TAG_TOKEN,
646                   tag_name => chr ($self->{next_char} + 0x0020),
647                   line => $l, column => $c};
648          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
649          !!!next-input-character;          !!!next-input-character;
650          redo A;          redo A;
# Line 620  sub _get_next_token ($) { Line 652  sub _get_next_token ($) {
652                 $self->{next_char} <= 0x007A) { # a..z                 $self->{next_char} <= 0x007A) { # a..z
653          !!!cp (30);          !!!cp (30);
654          $self->{current_token} = {type => END_TAG_TOKEN,          $self->{current_token} = {type => END_TAG_TOKEN,
655                            tag_name => chr ($self->{next_char})};                                    tag_name => chr ($self->{next_char}),
656                                      line => $l, column => $c};
657          $self->{state} = TAG_NAME_STATE;          $self->{state} = TAG_NAME_STATE;
658          !!!next-input-character;          !!!next-input-character;
659          redo A;          redo A;
# Line 636  sub _get_next_token ($) { Line 669  sub _get_next_token ($) {
669          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
670          # reconsume          # reconsume
671    
672          !!!emit ({type => CHARACTER_TOKEN, data => '</'});          !!!emit ({type => CHARACTER_TOKEN, data => '</',
673                      line => $l, column => $c});
674    
675          redo A;          redo A;
676        } else {        } else {
677          !!!cp (33);          !!!cp (33);
678          !!!parse-error (type => 'bogus end tag');          !!!parse-error (type => 'bogus end tag');
679          $self->{state} = BOGUS_COMMENT_STATE;          $self->{state} = BOGUS_COMMENT_STATE;
680            $self->{current_token} = {type => COMMENT_TOKEN, data => '',
681                                      line => $self->{line_prev}, # "<" of "</"
682                                      column => $self->{column_prev} - 1};
683          ## $self->{next_char} is intentionally left as is          ## $self->{next_char} is intentionally left as is
684          redo A;          redo A;
685        }        }
# Line 1379  sub _get_next_token ($) { Line 1416  sub _get_next_token ($) {
1416      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1417        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
1418                
1419        my $token = {type => COMMENT_TOKEN, data => ''};        ## NOTE: Set by the previous state
1420          #my $token = {type => COMMENT_TOKEN, data => ''};
1421    
1422        BC: {        BC: {
1423          if ($self->{next_char} == 0x003E) { # >          if ($self->{next_char} == 0x003E) { # >
# Line 1387  sub _get_next_token ($) { Line 1425  sub _get_next_token ($) {
1425            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1426            !!!next-input-character;            !!!next-input-character;
1427    
1428            !!!emit ($token);            !!!emit ($self->{current_token}); # comment
1429    
1430            redo A;            redo A;
1431          } elsif ($self->{next_char} == -1) {          } elsif ($self->{next_char} == -1) {
# Line 1395  sub _get_next_token ($) { Line 1433  sub _get_next_token ($) {
1433            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
1434            ## reconsume            ## reconsume
1435    
1436            !!!emit ($token);            !!!emit ($self->{current_token}); # comment
1437    
1438            redo A;            redo A;
1439          } else {          } else {
1440            !!!cp (126);            !!!cp (126);
1441            $token->{data} .= chr ($self->{next_char});            $self->{current_token}->{data} .= chr ($self->{next_char}); # comment
1442            !!!next-input-character;            !!!next-input-character;
1443            redo BC;            redo BC;
1444          }          }
# Line 1410  sub _get_next_token ($) { Line 1448  sub _get_next_token ($) {
1448      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1449        ## (only happen if PCDATA state)        ## (only happen if PCDATA state)
1450    
1451          my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1);
1452    
1453        my @next_char;        my @next_char;
1454        push @next_char, $self->{next_char};        push @next_char, $self->{next_char};
1455                
# Line 1418  sub _get_next_token ($) { Line 1458  sub _get_next_token ($) {
1458          push @next_char, $self->{next_char};          push @next_char, $self->{next_char};
1459          if ($self->{next_char} == 0x002D) { # -          if ($self->{next_char} == 0x002D) { # -
1460            !!!cp (127);            !!!cp (127);
1461            $self->{current_token} = {type => COMMENT_TOKEN, data => ''};            $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1462                                        line => $l, column => $c};
1463            $self->{state} = COMMENT_START_STATE;            $self->{state} = COMMENT_START_STATE;
1464            !!!next-input-character;            !!!next-input-character;
1465            redo A;            redo A;
# Line 1454  sub _get_next_token ($) { Line 1495  sub _get_next_token ($) {
1495                      !!!cp (129);                      !!!cp (129);
1496                      ## TODO: What a stupid code this is!                      ## TODO: What a stupid code this is!
1497                      $self->{state} = DOCTYPE_STATE;                      $self->{state} = DOCTYPE_STATE;
1498                        $self->{current_token} = {type => DOCTYPE_TOKEN,
1499                                                  quirks => 1,
1500                                                  line => $l, column => $c};
1501                      !!!next-input-character;                      !!!next-input-character;
1502                      redo A;                      redo A;
1503                    } else {                    } else {
# Line 1482  sub _get_next_token ($) { Line 1526  sub _get_next_token ($) {
1526        $self->{next_char} = shift @next_char;        $self->{next_char} = shift @next_char;
1527        !!!back-next-input-character (@next_char);        !!!back-next-input-character (@next_char);
1528        $self->{state} = BOGUS_COMMENT_STATE;        $self->{state} = BOGUS_COMMENT_STATE;
1529          $self->{current_token} = {type => COMMENT_TOKEN, data => '',
1530                                    line => $l, column => $c};
1531        redo A;        redo A;
1532                
1533        ## ISSUE: typos in spec: chacacters, is is a parse error        ## ISSUE: typos in spec: chacacters, is is a parse error
# Line 1660  sub _get_next_token ($) { Line 1706  sub _get_next_token ($) {
1706          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1707          !!!next-input-character;          !!!next-input-character;
1708    
1709          !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});          !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1710    
1711          redo A;          redo A;
1712        } elsif ($self->{next_char} == -1) {        } elsif ($self->{next_char} == -1) {
# Line 1669  sub _get_next_token ($) { Line 1715  sub _get_next_token ($) {
1715          $self->{state} = DATA_STATE;          $self->{state} = DATA_STATE;
1716          ## reconsume          ## reconsume
1717    
1718          !!!emit ({type => DOCTYPE_TOKEN, quirks => 1});          !!!emit ($self->{current_token}); # DOCTYPE (quirks)
1719    
1720          redo A;          redo A;
1721        } else {        } else {
1722          !!!cp (160);          !!!cp (160);
1723          $self->{current_token}          $self->{current_token}->{name} = chr $self->{next_char};
1724              = {type => DOCTYPE_TOKEN,          delete $self->{current_token}->{quirks};
                name => chr ($self->{next_char}),  
                #quirks => 0,  
               };  
1725  ## ISSUE: "Set the token's name name to the" in the spec  ## ISSUE: "Set the token's name name to the" in the spec
1726          $self->{state} = DOCTYPE_NAME_STATE;          $self->{state} = DOCTYPE_NAME_STATE;
1727          !!!next-input-character;          !!!next-input-character;
# Line 2205  sub _get_next_token ($) { Line 2248  sub _get_next_token ($) {
2248  sub _tokenize_attempt_to_consume_an_entity ($$$) {  sub _tokenize_attempt_to_consume_an_entity ($$$) {
2249    my ($self, $in_attr, $additional) = @_;    my ($self, $in_attr, $additional) = @_;
2250    
2251      my ($l, $c) = ($self->{line_prev}, $self->{column_prev});
2252    
2253    if ({    if ({
2254         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,         0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, # HT, LF, VT, FF,
2255         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR         0x0020 => 1, 0x003C => 1, 0x0026 => 1, -1 => 1, # SP, <, & # 0x000D # CR
# Line 2245  sub _tokenize_attempt_to_consume_an_enti Line 2290  sub _tokenize_attempt_to_consume_an_enti
2290            redo X;            redo X;
2291          } elsif (not defined $code) { # no hexadecimal digit          } elsif (not defined $code) { # no hexadecimal digit
2292            !!!cp (1005);            !!!cp (1005);
2293            !!!parse-error (type => 'bare hcro');            !!!parse-error (type => 'bare hcro', line => $l, column => $c);
2294            !!!back-next-input-character ($x_char, $self->{next_char});            !!!back-next-input-character ($x_char, $self->{next_char});
2295            $self->{next_char} = 0x0023; # #            $self->{next_char} = 0x0023; # #
2296            return undef;            return undef;
# Line 2254  sub _tokenize_attempt_to_consume_an_enti Line 2299  sub _tokenize_attempt_to_consume_an_enti
2299            !!!next-input-character;            !!!next-input-character;
2300          } else {          } else {
2301            !!!cp (1007);            !!!cp (1007);
2302            !!!parse-error (type => 'no refc');            !!!parse-error (type => 'no refc', line => $l, column => $c);
2303          }          }
2304    
2305          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {          if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2306            !!!cp (1008);            !!!cp (1008);
2307            !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);            !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2308            $code = 0xFFFD;            $code = 0xFFFD;
2309          } elsif ($code > 0x10FFFF) {          } elsif ($code > 0x10FFFF) {
2310            !!!cp (1009);            !!!cp (1009);
2311            !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);            !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2312            $code = 0xFFFD;            $code = 0xFFFD;
2313          } elsif ($code == 0x000D) {          } elsif ($code == 0x000D) {
2314            !!!cp (1010);            !!!cp (1010);
2315            !!!parse-error (type => 'CR character reference');            !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2316            $code = 0x000A;            $code = 0x000A;
2317          } elsif (0x80 <= $code and $code <= 0x9F) {          } elsif (0x80 <= $code and $code <= 0x9F) {
2318            !!!cp (1011);            !!!cp (1011);
2319            !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);            !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2320            $code = $c1_entity_char->{$code};            $code = $c1_entity_char->{$code};
2321          }          }
2322    
2323          return {type => CHARACTER_TOKEN, data => chr $code,          return {type => CHARACTER_TOKEN, data => chr $code,
2324                  has_reference => 1};                  has_reference => 1, line => $l, column => $c};
2325        } # X        } # X
2326      } elsif (0x0030 <= $self->{next_char} and      } elsif (0x0030 <= $self->{next_char} and
2327               $self->{next_char} <= 0x0039) { # 0..9               $self->{next_char} <= 0x0039) { # 0..9
# Line 2297  sub _tokenize_attempt_to_consume_an_enti Line 2342  sub _tokenize_attempt_to_consume_an_enti
2342          !!!next-input-character;          !!!next-input-character;
2343        } else {        } else {
2344          !!!cp (1014);          !!!cp (1014);
2345          !!!parse-error (type => 'no refc');          !!!parse-error (type => 'no refc', line => $l, column => $c);
2346        }        }
2347    
2348        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {        if ($code == 0 or (0xD800 <= $code and $code <= 0xDFFF)) {
2349          !!!cp (1015);          !!!cp (1015);
2350          !!!parse-error (type => sprintf 'invalid character reference:U+%04X', $code);          !!!parse-error (type => (sprintf 'invalid character reference:U+%04X', $code), line => $l, column => $c);
2351          $code = 0xFFFD;          $code = 0xFFFD;
2352        } elsif ($code > 0x10FFFF) {        } elsif ($code > 0x10FFFF) {
2353          !!!cp (1016);          !!!cp (1016);
2354          !!!parse-error (type => sprintf 'invalid character reference:U-%08X', $code);          !!!parse-error (type => (sprintf 'invalid character reference:U-%08X', $code), line => $l, column => $c);
2355          $code = 0xFFFD;          $code = 0xFFFD;
2356        } elsif ($code == 0x000D) {        } elsif ($code == 0x000D) {
2357          !!!cp (1017);          !!!cp (1017);
2358          !!!parse-error (type => 'CR character reference');          !!!parse-error (type => 'CR character reference', line => $l, column => $c);
2359          $code = 0x000A;          $code = 0x000A;
2360        } elsif (0x80 <= $code and $code <= 0x9F) {        } elsif (0x80 <= $code and $code <= 0x9F) {
2361          !!!cp (1018);          !!!cp (1018);
2362          !!!parse-error (type => sprintf 'C1 character reference:U+%04X', $code);          !!!parse-error (type => (sprintf 'C1 character reference:U+%04X', $code), line => $l, column => $c);
2363          $code = $c1_entity_char->{$code};          $code = $c1_entity_char->{$code};
2364        }        }
2365                
2366        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1};        return {type => CHARACTER_TOKEN, data => chr $code, has_reference => 1,
2367                  line => $l, column => $c};
2368      } else {      } else {
2369        !!!cp (1019);        !!!cp (1019);
2370        !!!parse-error (type => 'bare nero');        !!!parse-error (type => 'bare nero', line => $l, column => $c);
2371        !!!back-next-input-character ($self->{next_char});        !!!back-next-input-character ($self->{next_char});
2372        $self->{next_char} = 0x0023; # #        $self->{next_char} = 0x0023; # #
2373        return undef;        return undef;
# Line 2371  sub _tokenize_attempt_to_consume_an_enti Line 2417  sub _tokenize_attempt_to_consume_an_enti
2417            
2418      if ($match > 0) {      if ($match > 0) {
2419        !!!cp (1023);        !!!cp (1023);
2420        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};        return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2421                  line => $l, column => $c};
2422      } elsif ($match < 0) {      } elsif ($match < 0) {
2423        !!!parse-error (type => 'no refc');        !!!parse-error (type => 'no refc', line => $l, column => $c);
2424        if ($in_attr and $match < -1) {        if ($in_attr and $match < -1) {
2425          !!!cp (1024);          !!!cp (1024);
2426          return {type => CHARACTER_TOKEN, data => '&'.$entity_name};          return {type => CHARACTER_TOKEN, data => '&'.$entity_name,
2427                    line => $l, column => $c};
2428        } else {        } else {
2429          !!!cp (1025);          !!!cp (1025);
2430          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1};          return {type => CHARACTER_TOKEN, data => $value, has_reference => 1,
2431                    line => $l, column => $c};
2432        }        }
2433      } else {      } else {
2434        !!!cp (1026);        !!!cp (1026);
2435        !!!parse-error (type => 'bare ero');        !!!parse-error (type => 'bare ero', line => $l, column => $c);
2436        ## NOTE: "No characters are consumed" in the spec.        ## NOTE: "No characters are consumed" in the spec.
2437        return {type => CHARACTER_TOKEN, data => '&'.$value};        return {type => CHARACTER_TOKEN, data => '&'.$value,
2438                  line => $l, column => $c};
2439      }      }
2440    } else {    } else {
2441      !!!cp (1027);      !!!cp (1027);
2442      ## no characters are consumed      ## no characters are consumed
2443      !!!parse-error (type => 'bare ero');      !!!parse-error (type => 'bare ero', line => $l, column => $c);
2444      return undef;      return undef;
2445    }    }
2446  } # _tokenize_attempt_to_consume_an_entity  } # _tokenize_attempt_to_consume_an_entity

Legend:
Removed from v.1.111  
changed lines
  Added in v.1.112

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24