/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | View Patch

revision 1.13 by wakaba, Thu Oct 16 03:39:57 2008 UTC revision 1.14 by wakaba, Fri Oct 17 07:14:29 2008 UTC
# Line 16  BEGIN { Line 16  BEGIN {
16      PI_TOKEN      PI_TOKEN
17      ABORT_TOKEN      ABORT_TOKEN
18      END_OF_DOCTYPE_TOKEN      END_OF_DOCTYPE_TOKEN
19        ATTLIST_TOKEN
20        ELEMENT_TOKEN
21        GENERAL_ENTITY_TOKEN
22        PARAMETER_ENTITY_TOKEN
23        NOTATION_TOKEN
24    );    );
25        
26    our %EXPORT_TAGS = (    our %EXPORT_TAGS = (
# Line 29  BEGIN { Line 34  BEGIN {
34        PI_TOKEN        PI_TOKEN
35        ABORT_TOKEN        ABORT_TOKEN
36        END_OF_DOCTYPE_TOKEN        END_OF_DOCTYPE_TOKEN
37          ATTLIST_TOKEN
38          ELEMENT_TOKEN
39          GENERAL_ENTITY_TOKEN
40          PARAMETER_ENTITY_TOKEN
41          NOTATION_TOKEN
42      )],      )],
43    );    );
44  }  }
# Line 45  sub END_OF_FILE_TOKEN () { 5 } Line 55  sub END_OF_FILE_TOKEN () { 5 }
55  sub CHARACTER_TOKEN () { 6 }  sub CHARACTER_TOKEN () { 6 }
56  sub PI_TOKEN () { 7 } ## NOTE: XML only.  sub PI_TOKEN () { 7 } ## NOTE: XML only.
57  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.  sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
58  sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only  sub END_OF_DOCTYPE_TOKEN () { 9 } ## NOTE: XML only.
59    sub ATTLIST_TOKEN () { 10 } ## NOTE: XML only.
60    sub ELEMENT_TOKEN () { 11 } ## NOTE: XML only.
61    sub GENERAL_ENTITY_TOKEN () { 12 } ## NOTE: XML only.
62    sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
63    sub NOTATION_TOKEN () { 14 } ## NOTE: XML only.
64    
65  ## XML5: XML5 has "empty tag token".  In this implementation, it is  ## XML5: XML5 has "empty tag token".  In this implementation, it is
66  ## represented as a start tag token with $self->{self_closing} flag  ## represented as a start tag token with $self->{self_closing} flag
# Line 136  sub PI_AFTER_STATE () { 55 } Line 151  sub PI_AFTER_STATE () { 55 }
151  sub PI_DATA_AFTER_STATE () { 56 }  sub PI_DATA_AFTER_STATE () { 56 }
152  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }  sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
153  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }  sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
154  sub DOCTYPE_TAG_STATE () { 59 }  sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
155  sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 60 }  sub DOCTYPE_TAG_STATE () { 60 }
156    sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
157    sub MD_ATTLIST_STATE () { 62 }
158    sub MD_E_STATE () { 63 }
159    sub MD_ELEMENT_STATE () { 64 }
160    sub MD_ENTITY_STATE () { 65 }
161    sub MD_NOTATION_STATE () { 66 }
162    sub DOCTYPE_MD_STATE () { 67 }
163    sub BEFORE_MD_NAME_STATE () { 68 }
164    sub MD_NAME_STATE () { 69 }
165    sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
166    sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
167    
168  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
169  ## list and descriptions)  ## list and descriptions)
# Line 2188  sub _get_next_token ($) { Line 2214  sub _get_next_token ($) {
2214          redo A;          redo A;
2215        }        }
2216      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {      } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2217          ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state".
2218    
2219        ## NOTE: Unlike spec's "bogus comment state", this implementation        ## NOTE: Unlike spec's "bogus comment state", this implementation
2220        ## consumes characters on a one-by-one basis.        ## consumes characters on a one-by-one basis.
2221                
# Line 2249  sub _get_next_token ($) { Line 2277  sub _get_next_token ($) {
2277          redo A;          redo A;
2278        }        }
2279      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {      } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2280        ## XML5: "Markup declaration state" and "DOCTYPE markup        ## XML5: "Markup declaration state".
       ## declaration state".  
2281                
2282        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2283                    
# Line 2648  sub _get_next_token ($) { Line 2675  sub _get_next_token ($) {
2675          redo A;          redo A;
2676        }        }
2677      } elsif ($self->{state} == COMMENT_STATE) {      } elsif ($self->{state} == COMMENT_STATE) {
2678          ## XML5: "Comment state" and "DOCTYPE comment state".
2679    
2680        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2681                    
2682          $self->{state} = COMMENT_END_DASH_STATE;          $self->{state} = COMMENT_END_DASH_STATE;
# Line 2700  sub _get_next_token ($) { Line 2729  sub _get_next_token ($) {
2729          redo A;          redo A;
2730        }        }
2731      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {      } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2732        ## XML5: "comment dash state".        ## XML5: "Comment dash state" and "DOCTYPE comment dash state".
2733    
2734        if ($self->{nc} == 0x002D) { # -        if ($self->{nc} == 0x002D) { # -
2735                    
# Line 2750  sub _get_next_token ($) { Line 2779  sub _get_next_token ($) {
2779          redo A;          redo A;
2780        }        }
2781      } elsif ($self->{state} == COMMENT_END_STATE) {      } elsif ($self->{state} == COMMENT_END_STATE) {
2782          ## XML5: "Comment end state" and "DOCTYPE comment end state".
2783    
2784        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
2785          if ($self->{in_subset}) {          if ($self->{in_subset}) {
2786                        
# Line 4720  sub _get_next_token ($) { Line 4751  sub _get_next_token ($) {
4751      ## XML-only states      ## XML-only states
4752    
4753      } elsif ($self->{state} == PI_STATE) {      } elsif ($self->{state} == PI_STATE) {
4754          ## XML5: "Pi state" and "DOCTYPE pi state".
4755    
4756        if ($is_space->{$self->{nc}} or        if ($is_space->{$self->{nc}} or
4757            $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"            $self->{nc} == 0x003F or # ?
4758            $self->{nc} == -1) {            $self->{nc} == -1) {
4759            ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE
4760            ## pi state": Switch to the "DOCTYPE pi after state".  EOF:
4761            ## "DOCTYPE pi state": Parse error, switch to the "data
4762            ## state".
4763          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4764                          line => $self->{line_prev},                          line => $self->{line_prev},
4765                          column => $self->{column_prev}                          column => $self->{column_prev}
# Line 4737  sub _get_next_token ($) { Line 4774  sub _get_next_token ($) {
4774                        };                        };
4775          redo A;          redo A;
4776        } else {        } else {
4777            ## XML5: "DOCTYPE pi state": Stay in the state.
4778          $self->{ct} = {type => PI_TOKEN,          $self->{ct} = {type => PI_TOKEN,
4779                         target => chr $self->{nc},                         target => chr $self->{nc},
4780                         data => '',                         data => '',
# Line 4851  sub _get_next_token ($) { Line 4889  sub _get_next_token ($) {
4889        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
4890          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4891          if ($self->{in_subset}) {          if ($self->{in_subset}) {
4892            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state"
4893          } else {          } else {
4894            $self->{state} = DATA_STATE;            $self->{state} = DATA_STATE;
4895            $self->{s_kwd} = '';            $self->{s_kwd} = '';
# Line 4879  sub _get_next_token ($) { Line 4917  sub _get_next_token ($) {
4917          redo A;          redo A;
4918        }        }
4919      } elsif ($self->{state} == PI_AFTER_STATE) {      } elsif ($self->{state} == PI_AFTER_STATE) {
4920          ## XML5: Part of "Pi after state".
4921    
4922        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4923          if ($self->{in_subset}) {          if ($self->{in_subset}) {
4924            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 4928  sub _get_next_token ($) { Line 4968  sub _get_next_token ($) {
4968          redo A;          redo A;
4969        }        }
4970      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {      } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4971        ## XML5: Same as "pi after state" in XML5        ## XML5: Same as "pi after state" and "DOCTYPE pi after state".
4972    
4973        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
4974          if ($self->{in_subset}) {          if ($self->{in_subset}) {
4975            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
# Line 5141  sub _get_next_token ($) { Line 5182  sub _get_next_token ($) {
5182        }        }
5183      } elsif ($self->{state} == DOCTYPE_TAG_STATE) {      } elsif ($self->{state} == DOCTYPE_TAG_STATE) {
5184        if ($self->{nc} == 0x0021) { # !        if ($self->{nc} == 0x0021) { # !
5185          $self->{state} = MARKUP_DECLARATION_OPEN_STATE;          $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE;
5186                    
5187      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5188        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 5195  sub _get_next_token ($) { Line 5236  sub _get_next_token ($) {
5236        
5237          redo A;          redo A;
5238        }        }
5239        } elsif ($self->{state} == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) {
5240          ## XML5: "DOCTYPE markup declaration state".
5241          
5242          if ($self->{nc} == 0x002D) { # -
5243            $self->{state} = MD_HYPHEN_STATE;
5244            
5245        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5246          $self->{line_prev} = $self->{line};
5247          $self->{column_prev} = $self->{column};
5248          $self->{column}++;
5249          $self->{nc}
5250              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5251        } else {
5252          $self->{set_nc}->($self);
5253        }
5254      
5255            redo A;
5256          } elsif ($self->{nc} == 0x0045) { # E
5257            $self->{state} = MD_E_STATE;
5258            $self->{kwd} = chr $self->{nc};
5259            
5260        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5261          $self->{line_prev} = $self->{line};
5262          $self->{column_prev} = $self->{column};
5263          $self->{column}++;
5264          $self->{nc}
5265              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5266        } else {
5267          $self->{set_nc}->($self);
5268        }
5269      
5270            redo A;
5271          } elsif ($self->{nc} == 0x0041) { # A
5272            $self->{state} = MD_ATTLIST_STATE;
5273            $self->{kwd} = chr $self->{nc};
5274            
5275        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5276          $self->{line_prev} = $self->{line};
5277          $self->{column_prev} = $self->{column};
5278          $self->{column}++;
5279          $self->{nc}
5280              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5281        } else {
5282          $self->{set_nc}->($self);
5283        }
5284      
5285            redo A;
5286          } elsif ($self->{nc} == 0x004E) { # N
5287            $self->{state} = MD_NOTATION_STATE;
5288            $self->{kwd} = chr $self->{nc};
5289            
5290        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5291          $self->{line_prev} = $self->{line};
5292          $self->{column_prev} = $self->{column};
5293          $self->{column}++;
5294          $self->{nc}
5295              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5296        } else {
5297          $self->{set_nc}->($self);
5298        }
5299      
5300            redo A;
5301          } else {
5302            #
5303          }
5304          
5305          ## XML5: No parse error.
5306          $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5307                          line => $self->{line_prev},
5308                          column => $self->{column_prev} - 1);
5309          ## Reconsume.
5310          $self->{state} = BOGUS_COMMENT_STATE;
5311          $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded.
5312          redo A;
5313        } elsif ($self->{state} == MD_E_STATE) {
5314          if ($self->{nc} == 0x004E) { # N
5315            $self->{state} = MD_ENTITY_STATE;
5316            $self->{kwd} .= chr $self->{nc};
5317            
5318        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5319          $self->{line_prev} = $self->{line};
5320          $self->{column_prev} = $self->{column};
5321          $self->{column}++;
5322          $self->{nc}
5323              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5324        } else {
5325          $self->{set_nc}->($self);
5326        }
5327      
5328            redo A;
5329          } elsif ($self->{nc} == 0x004C) { # L
5330            ## XML5: <!ELEMENT> not supported.
5331            $self->{state} = MD_ELEMENT_STATE;
5332            $self->{kwd} .= chr $self->{nc};
5333            
5334        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5335          $self->{line_prev} = $self->{line};
5336          $self->{column_prev} = $self->{column};
5337          $self->{column}++;
5338          $self->{nc}
5339              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5340        } else {
5341          $self->{set_nc}->($self);
5342        }
5343      
5344            redo A;
5345          } else {
5346            ## XML5: No parse error.
5347            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5348                            line => $self->{line_prev},
5349                            column => $self->{column_prev} - 2
5350                                + 1 * ($self->{nc} == -1));
5351            ## Reconsume.
5352            $self->{state} = BOGUS_COMMENT_STATE;
5353            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5354            redo A;
5355          }
5356        } elsif ($self->{state} == MD_ENTITY_STATE) {
5357          if ($self->{nc} == {
5358                'EN' => 0x0054, # T
5359                'ENT' => 0x0049, # I
5360                'ENTI' => 0x0054, # T
5361              }->{$self->{kwd}}) {
5362            ## Stay in the state.
5363            $self->{kwd} .= chr $self->{nc};
5364            
5365        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5366          $self->{line_prev} = $self->{line};
5367          $self->{column_prev} = $self->{column};
5368          $self->{column}++;
5369          $self->{nc}
5370              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5371        } else {
5372          $self->{set_nc}->($self);
5373        }
5374      
5375            redo A;
5376          } elsif ($self->{kwd} eq 'ENTIT' and
5377                   $self->{nc} == 0x0059) { # Y
5378            $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', text => '',
5379                           line => $self->{line_prev},
5380                           column => $self->{column_prev} - 6};
5381            $self->{state} = DOCTYPE_MD_STATE;
5382            
5383        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5384          $self->{line_prev} = $self->{line};
5385          $self->{column_prev} = $self->{column};
5386          $self->{column}++;
5387          $self->{nc}
5388              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5389        } else {
5390          $self->{set_nc}->($self);
5391        }
5392      
5393            redo A;
5394          } else {
5395            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5396                            line => $self->{line_prev},
5397                            column => $self->{column_prev} - 1
5398                                - (length $self->{kwd})
5399                                + 1 * ($self->{nc} == -1));
5400            $self->{state} = BOGUS_COMMENT_STATE;
5401            ## Reconsume.
5402            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5403            redo A;
5404          }
5405        } elsif ($self->{state} == MD_ELEMENT_STATE) {
5406          if ($self->{nc} == {
5407                'EL' => 0x0045, # E
5408                'ELE' => 0x004D, # M
5409                'ELEM' => 0x0045, # E
5410                'ELEME' => 0x004E, # N
5411              }->{$self->{kwd}}) {
5412            ## Stay in the state.
5413            $self->{kwd} .= chr $self->{nc};
5414            
5415        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5416          $self->{line_prev} = $self->{line};
5417          $self->{column_prev} = $self->{column};
5418          $self->{column}++;
5419          $self->{nc}
5420              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5421        } else {
5422          $self->{set_nc}->($self);
5423        }
5424      
5425            redo A;
5426          } elsif ($self->{kwd} eq 'ELEMEN' and
5427                   $self->{nc} == 0x0054) { # T
5428            $self->{ct} = {type => ELEMENT_TOKEN, name => '',
5429                           line => $self->{line_prev},
5430                           column => $self->{column_prev} - 6};
5431            $self->{state} = DOCTYPE_MD_STATE;
5432            
5433        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5434          $self->{line_prev} = $self->{line};
5435          $self->{column_prev} = $self->{column};
5436          $self->{column}++;
5437          $self->{nc}
5438              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5439        } else {
5440          $self->{set_nc}->($self);
5441        }
5442      
5443            redo A;
5444          } else {
5445            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5446                            line => $self->{line_prev},
5447                            column => $self->{column_prev} - 1
5448                                - (length $self->{kwd})
5449                                + 1 * ($self->{nc} == -1));
5450            $self->{state} = BOGUS_COMMENT_STATE;
5451            ## Reconsume.
5452            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5453            redo A;
5454          }
5455        } elsif ($self->{state} == MD_ATTLIST_STATE) {
5456          if ($self->{nc} == {
5457                'A' => 0x0054, # T
5458                'AT' => 0x0054, # T
5459                'ATT' => 0x004C, # L
5460                'ATTL' => 0x0049, # I
5461                'ATTLI' => 0x0053, # S
5462              }->{$self->{kwd}}) {
5463            ## Stay in the state.
5464            $self->{kwd} .= chr $self->{nc};
5465            
5466        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5467          $self->{line_prev} = $self->{line};
5468          $self->{column_prev} = $self->{column};
5469          $self->{column}++;
5470          $self->{nc}
5471              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5472        } else {
5473          $self->{set_nc}->($self);
5474        }
5475      
5476            redo A;
5477          } elsif ($self->{kwd} eq 'ATTLIS' and
5478                   $self->{nc} == 0x0054) { # T
5479            $self->{ct} = {type => ATTLIST_TOKEN, name => '',
5480                           line => $self->{line_prev},
5481                           column => $self->{column_prev} - 6};
5482            $self->{state} = DOCTYPE_MD_STATE;
5483            
5484        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5485          $self->{line_prev} = $self->{line};
5486          $self->{column_prev} = $self->{column};
5487          $self->{column}++;
5488          $self->{nc}
5489              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5490        } else {
5491          $self->{set_nc}->($self);
5492        }
5493      
5494            redo A;
5495          } else {
5496            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5497                            line => $self->{line_prev},
5498                            column => $self->{column_prev} - 1
5499                                 - (length $self->{kwd})
5500                                 + 1 * ($self->{nc} == -1));
5501            $self->{state} = BOGUS_COMMENT_STATE;
5502            ## Reconsume.
5503            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5504            redo A;
5505          }
5506        } elsif ($self->{state} == MD_NOTATION_STATE) {
5507          if ($self->{nc} == {
5508                'N' => 0x004F, # O
5509                'NO' => 0x0054, # T
5510                'NOT' => 0x0041, # A
5511                'NOTA' => 0x0054, # T
5512                'NOTAT' => 0x0049, # I
5513                'NOTATI' => 0x004F, # O
5514              }->{$self->{kwd}}) {
5515            ## Stay in the state.
5516            $self->{kwd} .= chr $self->{nc};
5517            
5518        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5519          $self->{line_prev} = $self->{line};
5520          $self->{column_prev} = $self->{column};
5521          $self->{column}++;
5522          $self->{nc}
5523              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5524        } else {
5525          $self->{set_nc}->($self);
5526        }
5527      
5528            redo A;
5529          } elsif ($self->{kwd} eq 'NOTATIO' and
5530                   $self->{nc} == 0x004E) { # N
5531            $self->{ct} = {type => NOTATION_TOKEN, name => '',
5532                           line => $self->{line_prev},
5533                           column => $self->{column_prev} - 6};
5534            $self->{state} = DOCTYPE_MD_STATE;
5535            
5536        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5537          $self->{line_prev} = $self->{line};
5538          $self->{column_prev} = $self->{column};
5539          $self->{column}++;
5540          $self->{nc}
5541              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5542        } else {
5543          $self->{set_nc}->($self);
5544        }
5545      
5546            redo A;
5547          } else {
5548            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
5549                            line => $self->{line_prev},
5550                            column => $self->{column_prev} - 1
5551                                - (length $self->{kwd})
5552                                + 1 * ($self->{nc} == -1));
5553            $self->{state} = BOGUS_COMMENT_STATE;
5554            ## Reconsume.
5555            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5556            redo A;
5557          }
5558        } elsif ($self->{state} == DOCTYPE_MD_STATE) {
5559          ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and
5560          ## "DOCTYPE NOTATION state".
5561    
5562          if ($is_space->{$self->{nc}}) {
5563            ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state".
5564            $self->{state} = BEFORE_MD_NAME_STATE;
5565            
5566        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5567          $self->{line_prev} = $self->{line};
5568          $self->{column_prev} = $self->{column};
5569          $self->{column}++;
5570          $self->{nc}
5571              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5572        } else {
5573          $self->{set_nc}->($self);
5574        }
5575      
5576            redo A;
5577          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5578                   $self->{nc} == 0x0025) { # %
5579            ## XML5: Switch to the "DOCTYPE bogus comment state".
5580            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5581            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5582            
5583        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5584          $self->{line_prev} = $self->{line};
5585          $self->{column_prev} = $self->{column};
5586          $self->{column}++;
5587          $self->{nc}
5588              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5589        } else {
5590          $self->{set_nc}->($self);
5591        }
5592      
5593            redo A;
5594          } elsif ($self->{nc} == -1) {
5595            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5596            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5597            ## Reconsume.
5598            redo A;
5599          } elsif ($self->{nc} == 0x003E) { # >
5600            ## XML5: Switch to the "DOCTYPE bogus comment state".
5601            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5602            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5603            
5604        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5605          $self->{line_prev} = $self->{line};
5606          $self->{column_prev} = $self->{column};
5607          $self->{column}++;
5608          $self->{nc}
5609              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5610        } else {
5611          $self->{set_nc}->($self);
5612        }
5613      
5614            redo A;
5615          } else {
5616            ## XML5: Switch to the "DOCTYPE bogus comment state".
5617            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type
5618            $self->{state} = BEFORE_MD_NAME_STATE;
5619            redo A;
5620          }
5621        } elsif ($self->{state} == BEFORE_MD_NAME_STATE) {
5622          ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type
5623          ## before state", "DOCTYPE ATTLIST name before state".
5624    
5625          if ($is_space->{$self->{nc}}) {
5626            ## Stay in the state.
5627            
5628        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5629          $self->{line_prev} = $self->{line};
5630          $self->{column_prev} = $self->{column};
5631          $self->{column}++;
5632          $self->{nc}
5633              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5634        } else {
5635          $self->{set_nc}->($self);
5636        }
5637      
5638            redo A;
5639          } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and
5640                   $self->{nc} == 0x0025) { # %
5641            $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE;
5642            
5643        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5644          $self->{line_prev} = $self->{line};
5645          $self->{column_prev} = $self->{column};
5646          $self->{column}++;
5647          $self->{nc}
5648              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5649        } else {
5650          $self->{set_nc}->($self);
5651        }
5652      
5653            redo A;
5654          } elsif ($self->{nc} == 0x003E) { # >
5655            ## XML5: Same as "Anything else".
5656            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5657            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5658            
5659        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5660          $self->{line_prev} = $self->{line};
5661          $self->{column_prev} = $self->{column};
5662          $self->{column}++;
5663          $self->{nc}
5664              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5665        } else {
5666          $self->{set_nc}->($self);
5667        }
5668      
5669            redo A;
5670          } elsif ($self->{nc} == -1) {
5671            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5672            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5673            ## Reconsume.
5674            redo A;
5675          } else {
5676            ## XML5: [ATTLIST] Not defined yet.
5677            $self->{ct}->{name} .= chr $self->{nc};
5678            $self->{state} = MD_NAME_STATE;
5679            
5680        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5681          $self->{line_prev} = $self->{line};
5682          $self->{column_prev} = $self->{column};
5683          $self->{column}++;
5684          $self->{nc}
5685              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5686        } else {
5687          $self->{set_nc}->($self);
5688        }
5689      
5690            redo A;
5691          }
5692        } elsif ($self->{state} == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) {
5693          if ($is_space->{$self->{nc}}) {
5694            ## XML5: Switch to the "DOCTYPE ENTITY parameter state".
5695            $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN;
5696            $self->{state} = BEFORE_MD_NAME_STATE;
5697                    
5698        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5699          $self->{line_prev} = $self->{line};
5700          $self->{column_prev} = $self->{column};
5701          $self->{column}++;
5702          $self->{nc}
5703              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5704        } else {
5705          $self->{set_nc}->($self);
5706        }
5707      
5708            redo A;
5709          } elsif ($self->{nc} == 0x003E) { # >
5710            ## XML5: Same as "Anything else".
5711            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type
5712            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5713            
5714        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5715          $self->{line_prev} = $self->{line};
5716          $self->{column_prev} = $self->{column};
5717          $self->{column}++;
5718          $self->{nc}
5719              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5720        } else {
5721          $self->{set_nc}->($self);
5722        }
5723      
5724            redo A;
5725          } elsif ($self->{nc} == -1) {
5726            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
5727            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5728            ## Reconsume.
5729            redo A;
5730          } else {
5731            ## XML5: No parse error.
5732            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type
5733            $self->{state} = BOGUS_COMMENT_STATE;
5734            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5735            ## Reconsume.
5736            redo A;
5737          }
5738        } elsif ($self->{state} == MD_NAME_STATE) {
5739          ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state".
5740          
5741          if ($is_space->{$self->{nc}}) {
5742            ## TODO:
5743            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
5744            
5745        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5746          $self->{line_prev} = $self->{line};
5747          $self->{column_prev} = $self->{column};
5748          $self->{column}++;
5749          $self->{nc}
5750              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5751        } else {
5752          $self->{set_nc}->($self);
5753        }
5754      
5755            redo A;
5756          } elsif ($self->{nc} == 0x003E) { # >
5757            if ($self->{ct}->{type} == ATTLIST_TOKEN) {
5758              #
5759            } else {
5760              $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md body'); ## TODO: type
5761            }
5762            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5763            
5764        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5765          $self->{line_prev} = $self->{line};
5766          $self->{column_prev} = $self->{column};
5767          $self->{column}++;
5768          $self->{nc}
5769              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5770        } else {
5771          $self->{set_nc}->($self);
5772        }
5773      
5774            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
5775            redo A;
5776          } elsif ($self->{nc} == -1) {
5777            ## XML5: [ATTLIST] No parse error.
5778            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md');
5779            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5780            ## Reconsume.
5781            return  ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION
5782            redo A;
5783          } else {
5784            ## XML5: [ATTLIST] Not defined yet.
5785            $self->{ct}->{name} .= chr $self->{nc};
5786            ## Stay in the state.
5787            
5788        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5789          $self->{line_prev} = $self->{line};
5790          $self->{column_prev} = $self->{column};
5791          $self->{column}++;
5792          $self->{nc}
5793              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5794        } else {
5795          $self->{set_nc}->($self);
5796        }
5797      
5798            redo A;
5799          }
5800        } elsif ($self->{state} == DOCTYPE_ATTLIST_NAME_AFTER_STATE) {
5801          if ($is_space->{$self->{nc}}) {
5802            ## Stay in the state.
5803            
5804        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5805          $self->{line_prev} = $self->{line};
5806          $self->{column_prev} = $self->{column};
5807          $self->{column}++;
5808          $self->{nc}
5809              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5810        } else {
5811          $self->{set_nc}->($self);
5812        }
5813      
5814            redo A;
5815          } elsif ($self->{nc} == 0x003E) { # >
5816            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
5817            
5818        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
5819          $self->{line_prev} = $self->{line};
5820          $self->{column_prev} = $self->{column};
5821          $self->{column}++;
5822          $self->{nc}
5823              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
5824        } else {
5825          $self->{set_nc}->($self);
5826        }
5827      
5828            return  ($self->{ct}); # ATTLIST
5829            redo A;
5830          } elsif ($self->{nc} == -1) {
5831            ## XML5: No parse error.
5832            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
5833            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state".
5834            redo A;
5835          } else {
5836            ## XML5: Not defined yet.
5837    
5838            ## TODO: ...
5839    
5840            $self->{state} = BOGUS_COMMENT_STATE;
5841            $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded
5842            ## Reconsume.
5843            redo A;
5844          }
5845    
5846      } else {      } else {
5847        die "$0: $self->{state}: Unknown state";        die "$0: $self->{state}: Unknown state";
5848      }      }

Legend:
Removed from v.1.13  
changed lines
  Added in v.1.14

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24