/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Diff of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory | Revision Log | Patch

revision 1.18 by wakaba, Sun Oct 19 06:14:57 2008 UTC | revision 1.21 by wakaba, Sun Oct 19 09:25:21 2008 UTC
# Line 182  sub NDATA_STATE () { 86 } Line 182  sub NDATA_STATE () { 86 }
182  sub AFTER_NDATA_STATE () { 87 }  sub AFTER_NDATA_STATE () { 87 }
183  sub BEFORE_NOTATION_NAME_STATE () { 88 }  sub BEFORE_NOTATION_NAME_STATE () { 88 }
184  sub NOTATION_NAME_STATE () { 89 }  sub NOTATION_NAME_STATE () { 89 }
185  sub AFTER_NOTATION_NAME_STATE () { 90 }  sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
186  sub BOGUS_MD_STATE () { 91 }  sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
187    sub ENTITY_VALUE_ENTITY_STATE () { 92 }
188    sub AFTER_ELEMENT_NAME_STATE () { 93 }
189    sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
190    sub CONTENT_KEYWORD_STATE () { 95 }
191    sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
192    sub CM_ELEMENT_NAME_STATE () { 97 }
193    sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
194    sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
195    sub AFTER_MD_DEF_STATE () { 100 }
196    sub BOGUS_MD_STATE () { 101 }
197    
198  ## Tree constructor state constants (see Whatpm::HTML for the full  ## Tree constructor state constants (see Whatpm::HTML for the full
199  ## list and descriptions)  ## list and descriptions)
# Line 3262  sub _get_next_token ($) { Line 3272  sub _get_next_token ($) {
3272      }      }
3273        
3274          redo A;          redo A;
3275  ## TODO: " and ' for ENTITY        } elsif ($self->{nc} == 0x0022 and # "
3276                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3277                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3278            
3279            $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE;
3280            $self->{ct}->{value} = ''; # ENTITY
3281            
3282        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3283          $self->{line_prev} = $self->{line};
3284          $self->{column_prev} = $self->{column};
3285          $self->{column}++;
3286          $self->{nc}
3287              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3288        } else {
3289          $self->{set_nc}->($self);
3290        }
3291      
3292            redo A;
3293          } elsif ($self->{nc} == 0x0027 and # '
3294                   ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or
3295                    $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) {
3296            
3297            $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE;
3298            $self->{ct}->{value} = ''; # ENTITY
3299            
3300        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301          $self->{line_prev} = $self->{line};
3302          $self->{column_prev} = $self->{column};
3303          $self->{column}++;
3304          $self->{nc}
3305              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306        } else {
3307          $self->{set_nc}->($self);
3308        }
3309      
3310            redo A;
3311        } elsif ($self->{is_xml} and        } elsif ($self->{is_xml} and
3312                 $self->{ct}->{type} == DOCTYPE_TOKEN and                 $self->{ct}->{type} == DOCTYPE_TOKEN and
3313                 $self->{nc} == 0x005B) { # [                 $self->{nc} == 0x005B) { # [
# Line 4658  sub _get_next_token ($) { Line 4703  sub _get_next_token ($) {
4703          redo A;          redo A;
4704        }        }
4705      } elsif ($self->{state} == ENTITY_HASH_STATE) {      } elsif ($self->{state} == ENTITY_HASH_STATE) {
4706        if ($self->{nc} == 0x0078 or # x        if ($self->{nc} == 0x0078) { # x
           $self->{nc} == 0x0058) { # X  
4707                    
4708          $self->{state} = HEXREF_X_STATE;          $self->{state} = HEXREF_X_STATE;
4709          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
# Line 4675  sub _get_next_token ($) { Line 4719  sub _get_next_token ($) {
4719      }      }
4720        
4721          redo A;          redo A;
4722          } elsif ($self->{nc} == 0x0058) { # X
4723            
4724            if ($self->{is_xml}) {
4725              $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type
4726            }
4727            $self->{state} = HEXREF_X_STATE;
4728            $self->{kwd} .= chr $self->{nc};
4729            
4730        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4731          $self->{line_prev} = $self->{line};
4732          $self->{column_prev} = $self->{column};
4733          $self->{column}++;
4734          $self->{nc}
4735              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4736        } else {
4737          $self->{set_nc}->($self);
4738        }
4739      
4740            redo A;
4741        } elsif (0x0030 <= $self->{nc} and        } elsif (0x0030 <= $self->{nc} and
4742                 $self->{nc} <= 0x0039) { # 0..9                 $self->{nc} <= 0x0039) { # 0..9
4743                    
# Line 4952  sub _get_next_token ($) { Line 5015  sub _get_next_token ($) {
5015          redo A;          redo A;
5016        }        }
5017      } elsif ($self->{state} == ENTITY_NAME_STATE) {      } elsif ($self->{state} == ENTITY_NAME_STATE) {
5018        if (length $self->{kwd} < 30 and        if ((0x0041 <= $self->{nc} and # a
5019            ## NOTE: Some number greater than the maximum length of entity name             $self->{nc} <= 0x005A) or # x
5020            ((0x0041 <= $self->{nc} and # a            (0x0061 <= $self->{nc} and # a
5021              $self->{nc} <= 0x005A) or # x             $self->{nc} <= 0x007A) or # z
5022             (0x0061 <= $self->{nc} and # a            (0x0030 <= $self->{nc} and # 0
5023              $self->{nc} <= 0x007A) or # z             $self->{nc} <= 0x0039) or # 9
5024             (0x0030 <= $self->{nc} and # 0            $self->{nc} == 0x003B) { # ;
             $self->{nc} <= 0x0039) or # 9  
            $self->{nc} == 0x003B)) { # ;  
5025          our $EntityChar;          our $EntityChar;
5026          $self->{kwd} .= chr $self->{nc};          $self->{kwd} .= chr $self->{nc};
5027          if (defined $EntityChar->{$self->{kwd}}) {          if (defined $EntityChar->{$self->{kwd}} or
5028                $self->{ge}->{$self->{kwd}}) {
5029            if ($self->{nc} == 0x003B) { # ;            if ($self->{nc} == 0x003B) { # ;
5030                            if (defined $self->{ge}->{$self->{kwd}}) {
5031              $self->{entity__value} = $EntityChar->{$self->{kwd}};                if ($self->{ge}->{$self->{kwd}}->{only_text}) {
5032                    
5033                    $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value};
5034                  } else {
5035                    if (defined $self->{ge}->{$self->{kwd}}->{notation}) {
5036                      
5037                      $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type
5038                                      value => $self->{kwd});
5039                    } else {
5040                      
5041                    }
5042                    $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand
5043                  }
5044                } else {
5045                  if ($self->{is_xml}) {
5046                    
5047                    $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type
5048                                    value => $self->{kwd},
5049                                    level => {
5050                                              'amp;' => $self->{level}->{warn},
5051                                              'quot;' => $self->{level}->{warn},
5052                                              'lt;' => $self->{level}->{warn},
5053                                              'gt;' => $self->{level}->{warn},
5054                                              'apos;' => $self->{level}->{warn},
5055                                             }->{$self->{kwd}} ||
5056                                             $self->{level}->{must});
5057                  } else {
5058                    
5059                  }
5060                  $self->{entity__value} = $EntityChar->{$self->{kwd}};
5061                }
5062              $self->{entity__match} = 1;              $self->{entity__match} = 1;
5063                            
5064      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 6145  sub _get_next_token ($) { Line 6237  sub _get_next_token ($) {
6237          if ($self->{ct}->{type} == ATTLIST_TOKEN) {          if ($self->{ct}->{type} == ATTLIST_TOKEN) {
6238            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;            $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;
6239          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {          } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) {
6240            ## TODO: ...            $self->{state} = AFTER_ELEMENT_NAME_STATE;
           $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE;  
6241          } else { # ENTITY/NOTATION          } else { # ENTITY/NOTATION
6242            $self->{state} = AFTER_DOCTYPE_NAME_STATE;            $self->{state} = AFTER_DOCTYPE_NAME_STATE;
6243          }          }
# Line 7629  sub _get_next_token ($) { Line 7720  sub _get_next_token ($) {
7720        }        }
7721      } elsif ($self->{state} == NOTATION_NAME_STATE) {      } elsif ($self->{state} == NOTATION_NAME_STATE) {
7722        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
7723          $self->{state} = AFTER_NOTATION_NAME_STATE;          $self->{state} = AFTER_MD_DEF_STATE;
7724                    
7725      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7726        $self->{line_prev} = $self->{line};        $self->{line_prev} = $self->{line};
# Line 7689  sub _get_next_token ($) { Line 7780  sub _get_next_token ($) {
7780        
7781          redo A;          redo A;
7782        }        }
7783      } elsif ($self->{state} == AFTER_NOTATION_NAME_STATE) {      } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) {
7784          if ($self->{nc} == 0x0022) { # "
7785            $self->{state} = AFTER_MD_DEF_STATE;
7786            
7787        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7788          $self->{line_prev} = $self->{line};
7789          $self->{column_prev} = $self->{column};
7790          $self->{column}++;
7791          $self->{nc}
7792              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7793        } else {
7794          $self->{set_nc}->($self);
7795        }
7796      
7797            redo A;
7798          } elsif ($self->{nc} == 0x0026) { # &
7799            $self->{prev_state} = $self->{state};
7800            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7801            $self->{entity_add} = 0x0022; # "
7802            
7803        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7804          $self->{line_prev} = $self->{line};
7805          $self->{column_prev} = $self->{column};
7806          $self->{column}++;
7807          $self->{nc}
7808              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7809        } else {
7810          $self->{set_nc}->($self);
7811        }
7812      
7813            redo A;
7814    ## TODO: %
7815          } elsif ($self->{nc} == -1) {
7816            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7817            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7818            ## Reconsume.
7819            return  ($self->{ct}); # ENTITY
7820            redo A;
7821          } else {
7822            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7823            
7824        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7825          $self->{line_prev} = $self->{line};
7826          $self->{column_prev} = $self->{column};
7827          $self->{column}++;
7828          $self->{nc}
7829              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7830        } else {
7831          $self->{set_nc}->($self);
7832        }
7833      
7834            redo A;
7835          }
7836        } elsif ($self->{state} == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) {
7837          if ($self->{nc} == 0x0027) { # '
7838            $self->{state} = AFTER_MD_DEF_STATE;
7839            
7840        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7841          $self->{line_prev} = $self->{line};
7842          $self->{column_prev} = $self->{column};
7843          $self->{column}++;
7844          $self->{nc}
7845              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7846        } else {
7847          $self->{set_nc}->($self);
7848        }
7849      
7850            redo A;
7851          } elsif ($self->{nc} == 0x0026) { # &
7852            $self->{prev_state} = $self->{state};
7853            $self->{state} = ENTITY_VALUE_ENTITY_STATE;
7854            $self->{entity_add} = 0x0027; # '
7855            
7856        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7857          $self->{line_prev} = $self->{line};
7858          $self->{column_prev} = $self->{column};
7859          $self->{column}++;
7860          $self->{nc}
7861              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7862        } else {
7863          $self->{set_nc}->($self);
7864        }
7865      
7866            redo A;
7867    ## TODO: %
7868          } elsif ($self->{nc} == -1) {
7869            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type
7870            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7871            ## Reconsume.
7872            return  ($self->{ct}); # ENTITY
7873            redo A;
7874          } else {
7875            $self->{ct}->{value} .= chr $self->{nc}; # ENTITY
7876            
7877        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7878          $self->{line_prev} = $self->{line};
7879          $self->{column_prev} = $self->{column};
7880          $self->{column}++;
7881          $self->{nc}
7882              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7883        } else {
7884          $self->{set_nc}->($self);
7885        }
7886      
7887            redo A;
7888          }
7889        } elsif ($self->{state} == ENTITY_VALUE_ENTITY_STATE) {
7890          ## TODO: XMLize
7891    
7892          if ($is_space->{$self->{nc}} or
7893              {
7894                0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
7895                $self->{entity_add} => 1,
7896              }->{$self->{nc}}) {
7897            ## Don't consume
7898            ## No error
7899            ## Return nothing.
7900            #
7901          } elsif ($self->{nc} == 0x0023) { # #
7902            $self->{ca} = $self->{ct};
7903            $self->{state} = ENTITY_HASH_STATE;
7904            $self->{kwd} = '#';
7905            
7906        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7907          $self->{line_prev} = $self->{line};
7908          $self->{column_prev} = $self->{column};
7909          $self->{column}++;
7910          $self->{nc}
7911              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7912        } else {
7913          $self->{set_nc}->($self);
7914        }
7915      
7916            redo A;
7917          } elsif ((0x0041 <= $self->{nc} and
7918                    $self->{nc} <= 0x005A) or # A..Z
7919                   (0x0061 <= $self->{nc} and
7920                    $self->{nc} <= 0x007A)) { # a..z
7921            #
7922          } else {
7923            $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
7924            ## Return nothing.
7925            #
7926          }
7927    
7928          $self->{ct}->{value} .= '&';
7929          $self->{state} = $self->{prev_state};
7930          ## Reconsume.
7931          redo A;
7932        } elsif ($self->{state} == AFTER_ELEMENT_NAME_STATE) {
7933          if ($is_space->{$self->{nc}}) {
7934            $self->{state} = BEFORE_ELEMENT_CONTENT_STATE;
7935            
7936        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7937          $self->{line_prev} = $self->{line};
7938          $self->{column_prev} = $self->{column};
7939          $self->{column}++;
7940          $self->{nc}
7941              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7942        } else {
7943          $self->{set_nc}->($self);
7944        }
7945      
7946            redo A;
7947          } elsif ($self->{nc} == 0x0028) { # (
7948            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
7949            $self->{ct}->{content} = ['('];
7950            $self->{group_depth} = 1;
7951            
7952        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7953          $self->{line_prev} = $self->{line};
7954          $self->{column_prev} = $self->{column};
7955          $self->{column}++;
7956          $self->{nc}
7957              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7958        } else {
7959          $self->{set_nc}->($self);
7960        }
7961      
7962            redo A;
7963          } elsif ($self->{nc} == 0x003E) { # >
7964            $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type
7965            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7966            
7967        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7968          $self->{line_prev} = $self->{line};
7969          $self->{column_prev} = $self->{column};
7970          $self->{column}++;
7971          $self->{nc}
7972              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7973        } else {
7974          $self->{set_nc}->($self);
7975        }
7976      
7977            return  ($self->{ct}); # ELEMENT
7978            redo A;
7979          } elsif ($self->{nc} == -1) {
7980            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
7981            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
7982            
7983        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
7984          $self->{line_prev} = $self->{line};
7985          $self->{column_prev} = $self->{column};
7986          $self->{column}++;
7987          $self->{nc}
7988              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
7989        } else {
7990          $self->{set_nc}->($self);
7991        }
7992      
7993            return  ($self->{ct}); # ELEMENT
7994            redo A;
7995          } else {
7996            $self->{ct}->{content} = [chr $self->{nc}];
7997            $self->{state} = CONTENT_KEYWORD_STATE;
7998            
7999        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8000          $self->{line_prev} = $self->{line};
8001          $self->{column_prev} = $self->{column};
8002          $self->{column}++;
8003          $self->{nc}
8004              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8005        } else {
8006          $self->{set_nc}->($self);
8007        }
8008      
8009            redo A;
8010          }
8011        } elsif ($self->{state} == CONTENT_KEYWORD_STATE) {
8012          if ($is_space->{$self->{nc}}) {
8013            $self->{state} = AFTER_MD_DEF_STATE;
8014            
8015        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8016          $self->{line_prev} = $self->{line};
8017          $self->{column_prev} = $self->{column};
8018          $self->{column}++;
8019          $self->{nc}
8020              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8021        } else {
8022          $self->{set_nc}->($self);
8023        }
8024      
8025            redo A;
8026          } elsif ($self->{nc} == 0x003E) { # >
8027            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8028            
8029        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8030          $self->{line_prev} = $self->{line};
8031          $self->{column_prev} = $self->{column};
8032          $self->{column}++;
8033          $self->{nc}
8034              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8035        } else {
8036          $self->{set_nc}->($self);
8037        }
8038      
8039            return  ($self->{ct}); # ELEMENT
8040            redo A;
8041          } elsif ($self->{nc} == -1) {
8042            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8043            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8044            
8045        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8046          $self->{line_prev} = $self->{line};
8047          $self->{column_prev} = $self->{column};
8048          $self->{column}++;
8049          $self->{nc}
8050              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8051        } else {
8052          $self->{set_nc}->($self);
8053        }
8054      
8055            return  ($self->{ct}); # ELEMENT
8056            redo A;
8057          } else {
8058            $self->{ct}->{content}->[-1] .= chr $self->{nc}; # ELEMENT
8059            ## Stay in the state.
8060            
8061        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8062          $self->{line_prev} = $self->{line};
8063          $self->{column_prev} = $self->{column};
8064          $self->{column}++;
8065          $self->{nc}
8066              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8067        } else {
8068          $self->{set_nc}->($self);
8069        }
8070      
8071            redo A;
8072          }
8073        } elsif ($self->{state} == AFTER_CM_GROUP_OPEN_STATE) {
8074          if ($is_space->{$self->{nc}}) {
8075            ## Stay in the state.
8076            
8077        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8078          $self->{line_prev} = $self->{line};
8079          $self->{column_prev} = $self->{column};
8080          $self->{column}++;
8081          $self->{nc}
8082              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8083        } else {
8084          $self->{set_nc}->($self);
8085        }
8086      
8087            redo A;
8088          } elsif ($self->{nc} == 0x0028) { # (
8089            $self->{group_depth}++;
8090            push @{$self->{ct}->{content}}, chr $self->{nc};
8091            ## Stay in the state.
8092            
8093        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8094          $self->{line_prev} = $self->{line};
8095          $self->{column_prev} = $self->{column};
8096          $self->{column}++;
8097          $self->{nc}
8098              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8099        } else {
8100          $self->{set_nc}->($self);
8101        }
8102      
8103            redo A;
8104          } elsif ($self->{nc} == 0x007C or # |
8105                   $self->{nc} == 0x002C) { # ,
8106            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8107            ## Stay in the state.
8108            
8109        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8110          $self->{line_prev} = $self->{line};
8111          $self->{column_prev} = $self->{column};
8112          $self->{column}++;
8113          $self->{nc}
8114              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8115        } else {
8116          $self->{set_nc}->($self);
8117        }
8118      
8119            redo A;
8120          } elsif ($self->{nc} == 0x0029) { # )
8121            $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type
8122            push @{$self->{ct}->{content}}, chr $self->{nc};
8123            $self->{group_depth}--;
8124            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8125            
8126        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8127          $self->{line_prev} = $self->{line};
8128          $self->{column_prev} = $self->{column};
8129          $self->{column}++;
8130          $self->{nc}
8131              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8132        } else {
8133          $self->{set_nc}->($self);
8134        }
8135      
8136            redo A;
8137          } elsif ($self->{nc} == 0x003E) { # >
8138            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8139            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8140            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8141            
8142        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8143          $self->{line_prev} = $self->{line};
8144          $self->{column_prev} = $self->{column};
8145          $self->{column}++;
8146          $self->{nc}
8147              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8148        } else {
8149          $self->{set_nc}->($self);
8150        }
8151      
8152            return  ($self->{ct}); # ELEMENT
8153            redo A;
8154          } elsif ($self->{nc} == -1) {
8155            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8156            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8157            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8158            
8159        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8160          $self->{line_prev} = $self->{line};
8161          $self->{column_prev} = $self->{column};
8162          $self->{column}++;
8163          $self->{nc}
8164              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8165        } else {
8166          $self->{set_nc}->($self);
8167        }
8168      
8169            return  ($self->{ct}); # ELEMENT
8170            redo A;
8171          } else {
8172            push @{$self->{ct}->{content}}, chr $self->{nc};
8173            $self->{state} = CM_ELEMENT_NAME_STATE;
8174            
8175        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8176          $self->{line_prev} = $self->{line};
8177          $self->{column_prev} = $self->{column};
8178          $self->{column}++;
8179          $self->{nc}
8180              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8181        } else {
8182          $self->{set_nc}->($self);
8183        }
8184      
8185            redo A;
8186          }
8187        } elsif ($self->{state} == CM_ELEMENT_NAME_STATE) {
8188          if ($is_space->{$self->{nc}}) {
8189            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8190            
8191        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8192          $self->{line_prev} = $self->{line};
8193          $self->{column_prev} = $self->{column};
8194          $self->{column}++;
8195          $self->{nc}
8196              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8197        } else {
8198          $self->{set_nc}->($self);
8199        }
8200      
8201            redo A;
8202          } elsif ($self->{nc} == 0x002A or # *
8203                   $self->{nc} == 0x002B or # +
8204                   $self->{nc} == 0x003F) { # ?
8205            push @{$self->{ct}->{content}}, chr $self->{nc};
8206            $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8207            
8208        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8209          $self->{line_prev} = $self->{line};
8210          $self->{column_prev} = $self->{column};
8211          $self->{column}++;
8212          $self->{nc}
8213              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8214        } else {
8215          $self->{set_nc}->($self);
8216        }
8217      
8218            redo A;
8219          } elsif ($self->{nc} == 0x007C or # |
8220                   $self->{nc} == 0x002C) { # ,
8221            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8222            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8223            
8224        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8225          $self->{line_prev} = $self->{line};
8226          $self->{column_prev} = $self->{column};
8227          $self->{column}++;
8228          $self->{nc}
8229              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8230        } else {
8231          $self->{set_nc}->($self);
8232        }
8233      
8234            redo A;
8235          } elsif ($self->{nc} == 0x0029) { # )
8236            $self->{group_depth}--;
8237            push @{$self->{ct}->{content}}, chr $self->{nc};
8238            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8239            
8240        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8241          $self->{line_prev} = $self->{line};
8242          $self->{column_prev} = $self->{column};
8243          $self->{column}++;
8244          $self->{nc}
8245              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8246        } else {
8247          $self->{set_nc}->($self);
8248        }
8249      
8250            redo A;
8251          } elsif ($self->{nc} == 0x003E) { # >
8252            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8253            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8254            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8255            
8256        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8257          $self->{line_prev} = $self->{line};
8258          $self->{column_prev} = $self->{column};
8259          $self->{column}++;
8260          $self->{nc}
8261              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8262        } else {
8263          $self->{set_nc}->($self);
8264        }
8265      
8266            return  ($self->{ct}); # ELEMENT
8267            redo A;
8268          } elsif ($self->{nc} == -1) {
8269            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8270            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8271            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8272            
8273        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8274          $self->{line_prev} = $self->{line};
8275          $self->{column_prev} = $self->{column};
8276          $self->{column}++;
8277          $self->{nc}
8278              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8279        } else {
8280          $self->{set_nc}->($self);
8281        }
8282      
8283            return  ($self->{ct}); # ELEMENT
8284            redo A;
8285          } else {
8286            $self->{ct}->{content}->[-1] .= chr $self->{nc};
8287            ## Stay in the state.
8288            
8289        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8290          $self->{line_prev} = $self->{line};
8291          $self->{column_prev} = $self->{column};
8292          $self->{column}++;
8293          $self->{nc}
8294              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8295        } else {
8296          $self->{set_nc}->($self);
8297        }
8298      
8299            redo A;
8300          }
8301        } elsif ($self->{state} == AFTER_CM_ELEMENT_NAME_STATE) {
8302        if ($is_space->{$self->{nc}}) {        if ($is_space->{$self->{nc}}) {
8303          ## Stay in the state.          ## Stay in the state.
8304                    
# Line 7704  sub _get_next_token ($) { Line 8313  sub _get_next_token ($) {
8313      }      }
8314        
8315          redo A;          redo A;
8316          } elsif ($self->{nc} == 0x007C or # |
8317                   $self->{nc} == 0x002C) { # ,
8318            push @{$self->{ct}->{content}}, $self->{nc} == 0x007C ? ' | ' : ', ';
8319            $self->{state} = AFTER_CM_GROUP_OPEN_STATE;
8320            
8321        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8322          $self->{line_prev} = $self->{line};
8323          $self->{column_prev} = $self->{column};
8324          $self->{column}++;
8325          $self->{nc}
8326              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8327        } else {
8328          $self->{set_nc}->($self);
8329        }
8330      
8331            redo A;
8332          } elsif ($self->{nc} == 0x0029) { # )
8333            $self->{group_depth}--;
8334            push @{$self->{ct}->{content}}, chr $self->{nc};
8335            $self->{state} = AFTER_CM_GROUP_CLOSE_STATE;
8336            
8337        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8338          $self->{line_prev} = $self->{line};
8339          $self->{column_prev} = $self->{column};
8340          $self->{column}++;
8341          $self->{nc}
8342              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8343        } else {
8344          $self->{set_nc}->($self);
8345        }
8346      
8347            redo A;
8348        } elsif ($self->{nc} == 0x003E) { # >        } elsif ($self->{nc} == 0x003E) { # >
8349            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8350            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8351          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8352                    
8353      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7717  sub _get_next_token ($) { Line 8360  sub _get_next_token ($) {
8360        $self->{set_nc}->($self);        $self->{set_nc}->($self);
8361      }      }
8362        
8363          return  ($self->{ct}); # ENTITY          return  ($self->{ct}); # ELEMENT
8364          redo A;          redo A;
8365        } elsif ($self->{nc} == -1) {        } elsif ($self->{nc} == -1) {
8366          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8367            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8368          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8369                    
8370      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {      if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
# Line 7733  sub _get_next_token ($) { Line 8377  sub _get_next_token ($) {
8377        $self->{set_nc}->($self);        $self->{set_nc}->($self);
8378      }      }
8379        
8380          return  ($self->{ct}); # ENTITY          return  ($self->{ct}); # ELEMENT
8381          redo A;          redo A;
8382        } else {        } else {
8383          $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after notation name'); ## TODO: type          $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type
8384            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8385            $self->{state} = BOGUS_MD_STATE;
8386            
8387        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8388          $self->{line_prev} = $self->{line};
8389          $self->{column_prev} = $self->{column};
8390          $self->{column}++;
8391          $self->{nc}
8392              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8393        } else {
8394          $self->{set_nc}->($self);
8395        }
8396      
8397            redo A;
8398          }
8399        } elsif ($self->{state} == AFTER_CM_GROUP_CLOSE_STATE) {
8400          if ($is_space->{$self->{nc}}) {
8401            if ($self->{group_depth}) {
8402              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8403            } else {
8404              $self->{state} = AFTER_MD_DEF_STATE;
8405            }
8406            
8407        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8408          $self->{line_prev} = $self->{line};
8409          $self->{column_prev} = $self->{column};
8410          $self->{column}++;
8411          $self->{nc}
8412              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8413        } else {
8414          $self->{set_nc}->($self);
8415        }
8416      
8417            redo A;
8418          } elsif ($self->{nc} == 0x002A or # *
8419                   $self->{nc} == 0x002B or # +
8420                   $self->{nc} == 0x003F) { # ?
8421            push @{$self->{ct}->{content}}, chr $self->{nc};
8422            if ($self->{group_depth}) {
8423              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8424            } else {
8425              $self->{state} = AFTER_MD_DEF_STATE;
8426            }
8427            
8428        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8429          $self->{line_prev} = $self->{line};
8430          $self->{column_prev} = $self->{column};
8431          $self->{column}++;
8432          $self->{nc}
8433              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8434        } else {
8435          $self->{set_nc}->($self);
8436        }
8437      
8438            redo A;
8439          } elsif ($self->{nc} == 0x0029) { # )
8440            if ($self->{group_depth}) {
8441              $self->{group_depth}--;
8442              push @{$self->{ct}->{content}}, chr $self->{nc};
8443              ## Stay in the state.
8444              
8445        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8446          $self->{line_prev} = $self->{line};
8447          $self->{column_prev} = $self->{column};
8448          $self->{column}++;
8449          $self->{nc}
8450              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8451        } else {
8452          $self->{set_nc}->($self);
8453        }
8454      
8455              redo A;
8456            } else {
8457              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8458              $self->{state} = BOGUS_MD_STATE;
8459              ## Reconsume.
8460              redo A;
8461            }
8462          } elsif ($self->{nc} == 0x003E) { # >
8463            if ($self->{group_depth}) {
8464              $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type
8465              push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8466            }
8467            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8468            
8469        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8470          $self->{line_prev} = $self->{line};
8471          $self->{column_prev} = $self->{column};
8472          $self->{column}++;
8473          $self->{nc}
8474              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8475        } else {
8476          $self->{set_nc}->($self);
8477        }
8478      
8479            return  ($self->{ct}); # ELEMENT
8480            redo A;
8481          } elsif ($self->{nc} == -1) {
8482            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8483            push @{$self->{ct}->{content}}, (')') x $self->{group_depth};
8484            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8485            
8486        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8487          $self->{line_prev} = $self->{line};
8488          $self->{column_prev} = $self->{column};
8489          $self->{column}++;
8490          $self->{nc}
8491              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8492        } else {
8493          $self->{set_nc}->($self);
8494        }
8495      
8496            return  ($self->{ct}); # ELEMENT
8497            redo A;
8498          } else {
8499            if ($self->{group_depth}) {
8500              $self->{state} = AFTER_CM_ELEMENT_NAME_STATE;
8501            } else {
8502              $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8503              $self->{state} = BOGUS_MD_STATE;
8504            }
8505            ## Reconsume.
8506            redo A;
8507          }
8508        } elsif ($self->{state} == AFTER_MD_DEF_STATE) {
8509          if ($is_space->{$self->{nc}}) {
8510            ## Stay in the state.
8511            
8512        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8513          $self->{line_prev} = $self->{line};
8514          $self->{column_prev} = $self->{column};
8515          $self->{column}++;
8516          $self->{nc}
8517              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8518        } else {
8519          $self->{set_nc}->($self);
8520        }
8521      
8522            redo A;
8523          } elsif ($self->{nc} == 0x003E) { # >
8524            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8525            
8526        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8527          $self->{line_prev} = $self->{line};
8528          $self->{column_prev} = $self->{column};
8529          $self->{column}++;
8530          $self->{nc}
8531              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8532        } else {
8533          $self->{set_nc}->($self);
8534        }
8535      
8536            return  ($self->{ct}); # ENTITY/ELEMENT
8537            redo A;
8538          } elsif ($self->{nc} == -1) {
8539            $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type
8540            $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
8541            
8542        if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
8543          $self->{line_prev} = $self->{line};
8544          $self->{column_prev} = $self->{column};
8545          $self->{column}++;
8546          $self->{nc}
8547              = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
8548        } else {
8549          $self->{set_nc}->($self);
8550        }
8551      
8552            return  ($self->{ct}); # ENTITY/ELEMENT
8553            redo A;
8554          } else {
8555            $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type
8556          $self->{state} = BOGUS_MD_STATE;          $self->{state} = BOGUS_MD_STATE;
8557          ## Reconsume.          ## Reconsume.
8558          redo A;          redo A;
8559        }        }
   
   
8560      } elsif ($self->{state} == BOGUS_MD_STATE) {      } elsif ($self->{state} == BOGUS_MD_STATE) {
8561        if ($self->{nc} == 0x003E) { # >        if ($self->{nc} == 0x003E) { # >
8562          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;          $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;

Legend:
Removed from v.1.18  
changed lines
  Added in v.1.21

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24