/[suikacvs]/markup/html/whatpm/Whatpm/HTML.pm.src
Suika

Diff of /markup/html/whatpm/Whatpm/HTML.pm.src

Parent Directory | Revision Log | View Patch

revision 1.17 by wakaba, Sat Jun 23 08:15:21 2007 UTC revision 1.18 by wakaba, Sat Jun 23 12:21:01 2007 UTC
# Line 2  package Whatpm::HTML; Line 2  package Whatpm::HTML;
2  use strict;  use strict;
3  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};  our $VERSION=do{my @r=(q$Revision$=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5  ## This is an early version of an HTML parser.  ## ISSUE:
6    ## var doc = implementation.createDocument (null, null, null);
7    ## doc.write ('');
8    ## alert (doc.compatMode);
9    
10  my $permitted_slash_tag_name = {  my $permitted_slash_tag_name = {
11    base => 1,    base => 1,
# Line 155  sub _initialize_tokenizer ($) { Line 158  sub _initialize_tokenizer ($) {
158    # $self->{next_input_character}    # $self->{next_input_character}
159    !!!next-input-character;    !!!next-input-character;
160    $self->{token} = [];    $self->{token} = [];
161      # $self->{escape}
162  } # _initialize_tokenizer  } # _initialize_tokenizer
163    
164  ## A token has:  ## A token has:
165  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',  ##   ->{type} eq 'DOCTYPE', 'start tag', 'end tag', 'comment',
166  ##       'character', or 'end-of-file'  ##       'character', or 'end-of-file'
167  ##   ->{name} (DOCTYPE, start tag (tagname), end tag (tagname))  ##   ->{name} (DOCTYPE, start tag (tag name), end tag (tag name))
168      ## ISSUE: the spec need s/tagname/tag name/  ##   ->{public_identifier} (DOCTYPE)
169  ##   ->{error} == 1 or 0 (DOCTYPE)  ##   ->{system_identifier} (DOCTYPE)
170    ##   ->{correct} == 1 or 0 (DOCTYPE)
171  ##   ->{attributes} isa HASH (start tag, end tag)  ##   ->{attributes} isa HASH (start tag, end tag)
172  ##   ->{data} (comment, character)  ##   ->{data} (comment, character)
173    
 ## Macros  
 ##   Macros MUST be preceded by three EXCLAMATION MARKs.  
 ##   emit ($token)  
 ##     Emits the specified token.  
   
174  ## Emitted token MUST immediately be handled by the tree construction state.  ## Emitted token MUST immediately be handled by the tree construction state.
175    
176  ## Before each step, UA MAY check to see if either one of the scripts in  ## Before each step, UA MAY check to see if either one of the scripts in
# Line 1103  sub _get_next_token ($) { Line 1103  sub _get_next_token ($) {
1103          ## Stay in the state          ## Stay in the state
1104          !!!next-input-character;          !!!next-input-character;
1105          redo A;          redo A;
       } elsif (0x0061 <= $self->{next_input_character} and  
                $self->{next_input_character} <= 0x007A) { # a..z  
 ## ISSUE: "Set the token's name name to the" in the spec  
         $self->{current_token} = {type => 'DOCTYPE',  
                           name => chr ($self->{next_input_character} - 0x0020),  
                           error => 1};  
         $self->{state} = 'DOCTYPE name';  
         !!!next-input-character;  
         redo A;  
1106        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
1107          !!!parse-error (type => 'no DOCTYPE name');          !!!parse-error (type => 'no DOCTYPE name');
1108          $self->{state} = 'data';          $self->{state} = 'data';
1109          !!!next-input-character;          !!!next-input-character;
1110    
1111          !!!emit ({type => 'DOCTYPE', name => '', error => 1});          !!!emit ({type => 'DOCTYPE'}); # incorrect
1112    
1113          redo A;          redo A;
1114        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
# Line 1125  sub _get_next_token ($) { Line 1116  sub _get_next_token ($) {
1116          $self->{state} = 'data';          $self->{state} = 'data';
1117          ## reconsume          ## reconsume
1118    
1119          !!!emit ({type => 'DOCTYPE', name => '', error => 1});          !!!emit ({type => 'DOCTYPE'}); # incorrect
1120    
1121          redo A;          redo A;
1122        } else {        } else {
1123          $self->{current_token} = {type => 'DOCTYPE',          $self->{current_token}
1124                            name => chr ($self->{next_input_character}),              = {type => 'DOCTYPE',
1125                            error => 1};                 name => chr ($self->{next_input_character}),
1126                   correct => 1};
1127  ## ISSUE: "Set the token's name name to the" in the spec  ## ISSUE: "Set the token's name name to the" in the spec
1128          $self->{state} = 'DOCTYPE name';          $self->{state} = 'DOCTYPE name';
1129          !!!next-input-character;          !!!next-input-character;
1130          redo A;          redo A;
1131        }        }
1132      } elsif ($self->{state} eq 'DOCTYPE name') {      } elsif ($self->{state} eq 'DOCTYPE name') {
1133    ## ISSUE: Redundant "First," in the spec.
1134        if ($self->{next_input_character} == 0x0009 or # HT        if ($self->{next_input_character} == 0x0009 or # HT
1135            $self->{next_input_character} == 0x000A or # LF            $self->{next_input_character} == 0x000A or # LF
1136            $self->{next_input_character} == 0x000B or # VT            $self->{next_input_character} == 0x000B or # VT
1137            $self->{next_input_character} == 0x000C or # FF            $self->{next_input_character} == 0x000C or # FF
1138            $self->{next_input_character} == 0x0020) { # SP            $self->{next_input_character} == 0x0020) { # SP
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1139          $self->{state} = 'after DOCTYPE name';          $self->{state} = 'after DOCTYPE name';
1140          !!!next-input-character;          !!!next-input-character;
1141          redo A;          redo A;
1142        } elsif ($self->{next_input_character} == 0x003E) { # >        } elsif ($self->{next_input_character} == 0x003E) { # >
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1143          $self->{state} = 'data';          $self->{state} = 'data';
1144          !!!next-input-character;          !!!next-input-character;
1145    
# Line 1156  sub _get_next_token ($) { Line 1147  sub _get_next_token ($) {
1147          undef $self->{current_token};          undef $self->{current_token};
1148    
1149          redo A;          redo A;
       } elsif (0x0061 <= $self->{next_input_character} and  
                $self->{next_input_character} <= 0x007A) { # a..z  
         $self->{current_token}->{name} .= chr ($self->{next_input_character} - 0x0020); # DOCTYPE  
         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');  
         ## Stay in the state  
         !!!next-input-character;  
         redo A;  
1150        } elsif ($self->{next_input_character} == -1) {        } elsif ($self->{next_input_character} == -1) {
1151          !!!parse-error (type => 'unclosed DOCTYPE');          !!!parse-error (type => 'unclosed DOCTYPE');
         $self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML'); # DOCTYPE  
1152          $self->{state} = 'data';          $self->{state} = 'data';
1153          ## reconsume          ## reconsume
1154    
1155          !!!emit ($self->{current_token});          delete $self->{current_token}->{correct};
1156            !!!emit ($self->{current_token}); # DOCTYPE
1157          undef $self->{current_token};          undef $self->{current_token};
1158    
1159          redo A;          redo A;
1160        } else {        } else {
1161          $self->{current_token}->{name}          $self->{current_token}->{name}
1162            .= chr ($self->{next_input_character}); # DOCTYPE            .= chr ($self->{next_input_character}); # DOCTYPE
         #$self->{current_token}->{error} = ($self->{current_token}->{name} ne 'HTML');  
1163          ## Stay in the state          ## Stay in the state
1164          !!!next-input-character;          !!!next-input-character;
1165          redo A;          redo A;
# Line 1203  sub _get_next_token ($) { Line 1186  sub _get_next_token ($) {
1186          $self->{state} = 'data';          $self->{state} = 'data';
1187          ## reconsume          ## reconsume
1188    
1189            delete $self->{current_token}->{correct};
1190            !!!emit ($self->{current_token}); # DOCTYPE
1191            undef $self->{current_token};
1192    
1193            redo A;
1194          } elsif ($self->{next_input_character} == 0x0050 or # P
1195                   $self->{next_input_character} == 0x0070) { # p
1196            !!!next-input-character;
1197            if ($self->{next_input_character} == 0x0055 or # U
1198                $self->{next_input_character} == 0x0075) { # u
1199              !!!next-input-character;
1200              if ($self->{next_input_character} == 0x0042 or # B
1201                  $self->{next_input_character} == 0x0062) { # b
1202                !!!next-input-character;
1203                if ($self->{next_input_character} == 0x004C or # L
1204                    $self->{next_input_character} == 0x006C) { # l
1205                  !!!next-input-character;
1206                  if ($self->{next_input_character} == 0x0049 or # I
1207                      $self->{next_input_character} == 0x0069) { # i
1208                    !!!next-input-character;
1209                    if ($self->{next_input_character} == 0x0043 or # C
1210                        $self->{next_input_character} == 0x0063) { # c
1211                      $self->{state} = 'before DOCTYPE public identifier';
1212                      !!!next-input-character;
1213                      redo A;
1214                    }
1215                  }
1216                }
1217              }
1218            }
1219    
1220            #
1221          } elsif ($self->{next_input_character} == 0x0053 or # S
1222                   $self->{next_input_character} == 0x0073) { # s
1223            !!!next-input-character;
1224            if ($self->{next_input_character} == 0x0059 or # Y
1225                $self->{next_input_character} == 0x0079) { # y
1226              !!!next-input-character;
1227              if ($self->{next_input_character} == 0x0053 or # S
1228                  $self->{next_input_character} == 0x0073) { # s
1229                !!!next-input-character;
1230                if ($self->{next_input_character} == 0x0054 or # T
1231                    $self->{next_input_character} == 0x0074) { # t
1232                  !!!next-input-character;
1233                  if ($self->{next_input_character} == 0x0045 or # E
1234                      $self->{next_input_character} == 0x0065) { # e
1235                    !!!next-input-character;
1236                    if ($self->{next_input_character} == 0x004D or # M
1237                        $self->{next_input_character} == 0x006D) { # m
1238                      $self->{state} = 'before DOCTYPE system identifier';
1239                      !!!next-input-character;
1240                      redo A;
1241                    }
1242                  }
1243                }
1244              }
1245            }
1246    
1247            #
1248          } else {
1249            !!!next-input-character;
1250            #
1251          }
1252    
1253          !!!parse-error (type => 'string after DOCTYPE name');
1254          $self->{state} = 'bogus DOCTYPE';
1255          # next-input-character is already done
1256          redo A;
1257        } elsif ($self->{state} eq 'before DOCTYPE public identifier') {
1258          if ({
1259                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1260                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1261              }->{$self->{next_input_character}}) {
1262            ## Stay in the state
1263            !!!next-input-character;
1264            redo A;
1265          } elsif ($self->{next_input_character} eq 0x0022) { # "
1266            $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1267            $self->{state} = 'DOCTYPE public identifier (double-quoted)';
1268            !!!next-input-character;
1269            redo A;
1270          } elsif ($self->{next_input_character} eq 0x0027) { # '
1271            $self->{current_token}->{public_identifier} = ''; # DOCTYPE
1272            $self->{state} = 'DOCTYPE public identifier (single-quoted)';
1273            !!!next-input-character;
1274            redo A;
1275          } elsif ($self->{next_input_character} eq 0x003E) { # >
1276            !!!parse-error (type => 'no PUBLIC literal');
1277    
1278            $self->{state} = 'data';
1279            !!!next-input-character;
1280    
1281            delete $self->{current_token}->{correct};
1282            !!!emit ($self->{current_token}); # DOCTYPE
1283            undef $self->{current_token};
1284    
1285            redo A;
1286          } elsif ($self->{next_input_character} == -1) {
1287            !!!parse-error (type => 'unclosed DOCTYPE');
1288    
1289            $self->{state} = 'data';
1290            ## reconsume
1291    
1292            delete $self->{current_token}->{correct};
1293            !!!emit ($self->{current_token}); # DOCTYPE
1294            undef $self->{current_token};
1295    
1296            redo A;
1297          } else {
1298            !!!parse-error (type => 'string after PUBLIC');
1299            $self->{state} = 'bogus DOCTYPE';
1300            !!!next-input-character;
1301            redo A;
1302          }
1303        } elsif ($self->{state} eq 'DOCTYPE public identifier (double-quoted)') {
1304          if ($self->{next_input_character} == 0x0022) { # "
1305            $self->{state} = 'after DOCTYPE public identifier';
1306            !!!next-input-character;
1307            redo A;
1308          } elsif ($self->{next_input_character} == -1) {
1309            !!!parse-error (type => 'unclosed PUBLIC literal');
1310    
1311            $self->{state} = 'data';
1312            ## reconsume
1313    
1314            delete $self->{current_token}->{correct};
1315            !!!emit ($self->{current_token}); # DOCTYPE
1316            undef $self->{current_token};
1317    
1318            redo A;
1319          } else {
1320            $self->{current_token}->{public_identifier} # DOCTYPE
1321                .= chr $self->{next_input_character};
1322            ## Stay in the state
1323            !!!next-input-character;
1324            redo A;
1325          }
1326        } elsif ($self->{state} eq 'DOCTYPE public identifier (single-quoted)') {
1327          if ($self->{next_input_character} == 0x0027) { # '
1328            $self->{state} = 'after DOCTYPE public identifier';
1329            !!!next-input-character;
1330            redo A;
1331          } elsif ($self->{next_input_character} == -1) {
1332            !!!parse-error (type => 'unclosed PUBLIC literal');
1333    
1334            $self->{state} = 'data';
1335            ## reconsume
1336    
1337            delete $self->{current_token}->{correct};
1338            !!!emit ($self->{current_token}); # DOCTYPE
1339            undef $self->{current_token};
1340    
1341            redo A;
1342          } else {
1343            $self->{current_token}->{public_identifier} # DOCTYPE
1344                .= chr $self->{next_input_character};
1345            ## Stay in the state
1346            !!!next-input-character;
1347            redo A;
1348          }
1349        } elsif ($self->{state} eq 'after DOCTYPE public identifier') {
1350          if ({
1351                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1352                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1353              }->{$self->{next_input_character}}) {
1354            ## Stay in the state
1355            !!!next-input-character;
1356            redo A;
1357          } elsif ($self->{next_input_character} == 0x0022) { # "
1358            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1359            $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1360            !!!next-input-character;
1361            redo A;
1362          } elsif ($self->{next_input_character} == 0x0027) { # '
1363            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1364            $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1365            !!!next-input-character;
1366            redo A;
1367          } elsif ($self->{next_input_character} == 0x003E) { # >
1368            $self->{state} = 'data';
1369            !!!next-input-character;
1370    
1371            !!!emit ($self->{current_token}); # DOCTYPE
1372            undef $self->{current_token};
1373    
1374            redo A;
1375          } elsif ($self->{next_input_character} == -1) {
1376            !!!parse-error (type => 'unclosed DOCTYPE');
1377    
1378            $self->{state} = 'data';
1379            ## reconsume
1380    
1381            delete $self->{current_token}->{correct};
1382            !!!emit ($self->{current_token}); # DOCTYPE
1383            undef $self->{current_token};
1384    
1385            redo A;
1386          } else {
1387            !!!parse-error (type => 'string after PUBLIC literal');
1388            $self->{state} = 'bogus DOCTYPE';
1389            !!!next-input-character;
1390            redo A;
1391          }
1392        } elsif ($self->{state} eq 'before DOCTYPE system identifier') {
1393          if ({
1394                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1395                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1396              }->{$self->{next_input_character}}) {
1397            ## Stay in the state
1398            !!!next-input-character;
1399            redo A;
1400          } elsif ($self->{next_input_character} == 0x0022) { # "
1401            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1402            $self->{state} = 'DOCTYPE system identifier (double-quoted)';
1403            !!!next-input-character;
1404            redo A;
1405          } elsif ($self->{next_input_character} == 0x0027) { # '
1406            $self->{current_token}->{system_identifier} = ''; # DOCTYPE
1407            $self->{state} = 'DOCTYPE system identifier (single-quoted)';
1408            !!!next-input-character;
1409            redo A;
1410          } elsif ($self->{next_input_character} == 0x003E) { # >
1411            !!!parse-error (type => 'no SYSTEM literal');
1412            $self->{state} = 'data';
1413            !!!next-input-character;
1414    
1415            delete $self->{current_token}->{correct};
1416            !!!emit ($self->{current_token}); # DOCTYPE
1417            undef $self->{current_token};
1418    
1419            redo A;
1420          } elsif ($self->{next_input_character} == -1) {
1421            !!!parse-error (type => 'unclosed DOCTYPE');
1422    
1423            $self->{state} = 'data';
1424            ## reconsume
1425    
1426            delete $self->{current_token}->{correct};
1427            !!!emit ($self->{current_token}); # DOCTYPE
1428            undef $self->{current_token};
1429    
1430            redo A;
1431          } else {
1432            !!!parse-error (type => 'string after PUBLIC literal');
1433            $self->{state} = 'bogus DOCTYPE';
1434            !!!next-input-character;
1435            redo A;
1436          }
1437        } elsif ($self->{state} eq 'DOCTYPE system identifier (double-quoted)') {
1438          if ($self->{next_input_character} == 0x0022) { # "
1439            $self->{state} = 'after DOCTYPE system identifier';
1440            !!!next-input-character;
1441            redo A;
1442          } elsif ($self->{next_input_character} == -1) {
1443            !!!parse-error (type => 'unclosed SYSTEM literal');
1444    
1445            $self->{state} = 'data';
1446            ## reconsume
1447    
1448            delete $self->{current_token}->{correct};
1449            !!!emit ($self->{current_token}); # DOCTYPE
1450            undef $self->{current_token};
1451    
1452            redo A;
1453          } else {
1454            $self->{current_token}->{system_identifier} # DOCTYPE
1455                .= chr $self->{next_input_character};
1456            ## Stay in the state
1457            !!!next-input-character;
1458            redo A;
1459          }
1460        } elsif ($self->{state} eq 'DOCTYPE system identifier (single-quoted)') {
1461          if ($self->{next_input_character} == 0x0027) { # '
1462            $self->{state} = 'after DOCTYPE system identifier';
1463            !!!next-input-character;
1464            redo A;
1465          } elsif ($self->{next_input_character} == -1) {
1466            !!!parse-error (type => 'unclosed SYSTEM literal');
1467    
1468            $self->{state} = 'data';
1469            ## reconsume
1470    
1471            delete $self->{current_token}->{correct};
1472          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1473          undef $self->{current_token};          undef $self->{current_token};
1474    
1475          redo A;          redo A;
1476        } else {        } else {
1477          !!!parse-error (type => 'string after DOCTYPE name');          $self->{current_token}->{system_identifier} # DOCTYPE
1478          $self->{current_token}->{error} = 1; # DOCTYPE              .= chr $self->{next_input_character};
1479            ## Stay in the state
1480            !!!next-input-character;
1481            redo A;
1482          }
1483        } elsif ($self->{state} eq 'after DOCTYPE system identifier') {
1484          if ({
1485                0x0009 => 1, 0x000A => 1, 0x000B => 1, 0x000C => 1, 0x0020 => 1,
1486                #0x000D => 1, # HT, LF, VT, FF, SP, CR
1487              }->{$self->{next_input_character}}) {
1488            ## Stay in the state
1489            !!!next-input-character;
1490            redo A;
1491          } elsif ($self->{next_input_character} == 0x003E) { # >
1492            $self->{state} = 'data';
1493            !!!next-input-character;
1494    
1495            !!!emit ($self->{current_token}); # DOCTYPE
1496            undef $self->{current_token};
1497    
1498            redo A;
1499          } elsif ($self->{next_input_character} == -1) {
1500            !!!parse-error (type => 'unclosed DOCTYPE');
1501    
1502            $self->{state} = 'data';
1503            ## reconsume
1504    
1505            delete $self->{current_token}->{correct};
1506            !!!emit ($self->{current_token}); # DOCTYPE
1507            undef $self->{current_token};
1508    
1509            redo A;
1510          } else {
1511            !!!parse-error (type => 'string after SYSTEM literal');
1512          $self->{state} = 'bogus DOCTYPE';          $self->{state} = 'bogus DOCTYPE';
1513          !!!next-input-character;          !!!next-input-character;
1514          redo A;          redo A;
# Line 1219  sub _get_next_token ($) { Line 1518  sub _get_next_token ($) {
1518          $self->{state} = 'data';          $self->{state} = 'data';
1519          !!!next-input-character;          !!!next-input-character;
1520    
1521            delete $self->{current_token}->{correct};
1522          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1523          undef $self->{current_token};          undef $self->{current_token};
1524    
# Line 1228  sub _get_next_token ($) { Line 1528  sub _get_next_token ($) {
1528          $self->{state} = 'data';          $self->{state} = 'data';
1529          ## reconsume          ## reconsume
1530    
1531            delete $self->{current_token}->{correct};
1532          !!!emit ($self->{current_token}); # DOCTYPE          !!!emit ($self->{current_token}); # DOCTYPE
1533          undef $self->{current_token};          undef $self->{current_token};
1534    
# Line 1395  sub _initialize_tree_constructor ($) { Line 1696  sub _initialize_tree_constructor ($) {
1696    $self->{document}->strict_error_checking (0);    $self->{document}->strict_error_checking (0);
1697    ## TODO: Turn mutation events off # MUST    ## TODO: Turn mutation events off # MUST
1698    ## TODO: Turn loose Document option (manakai extension) on    ## TODO: Turn loose Document option (manakai extension) on
1699    ## TODO: Mark the Document as an HTML document # MUST    $self->{document}->manakai_is_html (1); # MUST
1700  } # _initialize_tree_constructor  } # _initialize_tree_constructor
1701    
1702  sub _terminate_tree_constructor ($) {  sub _terminate_tree_constructor ($) {
# Line 1435  sub _construct_tree ($) { Line 1736  sub _construct_tree ($) {
1736    
1737  sub _tree_construction_initial ($) {  sub _tree_construction_initial ($) {
1738    my $self = shift;    my $self = shift;
1739    B: {    INITIAL: {
1740        if ($token->{type} eq 'DOCTYPE') {      if ($token->{type} eq 'DOCTYPE') {
1741          if ($token->{error}) {        ## NOTE: Conformance checkers MAY, instead of reporting "not HTML5"
1742            ## ISSUE: Spec currently left this case undefined.        ## error, switch to a conformance checking mode for another
1743            !!!parse-error (type => 'bogus DOCTYPE');        ## language.
1744          }        my $doctype_name = $token->{name};
1745          my $doctype = $self->{document}->create_document_type_definition        $doctype_name = '' unless defined $doctype_name;
1746            ($token->{name});        $doctype_name =~ tr/a-z/A-Z/;
1747          $self->{document}->append_child ($doctype);        if (not defined $token->{name} or # <!DOCTYPE>
1748          #$phase = 'root element';            defined $token->{public_identifier} or
1749          !!!next-token;            defined $token->{system_identifier}) {
1750          #redo B;          !!!parse-error (type => 'not HTML5');
1751          return;        } elsif ($doctype_name ne 'HTML') {
1752        } elsif ({          ## ISSUE: ASCII case-insensitive? (in fact it does not matter)
1753                  comment => 1,          !!!parse-error (type => 'not HTML5');
1754                  'start tag' => 1,        }
1755                  'end tag' => 1,        
1756                  'end-of-file' => 1,        my $doctype = $self->{document}->create_document_type_definition
1757                 }->{$token->{type}}) {          ($token->{name}); ## ISSUE: If name is missing (e.g. <!DOCTYPE>)?
1758          ## ISSUE: Spec currently left this case undefined.        $doctype->public_id ($token->{public_identifier})
1759          !!!parse-error (type => 'missing DOCTYPE');            if defined $token->{public_identifier};
1760          #$phase = 'root element';        $doctype->system_id ($token->{system_identifier})
1761          ## reprocess            if defined $token->{system_identifier};
1762          #redo B;        ## NOTE: Other DocumentType attributes are null or empty lists.
1763          return;        ## ISSUE: internalSubset = null??
1764        } elsif ($token->{type} eq 'character') {        $self->{document}->append_child ($doctype);
1765          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) {        
1766            $self->{document}->manakai_append_text ($1);        if (not $token->{correct} or $doctype_name ne 'HTML') {
1767            ## ISSUE: DOM3 Core does not allow Document > Text          $self->{document}->manakai_compat_mode ('quirks');
1768            unless (length $token->{data}) {        } elsif (defined $token->{public_identifier}) {
1769              ## Stay in the phase          my $pubid = $token->{public_identifier};
1770              !!!next-token;          $pubid =~ tr/a-z/A-z/;
1771              redo B;          if ({
1772              "+//SILMARIL//DTD HTML PRO V0R11 19970101//EN" => 1,
1773              "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1774              "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//EN" => 1,
1775              "-//IETF//DTD HTML 2.0 LEVEL 1//EN" => 1,
1776              "-//IETF//DTD HTML 2.0 LEVEL 2//EN" => 1,
1777              "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//EN" => 1,
1778              "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//EN" => 1,
1779              "-//IETF//DTD HTML 2.0 STRICT//EN" => 1,
1780              "-//IETF//DTD HTML 2.0//EN" => 1,
1781              "-//IETF//DTD HTML 2.1E//EN" => 1,
1782              "-//IETF//DTD HTML 3.0//EN" => 1,
1783              "-//IETF//DTD HTML 3.0//EN//" => 1,
1784              "-//IETF//DTD HTML 3.2 FINAL//EN" => 1,
1785              "-//IETF//DTD HTML 3.2//EN" => 1,
1786              "-//IETF//DTD HTML 3//EN" => 1,
1787              "-//IETF//DTD HTML LEVEL 0//EN" => 1,
1788              "-//IETF//DTD HTML LEVEL 0//EN//2.0" => 1,
1789              "-//IETF//DTD HTML LEVEL 1//EN" => 1,
1790              "-//IETF//DTD HTML LEVEL 1//EN//2.0" => 1,
1791              "-//IETF//DTD HTML LEVEL 2//EN" => 1,
1792              "-//IETF//DTD HTML LEVEL 2//EN//2.0" => 1,
1793              "-//IETF//DTD HTML LEVEL 3//EN" => 1,
1794              "-//IETF//DTD HTML LEVEL 3//EN//3.0" => 1,
1795              "-//IETF//DTD HTML STRICT LEVEL 0//EN" => 1,
1796              "-//IETF//DTD HTML STRICT LEVEL 0//EN//2.0" => 1,
1797              "-//IETF//DTD HTML STRICT LEVEL 1//EN" => 1,
1798              "-//IETF//DTD HTML STRICT LEVEL 1//EN//2.0" => 1,
1799              "-//IETF//DTD HTML STRICT LEVEL 2//EN" => 1,
1800              "-//IETF//DTD HTML STRICT LEVEL 2//EN//2.0" => 1,
1801              "-//IETF//DTD HTML STRICT LEVEL 3//EN" => 1,
1802              "-//IETF//DTD HTML STRICT LEVEL 3//EN//3.0" => 1,
1803              "-//IETF//DTD HTML STRICT//EN" => 1,
1804              "-//IETF//DTD HTML STRICT//EN//2.0" => 1,
1805              "-//IETF//DTD HTML STRICT//EN//3.0" => 1,
1806              "-//IETF//DTD HTML//EN" => 1,
1807              "-//IETF//DTD HTML//EN//2.0" => 1,
1808              "-//IETF//DTD HTML//EN//3.0" => 1,
1809              "-//METRIUS//DTD METRIUS PRESENTATIONAL//EN" => 1,
1810              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//EN" => 1,
1811              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//EN" => 1,
1812              "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//EN" => 1,
1813              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//EN" => 1,
1814              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//EN" => 1,
1815              "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//EN" => 1,
1816              "-//NETSCAPE COMM. CORP.//DTD HTML//EN" => 1,
1817              "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//EN" => 1,
1818              "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//EN" => 1,
1819              "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//EN" => 1,
1820              "-//SPYGLASS//DTD HTML 2.0 EXTENDED//EN" => 1,
1821              "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//EN" => 1,
1822              "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//EN" => 1,
1823              "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//EN" => 1,
1824              "-//W3C//DTD HTML 3 1995-03-24//EN" => 1,
1825              "-//W3C//DTD HTML 3.2 DRAFT//EN" => 1,
1826              "-//W3C//DTD HTML 3.2 FINAL//EN" => 1,
1827              "-//W3C//DTD HTML 3.2//EN" => 1,
1828              "-//W3C//DTD HTML 3.2S DRAFT//EN" => 1,
1829              "-//W3C//DTD HTML 4.0 FRAMESET//EN" => 1,
1830              "-//W3C//DTD HTML 4.0 TRANSITIONAL//EN" => 1,
1831              "-//W3C//DTD HTML EXPERIMETNAL 19960712//EN" => 1,
1832              "-//W3C//DTD HTML EXPERIMENTAL 970421//EN" => 1,
1833              "-//W3C//DTD W3 HTML//EN" => 1,
1834              "-//W3O//DTD W3 HTML 3.0//EN" => 1,
1835              "-//W3O//DTD W3 HTML 3.0//EN//" => 1,
1836              "-//W3O//DTD W3 HTML STRICT 3.0//EN//" => 1,
1837              "-//WEBTECHS//DTD MOZILLA HTML 2.0//EN" => 1,
1838              "-//WEBTECHS//DTD MOZILLA HTML//EN" => 1,
1839              "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" => 1,
1840              "HTML" => 1,
1841            }->{$pubid}) {
1842              $self->{document}->manakai_compat_mode ('quirks');
1843            } elsif ($pubid eq "-//W3C//DTD HTML 4.01 FRAMESET//EN" or
1844                     $pubid eq "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN") {
1845              if (defined $token->{system_identifier}) {
1846                $self->{document}->manakai_compat_mode ('quirks');
1847              } else {
1848                $self->{document}->manakai_compat_mode ('limited quirks');
1849            }            }
1850            } elsif ($pubid eq "-//W3C//DTD XHTML 1.0 Frameset//EN" or
1851                     $pubid eq "-//W3C//DTD XHTML 1.0 Transitional//EN") {
1852              $self->{document}->manakai_compat_mode ('limited quirks');
1853            }
1854          }
1855          if (defined $token->{system_identifier}) {
1856            my $sysid = $token->{system_identifier};
1857            $sysid =~ tr/A-Z/a-z/;
1858            if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
1859              $self->{document}->manakai_compat_mode ('quirks');
1860          }          }
         ## ISSUE: Spec currently left this case undefined.  
         !!!parse-error (type => 'missing DOCTYPE');  
         #$phase = 'root element';  
         ## reprocess  
         #redo B;  
         return;  
       } else {  
         die "$0: $token->{type}: Unknown token";  
1861        }        }
1862      } # B        
1863          ## Go to the root element phase.
1864          !!!next-token;
1865          return;
1866        } elsif ({
1867                  'start tag' => 1,
1868                  'end tag' => 1,
1869                  'end-of-file' => 1,
1870                 }->{$token->{type}}) {
1871          !!!parse-error (type => 'no DOCTYPE');
1872          $self->{document}->manakai_compat_mode ('quirks');
1873          ## Go to the root element phase
1874          ## reprocess
1875          return;
1876        } elsif ($token->{type} eq 'character') {
1877          if ($token->{data} =~ s/^([\x09\x0A\x0B\x0C\x20]+)//) { # \x0D
1878            ## Ignore the token
1879            unless (length $token->{data}) {
1880              ## Stay in the phase
1881              !!!next-token;
1882              redo INITIAL;
1883            }
1884          }
1885    
1886          !!!parse-error (type => 'no DOCTYPE');
1887          $self->{document}->manakai_compat_mode ('quirks');
1888          ## Go to the root element phase
1889          ## reprocess
1890          return;
1891        } elsif ($token->{type} eq 'comment') {
1892          my $comment = $self->{document}->create_comment ($token->{data});
1893          $self->{document}->append_child ($comment);
1894          
1895          ## Stay in the phase.
1896          !!!next-token;
1897          redo INITIAL;
1898        } else {
1899          die "$0: $token->{type}: Unknown token";
1900        }
1901      } # INITIAL
1902  } # _tree_construction_initial  } # _tree_construction_initial
1903    
1904  sub _tree_construction_root_element ($) {  sub _tree_construction_root_element ($) {
# Line 4665  sub set_inner_html ($$$) { Line 5084  sub set_inner_html ($$$) {
5084      ## Step 1 # MUST      ## Step 1 # MUST
5085      my $this_doc = $node->owner_document;      my $this_doc = $node->owner_document;
5086      my $doc = $this_doc->implementation->create_document;      my $doc = $this_doc->implementation->create_document;
5087      ## TODO: Mark as HTML document      $doc->manakai_is_html (1);
5088      my $p = $class->new;      my $p = $class->new;
5089      $p->{document} = $doc;      $p->{document} = $doc;
5090    

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24