/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (hide annotations) (download) (as text)
Wed Oct 15 12:49:49 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.11: +249 -82 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	15 Oct 2008 12:49:07 -0000
	* XML-Parser.t: "xml/doctypes-2.dat" added.

	* tokenizer-test-1.test: Keyword case-sensitivity tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 12:49:41 -0000
	* doctypes-1.dat: A keyword case-sensitivity test added.

	* doctypes-2.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 12:46:53 -0000
	* Tokenizer.pm.src: $self->{s_kwd} for non-DATA_STATE states are
	renamed to $self->{kwd} to avoid conflicts.  Don't raise
	case-sensitivity error for the keyword "DOCTYPE" in HTML mode.
	Support for internal subsets (internal subset itself only; no
	declaration in them is supported yet).  Raise a parse error for
	non-uppercase keywords "PUBLIC" and "SYSTEM" in XML mode.  Raise a
	parse error if no system identifier is specified for a DOCTYPE
	declaration with a public identifier.  Don't close the DOCTYPE
	declaration by a ">" character in the system declaration in XML
	mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 12:48:30 -0000
	* Parser.pm.src: Typo fixed.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.12 our $VERSION=do{my @r=(q$Revision: 1.11 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34 wakaba 1.12 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
35    
36 wakaba 1.2 ## Token types
37    
38 wakaba 1.12 sub DOCTYPE_TOKEN () { 1 } ## XML5: No DOCTYPE token.
39 wakaba 1.2 sub COMMENT_TOKEN () { 2 }
40     sub START_TAG_TOKEN () { 3 }
41     sub END_TAG_TOKEN () { 4 }
42     sub END_OF_FILE_TOKEN () { 5 }
43     sub CHARACTER_TOKEN () { 6 }
44 wakaba 1.12 sub PI_TOKEN () { 7 } ## NOTE: XML only.
45     sub ABORT_TOKEN () { 8 } ## NOTE: For internal processing.
46    
47     ## XML5: XML5 has "empty tag token". In this implementation, it is
48     ## represented as a start tag token with $self->{self_closing} flag
49     ## set to true.
50    
51     ## XML5: XML5 has "short end tag token". In this implementation, it
52     ## is represented as an end tag token with |$token->{tag_name}| set
53     ## to an empty string.
54 wakaba 1.1
55     package Whatpm::HTML;
56    
57 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
58    
59 wakaba 1.1 ## Content model flags
60    
61     sub CM_ENTITY () { 0b001 } # & markup in data
62     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
63     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
64    
65     sub PLAINTEXT_CONTENT_MODEL () { 0 }
66     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
67     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
68     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
69    
70     ## Tokenizer states
71    
72     sub DATA_STATE () { 0 }
73     #sub ENTITY_DATA_STATE () { 1 }
74     sub TAG_OPEN_STATE () { 2 }
75     sub CLOSE_TAG_OPEN_STATE () { 3 }
76     sub TAG_NAME_STATE () { 4 }
77     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
78     sub ATTRIBUTE_NAME_STATE () { 6 }
79     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
80     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
81     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
82     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
83     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
84     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
85     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
86     sub COMMENT_START_STATE () { 14 }
87     sub COMMENT_START_DASH_STATE () { 15 }
88     sub COMMENT_STATE () { 16 }
89     sub COMMENT_END_STATE () { 17 }
90     sub COMMENT_END_DASH_STATE () { 18 }
91     sub BOGUS_COMMENT_STATE () { 19 }
92     sub DOCTYPE_STATE () { 20 }
93     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
94     sub DOCTYPE_NAME_STATE () { 22 }
95     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
96     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
97     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
98     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
99     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
100     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
101     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
102     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
103     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
104     sub BOGUS_DOCTYPE_STATE () { 32 }
105     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
106     sub SELF_CLOSING_START_TAG_STATE () { 34 }
107     sub CDATA_SECTION_STATE () { 35 }
108     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
109     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
110     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
111     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
112     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
113     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
114     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
115     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
116     ## NOTE: "Entity data state", "entity in attribute value state", and
117     ## "consume a character reference" algorithm are jointly implemented
118     ## using the following six states:
119     sub ENTITY_STATE () { 44 }
120     sub ENTITY_HASH_STATE () { 45 }
121     sub NCR_NUM_STATE () { 46 }
122     sub HEXREF_X_STATE () { 47 }
123     sub HEXREF_HEX_STATE () { 48 }
124     sub ENTITY_NAME_STATE () { 49 }
125     sub PCDATA_STATE () { 50 } # "data state" in the spec
126    
127 wakaba 1.12 ## XML-only states
128 wakaba 1.8 sub PI_STATE () { 51 }
129     sub PI_TARGET_STATE () { 52 }
130     sub PI_TARGET_AFTER_STATE () { 53 }
131     sub PI_DATA_STATE () { 54 }
132     sub PI_AFTER_STATE () { 55 }
133     sub PI_DATA_AFTER_STATE () { 56 }
134 wakaba 1.12 sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
135     sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
136 wakaba 1.8
137 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
138     ## list and descriptions)
139    
140     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
141     sub FOREIGN_EL () { 0b1_00000000000 }
142    
143     ## Character reference mappings
        ##
        ## |$charref_map| maps a code point number to the code point number
        ## that replaces it.  Code points that have no entry are not remapped
        ## by this table.
        ## NOTE(review): the lookup itself is outside this section; presumably
        ## it is consulted when a numeric character reference is resolved --
        ## confirm against the NCR / hexref states.
144    
145     my $charref_map = {
146     0x0D => 0x000A, # CARRIAGE RETURN -> LINE FEED
        ## 0x80..0x9F: replaced by the character Windows-1252 assigns to the
        ## same byte value; the five bytes Windows-1252 leaves unassigned
        ## (0x81, 0x8D, 0x8F, 0x90, 0x9D) become U+FFFD.
147     0x80 => 0x20AC,
148     0x81 => 0xFFFD,
149     0x82 => 0x201A,
150     0x83 => 0x0192,
151     0x84 => 0x201E,
152     0x85 => 0x2026,
153     0x86 => 0x2020,
154     0x87 => 0x2021,
155     0x88 => 0x02C6,
156     0x89 => 0x2030,
157     0x8A => 0x0160,
158     0x8B => 0x2039,
159     0x8C => 0x0152,
160     0x8D => 0xFFFD,
161     0x8E => 0x017D,
162     0x8F => 0xFFFD,
163     0x90 => 0xFFFD,
164     0x91 => 0x2018,
165     0x92 => 0x2019,
166     0x93 => 0x201C,
167     0x94 => 0x201D,
168     0x95 => 0x2022,
169     0x96 => 0x2013,
170     0x97 => 0x2014,
171     0x98 => 0x02DC,
172     0x99 => 0x2122,
173     0x9A => 0x0161,
174     0x9B => 0x203A,
175     0x9C => 0x0153,
176     0x9D => 0xFFFD,
177     0x9E => 0x017E,
178     0x9F => 0x0178,
179     }; # $charref_map
        ## Everything listed below is replaced by U+FFFD: C0 controls other
        ## than HT, LF, FF and CR, DELETE, the surrogate range, and the
        ## noncharacters at the end of each plane.
180     $charref_map->{$_} = 0xFFFD
181     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
182     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF -- Unicode noncharacters run through U+FDEF, but this range stops at U+FDDF.
183     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
184     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
185     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
186     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
187     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
188    
189     ## Implementations MUST act as if they used the state machine in the spec.
190    
191     sub _initialize_tokenizer ($) {
        ## Puts the tokenizer fields of |$self| (the only argument) back to
        ## their start-of-input values: data state, PCDATA content model, no
        ## current token or attribute, empty buffers and an empty token queue.
        ## Fields shown only as commented-out lines are not touched here;
        ## they are either set by |new| or initialized where first used.
192     my $self = shift;
193    
194     ## NOTE: Fields set by |new| constructor:
195     #$self->{level}
196     #$self->{set_nc}
197     #$self->{parse_error}
198 wakaba 1.3 #$self->{is_xml} (if XML)
199 wakaba 1.1
200     $self->{state} = DATA_STATE; # MUST
201 wakaba 1.12 $self->{s_kwd} = ''; # Data state keyword
202     #$self->{kwd} = ''; # State-dependent keyword; initialized when used
203 wakaba 1.1 #$self->{entity__value}; # initialized when used
204     #$self->{entity__match}; # initialized when used
205     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model: PCDATA
206     undef $self->{ct}; # current token
207     undef $self->{ca}; # current attribute
208     undef $self->{last_stag_name}; # last emitted start tag name
209     #$self->{prev_state}; # initialized when used
210     delete $self->{self_closing};
211     $self->{char_buffer} = '';
212     $self->{char_buffer_pos} = 0;
213     $self->{nc} = -1; # next input character; -1 is the value the states test for end of file
214     #$self->{next_nc}
        ## NOTE(review): |!!!next-input-character| is a macro expanded by the
        ## .src preprocessor; presumably it loads the first input character
        ## into |$self->{nc}| -- confirm against the macro definition.
215     !!!next-input-character;
216     $self->{token} = []; # queue of pending tokens; |_get_next_token| returns from it first
217     # $self->{escape}
218     } # _initialize_tokenizer
219    
220     ## A token has:
221     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
222 wakaba 1.11 ## CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
223 wakaba 1.1 ## ->{name} (DOCTYPE_TOKEN)
224     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
225 wakaba 1.11 ## ->{target} (PI_TOKEN)
226 wakaba 1.1 ## ->{pubid} (DOCTYPE_TOKEN)
227     ## ->{sysid} (DOCTYPE_TOKEN)
228     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
229     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
230     ## ->{name}
231     ## ->{value}
232     ## ->{has_reference} == 1 or 0
233 wakaba 1.11 ## ->{index}: Index of the attribute in a tag.
234     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
235 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
236 wakaba 1.11 ## ->{last_index} (START_TAG_TOKEN, END_TAG_TOKEN): Next attribute's index - 1.
237 wakaba 1.12 ## ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)
238    
239 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
240     ## A token's own |->{self_closing}| is used to save the value of
241     ## |$self->{self_closing}| while the token waits in |$self->{token}|.
242    
243     ## Emitted token MUST immediately be handled by the tree construction state.
244    
245     ## Before each step, UA MAY check to see if either one of the scripts in
246     ## "list of scripts that will execute as soon as possible" or the first
247     ## script in the "list of scripts that will execute asynchronously",
248     ## has completed loading. If one has, then it MUST be executed
249     ## and removed from the list.
250    
251     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
252     ## (This requirement was dropped from HTML5 spec, unfortunately.)
253    
254     my $is_space = {
        ## Set of code points the tokenizer treats as space characters; the
        ## states test it as |$is_space->{$self->{nc}}|.  Entries that are
        ## commented out are not members of the set.
255     0x0009 => 1, # CHARACTER TABULATION (HT)
256     0x000A => 1, # LINE FEED (LF)
257     #0x000B => 0, # LINE TABULATION (VT)
258 wakaba 1.12 0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
        ## NOTE(review): CR is disabled below; presumably CR never reaches
        ## the tokenizer because newlines are normalized upstream -- confirm.
259 wakaba 1.1 #0x000D => 1, # CARRIAGE RETURN (CR)
260     0x0020 => 1, # SPACE (SP)
261     };
262    
263     sub _get_next_token ($) {
264     my $self = shift;
265    
266     if ($self->{self_closing}) {
267     !!!parse-error (type => 'nestc', token => $self->{ct});
268     ## NOTE: The |self_closing| flag is only set by start tag token.
269     ## In addition, when a start tag token is emitted, it is always set to
270     ## |ct|.
271     delete $self->{self_closing};
272     }
273    
274     if (@{$self->{token}}) {
275     $self->{self_closing} = $self->{token}->[0]->{self_closing};
276     return shift @{$self->{token}};
277     }
278    
279     A: {
280     if ($self->{state} == PCDATA_STATE) {
281     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
282    
283     if ($self->{nc} == 0x0026) { # &
284     !!!cp (0.1);
285     ## NOTE: In the spec, the tokenizer is switched to the
286     ## "entity data state". In this implementation, the tokenizer
287     ## is switched to the |ENTITY_STATE|, which is an implementation
288     ## of the "consume a character reference" algorithm.
289     $self->{entity_add} = -1;
290     $self->{prev_state} = DATA_STATE;
291     $self->{state} = ENTITY_STATE;
292     !!!next-input-character;
293     redo A;
294     } elsif ($self->{nc} == 0x003C) { # <
295     !!!cp (0.2);
296     $self->{state} = TAG_OPEN_STATE;
297     !!!next-input-character;
298     redo A;
299     } elsif ($self->{nc} == -1) {
300     !!!cp (0.3);
301     !!!emit ({type => END_OF_FILE_TOKEN,
302     line => $self->{line}, column => $self->{column}});
303     last A; ## TODO: ok?
304     } else {
305     !!!cp (0.4);
306     #
307     }
308    
309     # Anything else
310     my $token = {type => CHARACTER_TOKEN,
311     data => chr $self->{nc},
312     line => $self->{line}, column => $self->{column},
313     };
314     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
315    
316     ## Stay in the state.
317     !!!next-input-character;
318     !!!emit ($token);
319     redo A;
320     } elsif ($self->{state} == DATA_STATE) {
321     $self->{s_kwd} = '' unless defined $self->{s_kwd};
322     if ($self->{nc} == 0x0026) { # &
323     $self->{s_kwd} = '';
324     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
325     not $self->{escape}) {
326     !!!cp (1);
327     ## NOTE: In the spec, the tokenizer is switched to the
328     ## "entity data state". In this implementation, the tokenizer
329     ## is switched to the |ENTITY_STATE|, which is an implementation
330     ## of the "consume a character reference" algorithm.
331     $self->{entity_add} = -1;
332     $self->{prev_state} = DATA_STATE;
333     $self->{state} = ENTITY_STATE;
334     !!!next-input-character;
335     redo A;
336     } else {
337     !!!cp (2);
338     #
339     }
340     } elsif ($self->{nc} == 0x002D) { # -
341     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
342 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
343 wakaba 1.1 !!!cp (3);
344     $self->{escape} = 1; # unless $self->{escape};
345     $self->{s_kwd} = '--';
346     #
347 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
348 wakaba 1.1 !!!cp (4);
349     $self->{s_kwd} = '--';
350     #
351 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
352     !!!cp (4.1);
353     $self->{s_kwd} .= '-';
354     #
355 wakaba 1.1 } else {
356     !!!cp (5);
357 wakaba 1.5 $self->{s_kwd} = '-';
358 wakaba 1.1 #
359     }
360     }
361    
362     #
363     } elsif ($self->{nc} == 0x0021) { # !
364     if (length $self->{s_kwd}) {
365     !!!cp (5.1);
366     $self->{s_kwd} .= '!';
367     #
368     } else {
369     !!!cp (5.2);
370     #$self->{s_kwd} = '';
371     #
372     }
373     #
374     } elsif ($self->{nc} == 0x003C) { # <
375     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
376     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
377     not $self->{escape})) {
378     !!!cp (6);
379     $self->{state} = TAG_OPEN_STATE;
380     !!!next-input-character;
381     redo A;
382     } else {
383     !!!cp (7);
384     $self->{s_kwd} = '';
385     #
386     }
387     } elsif ($self->{nc} == 0x003E) { # >
388     if ($self->{escape} and
389     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
390     if ($self->{s_kwd} eq '--') {
391     !!!cp (8);
392     delete $self->{escape};
393 wakaba 1.5 #
394 wakaba 1.1 } else {
395     !!!cp (9);
396 wakaba 1.5 #
397 wakaba 1.1 }
398 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
399     !!!cp (9.1);
400     !!!parse-error (type => 'unmatched mse', ## TODO: type
401     line => $self->{line_prev},
402     column => $self->{column_prev} - 1);
403     #
404 wakaba 1.1 } else {
405     !!!cp (10);
406 wakaba 1.5 #
407 wakaba 1.1 }
408    
409     $self->{s_kwd} = '';
410     #
411 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
412     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
413     !!!cp (10.1);
414     $self->{s_kwd} .= ']';
415     } elsif ($self->{s_kwd} eq ']]') {
416     !!!cp (10.2);
417     #
418     } else {
419     !!!cp (10.3);
420     $self->{s_kwd} = '';
421     }
422     #
423 wakaba 1.1 } elsif ($self->{nc} == -1) {
424     !!!cp (11);
425     $self->{s_kwd} = '';
426     !!!emit ({type => END_OF_FILE_TOKEN,
427     line => $self->{line}, column => $self->{column}});
428     last A; ## TODO: ok?
429     } else {
430     !!!cp (12);
431     $self->{s_kwd} = '';
432     #
433     }
434    
435     # Anything else
436     my $token = {type => CHARACTER_TOKEN,
437     data => chr $self->{nc},
438     line => $self->{line}, column => $self->{column},
439     };
440 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
441 wakaba 1.1 length $token->{data})) {
442     $self->{s_kwd} = '';
443     }
444    
445     ## Stay in the data state.
446 wakaba 1.5 if (not $self->{is_xml} and
447     $self->{content_model} == PCDATA_CONTENT_MODEL) {
448 wakaba 1.1 !!!cp (13);
449     $self->{state} = PCDATA_STATE;
450     } else {
451     !!!cp (14);
452     ## Stay in the state.
453     }
454     !!!next-input-character;
455     !!!emit ($token);
456     redo A;
457     } elsif ($self->{state} == TAG_OPEN_STATE) {
458 wakaba 1.10 ## XML5: "tag state".
459    
460 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
461     if ($self->{nc} == 0x002F) { # /
462     !!!cp (15);
463     !!!next-input-character;
464     $self->{state} = CLOSE_TAG_OPEN_STATE;
465     redo A;
466     } elsif ($self->{nc} == 0x0021) { # !
467     !!!cp (15.1);
468 wakaba 1.12 $self->{s_kwd} = $self->{escaped} ? '' : '<';
469 wakaba 1.1 #
470     } else {
471     !!!cp (16);
472 wakaba 1.12 $self->{s_kwd} = '';
473 wakaba 1.1 #
474     }
475    
476     ## reconsume
477     $self->{state} = DATA_STATE;
478     !!!emit ({type => CHARACTER_TOKEN, data => '<',
479     line => $self->{line_prev},
480     column => $self->{column_prev},
481     });
482     redo A;
483     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
484     if ($self->{nc} == 0x0021) { # !
485     !!!cp (17);
486     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
487     !!!next-input-character;
488     redo A;
489     } elsif ($self->{nc} == 0x002F) { # /
490     !!!cp (18);
491     $self->{state} = CLOSE_TAG_OPEN_STATE;
492     !!!next-input-character;
493     redo A;
494     } elsif (0x0041 <= $self->{nc} and
495     $self->{nc} <= 0x005A) { # A..Z
496     !!!cp (19);
497     $self->{ct}
498     = {type => START_TAG_TOKEN,
499 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
500 wakaba 1.1 line => $self->{line_prev},
501     column => $self->{column_prev}};
502     $self->{state} = TAG_NAME_STATE;
503     !!!next-input-character;
504     redo A;
505     } elsif (0x0061 <= $self->{nc} and
506     $self->{nc} <= 0x007A) { # a..z
507     !!!cp (20);
508     $self->{ct} = {type => START_TAG_TOKEN,
509     tag_name => chr ($self->{nc}),
510     line => $self->{line_prev},
511     column => $self->{column_prev}};
512     $self->{state} = TAG_NAME_STATE;
513     !!!next-input-character;
514     redo A;
515     } elsif ($self->{nc} == 0x003E) { # >
516     !!!cp (21);
517     !!!parse-error (type => 'empty start tag',
518     line => $self->{line_prev},
519     column => $self->{column_prev});
520     $self->{state} = DATA_STATE;
521 wakaba 1.5 $self->{s_kwd} = '';
522 wakaba 1.1 !!!next-input-character;
523    
524     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
525     line => $self->{line_prev},
526     column => $self->{column_prev},
527     });
528    
529     redo A;
530     } elsif ($self->{nc} == 0x003F) { # ?
531 wakaba 1.8 if ($self->{is_xml}) {
532     !!!cp (22.1);
533     $self->{state} = PI_STATE;
534     !!!next-input-character;
535     redo A;
536     } else {
537     !!!cp (22);
538     !!!parse-error (type => 'pio',
539     line => $self->{line_prev},
540     column => $self->{column_prev});
541     $self->{state} = BOGUS_COMMENT_STATE;
542     $self->{ct} = {type => COMMENT_TOKEN, data => '',
543     line => $self->{line_prev},
544     column => $self->{column_prev},
545     };
546     ## $self->{nc} is intentionally left as is
547     redo A;
548     }
549 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
550 wakaba 1.1 !!!cp (23);
551     !!!parse-error (type => 'bare stago',
552     line => $self->{line_prev},
553     column => $self->{column_prev});
554     $self->{state} = DATA_STATE;
555 wakaba 1.5 $self->{s_kwd} = '';
556 wakaba 1.1 ## reconsume
557    
558     !!!emit ({type => CHARACTER_TOKEN, data => '<',
559     line => $self->{line_prev},
560     column => $self->{column_prev},
561     });
562    
563     redo A;
564 wakaba 1.9 } else {
565     ## XML5: "<:" is a parse error.
566     !!!cp (23.1);
567     $self->{ct} = {type => START_TAG_TOKEN,
568     tag_name => chr ($self->{nc}),
569     line => $self->{line_prev},
570     column => $self->{column_prev}};
571     $self->{state} = TAG_NAME_STATE;
572     !!!next-input-character;
573     redo A;
574 wakaba 1.1 }
575     } else {
576     die "$0: $self->{content_model} in tag open";
577     }
578     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
579     ## NOTE: The "close tag open state" in the spec is implemented as
580     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
581    
582 wakaba 1.10 ## XML5: "end tag state".
583    
584 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
585     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
586     if (defined $self->{last_stag_name}) {
587     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
588 wakaba 1.12 $self->{kwd} = '';
589 wakaba 1.1 ## Reconsume.
590     redo A;
591     } else {
592     ## No start tag token has ever been emitted
593     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
594     !!!cp (28);
595     $self->{state} = DATA_STATE;
596 wakaba 1.5 $self->{s_kwd} = '';
597 wakaba 1.1 ## Reconsume.
598     !!!emit ({type => CHARACTER_TOKEN, data => '</',
599     line => $l, column => $c,
600     });
601     redo A;
602     }
603     }
604    
605     if (0x0041 <= $self->{nc} and
606     $self->{nc} <= 0x005A) { # A..Z
607     !!!cp (29);
608     $self->{ct}
609     = {type => END_TAG_TOKEN,
610 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
611 wakaba 1.1 line => $l, column => $c};
612     $self->{state} = TAG_NAME_STATE;
613     !!!next-input-character;
614     redo A;
615     } elsif (0x0061 <= $self->{nc} and
616     $self->{nc} <= 0x007A) { # a..z
617     !!!cp (30);
618     $self->{ct} = {type => END_TAG_TOKEN,
619     tag_name => chr ($self->{nc}),
620     line => $l, column => $c};
621     $self->{state} = TAG_NAME_STATE;
622     !!!next-input-character;
623     redo A;
624     } elsif ($self->{nc} == 0x003E) { # >
625     !!!parse-error (type => 'empty end tag',
626     line => $self->{line_prev}, ## "<" in "</>"
627     column => $self->{column_prev} - 1);
628     $self->{state} = DATA_STATE;
629 wakaba 1.5 $self->{s_kwd} = '';
630 wakaba 1.10 if ($self->{is_xml}) {
631     !!!cp (31);
632     ## XML5: No parse error.
633    
634     ## NOTE: This parser raises a parse error, since it supports
635     ## XML1, not XML5.
636    
637     ## NOTE: A short end tag token.
638     my $ct = {type => END_TAG_TOKEN,
639     tag_name => '',
640     line => $self->{line_prev},
641     column => $self->{column_prev} - 1,
642     };
643     !!!next-input-character;
644     !!!emit ($ct);
645     } else {
646     !!!cp (31.1);
647     !!!next-input-character;
648     }
649 wakaba 1.1 redo A;
650     } elsif ($self->{nc} == -1) {
651     !!!cp (32);
652     !!!parse-error (type => 'bare etago');
653 wakaba 1.5 $self->{s_kwd} = '';
654 wakaba 1.1 $self->{state} = DATA_STATE;
655     # reconsume
656    
657     !!!emit ({type => CHARACTER_TOKEN, data => '</',
658     line => $l, column => $c,
659     });
660    
661     redo A;
662 wakaba 1.10 } elsif (not $self->{is_xml} or
663     $is_space->{$self->{nc}}) {
664 wakaba 1.1 !!!cp (33);
665 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
666     line => $self->{line_prev}, # "<" of "</"
667     column => $self->{column_prev} - 1);
668 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
669     $self->{ct} = {type => COMMENT_TOKEN, data => '',
670     line => $self->{line_prev}, # "<" of "</"
671     column => $self->{column_prev} - 1,
672     };
673     ## NOTE: $self->{nc} is intentionally left as is.
674     ## Although the "anything else" case of the spec not explicitly
675     ## states that the next input character is to be reconsumed,
676     ## it will be included to the |data| of the comment token
677     ## generated from the bogus end tag, as defined in the
678     ## "bogus comment state" entry.
679     redo A;
680 wakaba 1.10 } else {
681     ## XML5: "</:" is a parse error.
682     !!!cp (30.1);
683     $self->{ct} = {type => END_TAG_TOKEN,
684     tag_name => chr ($self->{nc}),
685     line => $l, column => $c};
686     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
687     !!!next-input-character;
688     redo A;
689 wakaba 1.1 }
690     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
691 wakaba 1.12 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1;
692 wakaba 1.1 if (length $ch) {
693     my $CH = $ch;
694     $ch =~ tr/a-z/A-Z/;
695     my $nch = chr $self->{nc};
696     if ($nch eq $ch or $nch eq $CH) {
697     !!!cp (24);
698     ## Stay in the state.
699 wakaba 1.12 $self->{kwd} .= $nch;
700 wakaba 1.1 !!!next-input-character;
701     redo A;
702     } else {
703     !!!cp (25);
704     $self->{state} = DATA_STATE;
705 wakaba 1.5 $self->{s_kwd} = '';
706 wakaba 1.1 ## Reconsume.
707     !!!emit ({type => CHARACTER_TOKEN,
708 wakaba 1.12 data => '</' . $self->{kwd},
709 wakaba 1.1 line => $self->{line_prev},
710 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
711 wakaba 1.1 });
712     redo A;
713     }
714     } else { # after "<{tag-name}"
715     unless ($is_space->{$self->{nc}} or
716     {
717     0x003E => 1, # >
718     0x002F => 1, # /
719     -1 => 1, # EOF
720     }->{$self->{nc}}) {
721     !!!cp (26);
722     ## Reconsume.
723     $self->{state} = DATA_STATE;
724 wakaba 1.5 $self->{s_kwd} = '';
725 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
726 wakaba 1.12 data => '</' . $self->{kwd},
727 wakaba 1.1 line => $self->{line_prev},
728 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
729 wakaba 1.1 });
730     redo A;
731     } else {
732     !!!cp (27);
733     $self->{ct}
734     = {type => END_TAG_TOKEN,
735     tag_name => $self->{last_stag_name},
736     line => $self->{line_prev},
737 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd}};
738 wakaba 1.1 $self->{state} = TAG_NAME_STATE;
739     ## Reconsume.
740     redo A;
741     }
742     }
743     } elsif ($self->{state} == TAG_NAME_STATE) {
744     if ($is_space->{$self->{nc}}) {
745     !!!cp (34);
746     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
747     !!!next-input-character;
748     redo A;
749     } elsif ($self->{nc} == 0x003E) { # >
750     if ($self->{ct}->{type} == START_TAG_TOKEN) {
751     !!!cp (35);
752     $self->{last_stag_name} = $self->{ct}->{tag_name};
753     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
754     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
755     #if ($self->{ct}->{attributes}) {
756     # ## NOTE: This should never be reached.
757     # !!! cp (36);
758     # !!! parse-error (type => 'end tag attribute');
759     #} else {
760     !!!cp (37);
761     #}
762     } else {
763     die "$0: $self->{ct}->{type}: Unknown token type";
764     }
765     $self->{state} = DATA_STATE;
766 wakaba 1.5 $self->{s_kwd} = '';
767 wakaba 1.1 !!!next-input-character;
768    
769     !!!emit ($self->{ct}); # start tag or end tag
770    
771     redo A;
772     } elsif (0x0041 <= $self->{nc} and
773     $self->{nc} <= 0x005A) { # A..Z
774     !!!cp (38);
775 wakaba 1.4 $self->{ct}->{tag_name}
776     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
777 wakaba 1.1 # start tag or end tag
778     ## Stay in this state
779     !!!next-input-character;
780     redo A;
781     } elsif ($self->{nc} == -1) {
782     !!!parse-error (type => 'unclosed tag');
783     if ($self->{ct}->{type} == START_TAG_TOKEN) {
784     !!!cp (39);
785     $self->{last_stag_name} = $self->{ct}->{tag_name};
786     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
787     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
788     #if ($self->{ct}->{attributes}) {
789     # ## NOTE: This state should never be reached.
790     # !!! cp (40);
791     # !!! parse-error (type => 'end tag attribute');
792     #} else {
793     !!!cp (41);
794     #}
795     } else {
796     die "$0: $self->{ct}->{type}: Unknown token type";
797     }
798     $self->{state} = DATA_STATE;
799 wakaba 1.5 $self->{s_kwd} = '';
800 wakaba 1.1 # reconsume
801    
802     !!!emit ($self->{ct}); # start tag or end tag
803    
804     redo A;
805     } elsif ($self->{nc} == 0x002F) { # /
806     !!!cp (42);
807     $self->{state} = SELF_CLOSING_START_TAG_STATE;
808     !!!next-input-character;
809     redo A;
810     } else {
811     !!!cp (44);
812     $self->{ct}->{tag_name} .= chr $self->{nc};
813     # start tag or end tag
814     ## Stay in the state
815     !!!next-input-character;
816     redo A;
817     }
818     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
819 wakaba 1.11 ## XML5: "Tag attribute name before state".
820    
821 wakaba 1.1 if ($is_space->{$self->{nc}}) {
822     !!!cp (45);
823     ## Stay in the state
824     !!!next-input-character;
825     redo A;
826     } elsif ($self->{nc} == 0x003E) { # >
827     if ($self->{ct}->{type} == START_TAG_TOKEN) {
828     !!!cp (46);
829     $self->{last_stag_name} = $self->{ct}->{tag_name};
830     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
831     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
832     if ($self->{ct}->{attributes}) {
833     !!!cp (47);
834     !!!parse-error (type => 'end tag attribute');
835     } else {
836     !!!cp (48);
837     }
838     } else {
839     die "$0: $self->{ct}->{type}: Unknown token type";
840     }
841     $self->{state} = DATA_STATE;
842 wakaba 1.5 $self->{s_kwd} = '';
843 wakaba 1.1 !!!next-input-character;
844    
845     !!!emit ($self->{ct}); # start tag or end tag
846    
847     redo A;
848     } elsif (0x0041 <= $self->{nc} and
849     $self->{nc} <= 0x005A) { # A..Z
850     !!!cp (49);
851     $self->{ca}
852 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
853 wakaba 1.1 value => '',
854     line => $self->{line}, column => $self->{column}};
855     $self->{state} = ATTRIBUTE_NAME_STATE;
856     !!!next-input-character;
857     redo A;
858     } elsif ($self->{nc} == 0x002F) { # /
859     !!!cp (50);
860     $self->{state} = SELF_CLOSING_START_TAG_STATE;
861     !!!next-input-character;
862     redo A;
863     } elsif ($self->{nc} == -1) {
864     !!!parse-error (type => 'unclosed tag');
865     if ($self->{ct}->{type} == START_TAG_TOKEN) {
866     !!!cp (52);
867     $self->{last_stag_name} = $self->{ct}->{tag_name};
868     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
869     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
870     if ($self->{ct}->{attributes}) {
871     !!!cp (53);
872     !!!parse-error (type => 'end tag attribute');
873     } else {
874     !!!cp (54);
875     }
876     } else {
877     die "$0: $self->{ct}->{type}: Unknown token type";
878     }
879     $self->{state} = DATA_STATE;
880 wakaba 1.5 $self->{s_kwd} = '';
881 wakaba 1.1 # reconsume
882    
883     !!!emit ($self->{ct}); # start tag or end tag
884    
885     redo A;
886     } else {
887     if ({
888     0x0022 => 1, # "
889     0x0027 => 1, # '
890     0x003D => 1, # =
891     }->{$self->{nc}}) {
892     !!!cp (55);
893 wakaba 1.11 ## XML5: Not a parse error.
894 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
895     } else {
896     !!!cp (56);
897 wakaba 1.11 ## XML5: ":" raises a parse error and is ignored.
898 wakaba 1.1 }
899     $self->{ca}
900     = {name => chr ($self->{nc}),
901     value => '',
902     line => $self->{line}, column => $self->{column}};
903     $self->{state} = ATTRIBUTE_NAME_STATE;
904     !!!next-input-character;
905     redo A;
906     }
907     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
908 wakaba 1.11 ## XML5: "Tag attribute name state".
909    
910 wakaba 1.1 my $before_leave = sub {
911     if (exists $self->{ct}->{attributes} # start tag or end tag
912     ->{$self->{ca}->{name}}) { # MUST
913     !!!cp (57);
914     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
915     ## Discard $self->{ca} # MUST
916     } else {
917     !!!cp (58);
918     $self->{ct}->{attributes}->{$self->{ca}->{name}}
919     = $self->{ca};
920 wakaba 1.11 $self->{ca}->{index} = ++$self->{ct}->{last_index};
921 wakaba 1.1 }
922     }; # $before_leave
923    
924     if ($is_space->{$self->{nc}}) {
925     !!!cp (59);
926     $before_leave->();
927     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
928     !!!next-input-character;
929     redo A;
930     } elsif ($self->{nc} == 0x003D) { # =
931     !!!cp (60);
932     $before_leave->();
933     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
934     !!!next-input-character;
935     redo A;
936     } elsif ($self->{nc} == 0x003E) { # >
937 wakaba 1.11 if ($self->{is_xml}) {
938     !!!cp (60.1);
939     ## XML5: Not a parse error.
940     !!!parse-error (type => 'no attr value'); ## TODO: type
941     } else {
942     !!!cp (60.2);
943     }
944    
945 wakaba 1.1 $before_leave->();
946     if ($self->{ct}->{type} == START_TAG_TOKEN) {
947     !!!cp (61);
948     $self->{last_stag_name} = $self->{ct}->{tag_name};
949     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
950     !!!cp (62);
951     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
952     if ($self->{ct}->{attributes}) {
953     !!!parse-error (type => 'end tag attribute');
954     }
955     } else {
956     die "$0: $self->{ct}->{type}: Unknown token type";
957     }
958     $self->{state} = DATA_STATE;
959 wakaba 1.5 $self->{s_kwd} = '';
960 wakaba 1.1 !!!next-input-character;
961    
962     !!!emit ($self->{ct}); # start tag or end tag
963    
964     redo A;
965     } elsif (0x0041 <= $self->{nc} and
966     $self->{nc} <= 0x005A) { # A..Z
967     !!!cp (63);
968 wakaba 1.4 $self->{ca}->{name}
969     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
970 wakaba 1.1 ## Stay in the state
971     !!!next-input-character;
972     redo A;
973     } elsif ($self->{nc} == 0x002F) { # /
974 wakaba 1.11 if ($self->{is_xml}) {
975     !!!cp (64);
976     ## XML5: Not a parse error.
977     !!!parse-error (type => 'no attr value'); ## TODO: type
978     } else {
979     !!!cp (64.1);
980     }
981    
982 wakaba 1.1 $before_leave->();
983     $self->{state} = SELF_CLOSING_START_TAG_STATE;
984     !!!next-input-character;
985     redo A;
986     } elsif ($self->{nc} == -1) {
987     !!!parse-error (type => 'unclosed tag');
988     $before_leave->();
989     if ($self->{ct}->{type} == START_TAG_TOKEN) {
990     !!!cp (66);
991     $self->{last_stag_name} = $self->{ct}->{tag_name};
992     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
993     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
994     if ($self->{ct}->{attributes}) {
995     !!!cp (67);
996     !!!parse-error (type => 'end tag attribute');
997     } else {
998     ## NOTE: This state should never be reached.
999     !!!cp (68);
1000     }
1001     } else {
1002     die "$0: $self->{ct}->{type}: Unknown token type";
1003     }
1004     $self->{state} = DATA_STATE;
1005 wakaba 1.5 $self->{s_kwd} = '';
1006 wakaba 1.1 # reconsume
1007    
1008     !!!emit ($self->{ct}); # start tag or end tag
1009    
1010     redo A;
1011     } else {
1012     if ($self->{nc} == 0x0022 or # "
1013     $self->{nc} == 0x0027) { # '
1014     !!!cp (69);
1015 wakaba 1.11 ## XML5: Not a parse error.
1016 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1017     } else {
1018     !!!cp (70);
1019     }
1020     $self->{ca}->{name} .= chr ($self->{nc});
1021     ## Stay in the state
1022     !!!next-input-character;
1023     redo A;
1024     }
1025     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1026 wakaba 1.11 ## XML5: "Tag attribute name after state".
1027    
1028 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1029     !!!cp (71);
1030     ## Stay in the state
1031     !!!next-input-character;
1032     redo A;
1033     } elsif ($self->{nc} == 0x003D) { # =
1034     !!!cp (72);
1035     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1036     !!!next-input-character;
1037     redo A;
1038     } elsif ($self->{nc} == 0x003E) { # >
1039 wakaba 1.11 if ($self->{is_xml}) {
1040     !!!cp (72.1);
1041     ## XML5: Not a parse error.
1042     !!!parse-error (type => 'no attr value'); ## TODO: type
1043     } else {
1044     !!!cp (72.2);
1045     }
1046    
1047 wakaba 1.1 if ($self->{ct}->{type} == START_TAG_TOKEN) {
1048     !!!cp (73);
1049     $self->{last_stag_name} = $self->{ct}->{tag_name};
1050     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1051     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1052     if ($self->{ct}->{attributes}) {
1053     !!!cp (74);
1054     !!!parse-error (type => 'end tag attribute');
1055     } else {
1056     ## NOTE: This state should never be reached.
1057     !!!cp (75);
1058     }
1059     } else {
1060     die "$0: $self->{ct}->{type}: Unknown token type";
1061     }
1062     $self->{state} = DATA_STATE;
1063 wakaba 1.5 $self->{s_kwd} = '';
1064 wakaba 1.1 !!!next-input-character;
1065    
1066     !!!emit ($self->{ct}); # start tag or end tag
1067    
1068     redo A;
1069     } elsif (0x0041 <= $self->{nc} and
1070     $self->{nc} <= 0x005A) { # A..Z
1071     !!!cp (76);
1072     $self->{ca}
1073 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1074 wakaba 1.1 value => '',
1075     line => $self->{line}, column => $self->{column}};
1076     $self->{state} = ATTRIBUTE_NAME_STATE;
1077     !!!next-input-character;
1078     redo A;
1079     } elsif ($self->{nc} == 0x002F) { # /
1080 wakaba 1.11 if ($self->{is_xml}) {
1081     !!!cp (77);
1082     ## XML5: Not a parse error.
1083     !!!parse-error (type => 'no attr value'); ## TODO: type
1084     } else {
1085     !!!cp (77.1);
1086     }
1087    
1088 wakaba 1.1 $self->{state} = SELF_CLOSING_START_TAG_STATE;
1089     !!!next-input-character;
1090     redo A;
1091     } elsif ($self->{nc} == -1) {
1092     !!!parse-error (type => 'unclosed tag');
1093     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1094     !!!cp (79);
1095     $self->{last_stag_name} = $self->{ct}->{tag_name};
1096     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1097     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1098     if ($self->{ct}->{attributes}) {
1099     !!!cp (80);
1100     !!!parse-error (type => 'end tag attribute');
1101     } else {
1102     ## NOTE: This state should never be reached.
1103     !!!cp (81);
1104     }
1105     } else {
1106     die "$0: $self->{ct}->{type}: Unknown token type";
1107     }
1108 wakaba 1.5 $self->{s_kwd} = '';
1109 wakaba 1.1 $self->{state} = DATA_STATE;
1110     # reconsume
1111    
1112     !!!emit ($self->{ct}); # start tag or end tag
1113    
1114     redo A;
1115     } else {
1116 wakaba 1.11 if ($self->{is_xml}) {
1117     !!!cp (78.1);
1118     ## XML5: Not a parse error.
1119     !!!parse-error (type => 'no attr value'); ## TODO: type
1120     } else {
1121     !!!cp (78.2);
1122     }
1123    
1124 wakaba 1.1 if ($self->{nc} == 0x0022 or # "
1125     $self->{nc} == 0x0027) { # '
1126     !!!cp (78);
1127 wakaba 1.11 ## XML5: Not a parse error.
1128 wakaba 1.1 !!!parse-error (type => 'bad attribute name');
1129     } else {
1130     !!!cp (82);
1131     }
1132     $self->{ca}
1133     = {name => chr ($self->{nc}),
1134     value => '',
1135     line => $self->{line}, column => $self->{column}};
1136     $self->{state} = ATTRIBUTE_NAME_STATE;
1137     !!!next-input-character;
1138     redo A;
1139     }
1140     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1141 wakaba 1.11 ## XML5: "Tag attribute value before state".
1142    
1143 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1144     !!!cp (83);
1145     ## Stay in the state
1146     !!!next-input-character;
1147     redo A;
1148     } elsif ($self->{nc} == 0x0022) { # "
1149     !!!cp (84);
1150     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1151     !!!next-input-character;
1152     redo A;
1153     } elsif ($self->{nc} == 0x0026) { # &
1154     !!!cp (85);
1155     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1156     ## reconsume
1157     redo A;
1158     } elsif ($self->{nc} == 0x0027) { # '
1159     !!!cp (86);
1160     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1161     !!!next-input-character;
1162     redo A;
1163     } elsif ($self->{nc} == 0x003E) { # >
1164     !!!parse-error (type => 'empty unquoted attribute value');
1165     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1166     !!!cp (87);
1167     $self->{last_stag_name} = $self->{ct}->{tag_name};
1168     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1169     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1170     if ($self->{ct}->{attributes}) {
1171     !!!cp (88);
1172     !!!parse-error (type => 'end tag attribute');
1173     } else {
1174     ## NOTE: This state should never be reached.
1175     !!!cp (89);
1176     }
1177     } else {
1178     die "$0: $self->{ct}->{type}: Unknown token type";
1179     }
1180     $self->{state} = DATA_STATE;
1181 wakaba 1.5 $self->{s_kwd} = '';
1182 wakaba 1.1 !!!next-input-character;
1183    
1184     !!!emit ($self->{ct}); # start tag or end tag
1185    
1186     redo A;
1187     } elsif ($self->{nc} == -1) {
1188     !!!parse-error (type => 'unclosed tag');
1189     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1190     !!!cp (90);
1191     $self->{last_stag_name} = $self->{ct}->{tag_name};
1192     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1193     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1194     if ($self->{ct}->{attributes}) {
1195     !!!cp (91);
1196     !!!parse-error (type => 'end tag attribute');
1197     } else {
1198     ## NOTE: This state should never be reached.
1199     !!!cp (92);
1200     }
1201     } else {
1202     die "$0: $self->{ct}->{type}: Unknown token type";
1203     }
1204     $self->{state} = DATA_STATE;
1205 wakaba 1.5 $self->{s_kwd} = '';
1206 wakaba 1.1 ## reconsume
1207    
1208     !!!emit ($self->{ct}); # start tag or end tag
1209    
1210     redo A;
1211     } else {
1212     if ($self->{nc} == 0x003D) { # =
1213     !!!cp (93);
1214 wakaba 1.11 ## XML5: Not a parse error.
1215 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1216 wakaba 1.11 } elsif ($self->{is_xml}) {
1217     !!!cp (93.1);
1218     ## XML5: No parse error.
1219     !!!parse-error (type => 'unquoted attr value'); ## TODO
1220 wakaba 1.1 } else {
1221     !!!cp (94);
1222     }
1223     $self->{ca}->{value} .= chr ($self->{nc});
1224     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1225     !!!next-input-character;
1226     redo A;
1227     }
1228     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1229 wakaba 1.11 ## XML5: "Tag attribute value double quoted state".
1230    
1231 wakaba 1.1 if ($self->{nc} == 0x0022) { # "
1232     !!!cp (95);
1233 wakaba 1.11 ## XML5: "Tag attribute name before state".
1234 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1235     !!!next-input-character;
1236     redo A;
1237     } elsif ($self->{nc} == 0x0026) { # &
1238     !!!cp (96);
1239 wakaba 1.11 ## XML5: Not defined yet.
1240    
1241 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1242     ## "entity in attribute value state". In this implementation, the
1243     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1244     ## implementation of the "consume a character reference" algorithm.
1245     $self->{prev_state} = $self->{state};
1246     $self->{entity_add} = 0x0022; # "
1247     $self->{state} = ENTITY_STATE;
1248     !!!next-input-character;
1249     redo A;
1250     } elsif ($self->{nc} == -1) {
1251     !!!parse-error (type => 'unclosed attribute value');
1252     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1253     !!!cp (97);
1254     $self->{last_stag_name} = $self->{ct}->{tag_name};
1255     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1256     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1257     if ($self->{ct}->{attributes}) {
1258     !!!cp (98);
1259     !!!parse-error (type => 'end tag attribute');
1260     } else {
1261     ## NOTE: This state should never be reached.
1262     !!!cp (99);
1263     }
1264     } else {
1265     die "$0: $self->{ct}->{type}: Unknown token type";
1266     }
1267     $self->{state} = DATA_STATE;
1268 wakaba 1.5 $self->{s_kwd} = '';
1269 wakaba 1.1 ## reconsume
1270    
1271     !!!emit ($self->{ct}); # start tag or end tag
1272    
1273     redo A;
1274     } else {
1275 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1276     !!!cp (100);
1277     ## XML5: Not a parse error.
1278     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1279     } else {
1280     !!!cp (100.1);
1281     }
1282 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1283     $self->{read_until}->($self->{ca}->{value},
1284 wakaba 1.11 q["&<],
1285 wakaba 1.1 length $self->{ca}->{value});
1286    
1287     ## Stay in the state
1288     !!!next-input-character;
1289     redo A;
1290     }
1291     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1292 wakaba 1.11 ## XML5: "Tag attribute value single quoted state".
1293    
1294 wakaba 1.1 if ($self->{nc} == 0x0027) { # '
1295     !!!cp (101);
1296 wakaba 1.11 ## XML5: "Before attribute name state" (sic).
1297 wakaba 1.1 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1298     !!!next-input-character;
1299     redo A;
1300     } elsif ($self->{nc} == 0x0026) { # &
1301     !!!cp (102);
1302 wakaba 1.11 ## XML5: Not defined yet.
1303    
1304 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1305     ## "entity in attribute value state". In this implementation, the
1306     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1307     ## implementation of the "consume a character reference" algorithm.
1308     $self->{entity_add} = 0x0027; # '
1309     $self->{prev_state} = $self->{state};
1310     $self->{state} = ENTITY_STATE;
1311     !!!next-input-character;
1312     redo A;
1313     } elsif ($self->{nc} == -1) {
1314     !!!parse-error (type => 'unclosed attribute value');
1315     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1316     !!!cp (103);
1317     $self->{last_stag_name} = $self->{ct}->{tag_name};
1318     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1319     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1320     if ($self->{ct}->{attributes}) {
1321     !!!cp (104);
1322     !!!parse-error (type => 'end tag attribute');
1323     } else {
1324     ## NOTE: This state should never be reached.
1325     !!!cp (105);
1326     }
1327     } else {
1328     die "$0: $self->{ct}->{type}: Unknown token type";
1329     }
1330     $self->{state} = DATA_STATE;
1331 wakaba 1.5 $self->{s_kwd} = '';
1332 wakaba 1.1 ## reconsume
1333    
1334     !!!emit ($self->{ct}); # start tag or end tag
1335    
1336     redo A;
1337     } else {
1338 wakaba 1.11 if ($self->{is_xml} and $self->{nc} == 0x003C) { # <
1339     !!!cp (106);
1340     ## XML5: Not a parse error.
1341     !!!parse-error (type => 'lt in attr value'); ## TODO: type
1342     } else {
1343     !!!cp (106.1);
1344     }
1345 wakaba 1.1 $self->{ca}->{value} .= chr ($self->{nc});
1346     $self->{read_until}->($self->{ca}->{value},
1347 wakaba 1.11 q['&<],
1348 wakaba 1.1 length $self->{ca}->{value});
1349    
1350     ## Stay in the state
1351     !!!next-input-character;
1352     redo A;
1353     }
1354     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1355 wakaba 1.11 ## XML5: "Tag attribute value unquoted state".
1356    
1357 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1358     !!!cp (107);
1359 wakaba 1.11 ## XML5: "Tag attribute name before state".
1360 wakaba 1.1 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1361     !!!next-input-character;
1362     redo A;
1363     } elsif ($self->{nc} == 0x0026) { # &
1364     !!!cp (108);
1365 wakaba 1.11
1366     ## XML5: Not defined yet.
1367    
1368 wakaba 1.1 ## NOTE: In the spec, the tokenizer is switched to the
1369     ## "entity in attribute value state". In this implementation, the
1370     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1371     ## implementation of the "consume a character reference" algorithm.
1372     $self->{entity_add} = -1;
1373     $self->{prev_state} = $self->{state};
1374     $self->{state} = ENTITY_STATE;
1375     !!!next-input-character;
1376     redo A;
1377     } elsif ($self->{nc} == 0x003E) { # >
1378     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1379     !!!cp (109);
1380     $self->{last_stag_name} = $self->{ct}->{tag_name};
1381     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1382     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1383     if ($self->{ct}->{attributes}) {
1384     !!!cp (110);
1385     !!!parse-error (type => 'end tag attribute');
1386     } else {
1387     ## NOTE: This state should never be reached.
1388     !!!cp (111);
1389     }
1390     } else {
1391     die "$0: $self->{ct}->{type}: Unknown token type";
1392     }
1393     $self->{state} = DATA_STATE;
1394 wakaba 1.5 $self->{s_kwd} = '';
1395 wakaba 1.1 !!!next-input-character;
1396    
1397     !!!emit ($self->{ct}); # start tag or end tag
1398    
1399     redo A;
1400     } elsif ($self->{nc} == -1) {
1401     !!!parse-error (type => 'unclosed tag');
1402     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1403     !!!cp (112);
1404     $self->{last_stag_name} = $self->{ct}->{tag_name};
1405     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1406     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1407     if ($self->{ct}->{attributes}) {
1408     !!!cp (113);
1409     !!!parse-error (type => 'end tag attribute');
1410     } else {
1411     ## NOTE: This state should never be reached.
1412     !!!cp (114);
1413     }
1414     } else {
1415     die "$0: $self->{ct}->{type}: Unknown token type";
1416     }
1417     $self->{state} = DATA_STATE;
1418 wakaba 1.5 $self->{s_kwd} = '';
1419 wakaba 1.1 ## reconsume
1420    
1421     !!!emit ($self->{ct}); # start tag or end tag
1422    
1423     redo A;
1424     } else {
1425     if ({
1426     0x0022 => 1, # "
1427     0x0027 => 1, # '
1428     0x003D => 1, # =
1429     }->{$self->{nc}}) {
1430     !!!cp (115);
1431 wakaba 1.11 ## XML5: Not a parse error.
1432 wakaba 1.1 !!!parse-error (type => 'bad attribute value');
1433     } else {
1434     !!!cp (116);
1435     }
1436     $self->{ca}->{value} .= chr ($self->{nc});
1437     $self->{read_until}->($self->{ca}->{value},
1438     q["'=& >],
1439     length $self->{ca}->{value});
1440    
1441     ## Stay in the state
1442     !!!next-input-character;
1443     redo A;
1444     }
1445     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1446     if ($is_space->{$self->{nc}}) {
1447     !!!cp (118);
1448     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1449     !!!next-input-character;
1450     redo A;
1451     } elsif ($self->{nc} == 0x003E) { # >
1452     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1453     !!!cp (119);
1454     $self->{last_stag_name} = $self->{ct}->{tag_name};
1455     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1456     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1457     if ($self->{ct}->{attributes}) {
1458     !!!cp (120);
1459     !!!parse-error (type => 'end tag attribute');
1460     } else {
1461     ## NOTE: This state should never be reached.
1462     !!!cp (121);
1463     }
1464     } else {
1465     die "$0: $self->{ct}->{type}: Unknown token type";
1466     }
1467     $self->{state} = DATA_STATE;
1468 wakaba 1.5 $self->{s_kwd} = '';
1469 wakaba 1.1 !!!next-input-character;
1470    
1471     !!!emit ($self->{ct}); # start tag or end tag
1472    
1473     redo A;
1474     } elsif ($self->{nc} == 0x002F) { # /
1475     !!!cp (122);
1476     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1477     !!!next-input-character;
1478     redo A;
1479     } elsif ($self->{nc} == -1) {
1480     !!!parse-error (type => 'unclosed tag');
1481     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1482     !!!cp (122.3);
1483     $self->{last_stag_name} = $self->{ct}->{tag_name};
1484     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1485     if ($self->{ct}->{attributes}) {
1486     !!!cp (122.1);
1487     !!!parse-error (type => 'end tag attribute');
1488     } else {
1489     ## NOTE: This state should never be reached.
1490     !!!cp (122.2);
1491     }
1492     } else {
1493     die "$0: $self->{ct}->{type}: Unknown token type";
1494     }
1495     $self->{state} = DATA_STATE;
1496 wakaba 1.5 $self->{s_kwd} = '';
1497 wakaba 1.1 ## Reconsume.
1498     !!!emit ($self->{ct}); # start tag or end tag
1499     redo A;
1500     } else {
1501     !!!cp ('124.1');
1502     !!!parse-error (type => 'no space between attributes');
1503     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1504     ## reconsume
1505     redo A;
1506     }
1507     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1508 wakaba 1.11 ## XML5: "Empty tag state".
1509    
1510 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
1511     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1512     !!!cp ('124.2');
1513     !!!parse-error (type => 'nestc', token => $self->{ct});
1514     ## TODO: Different type than slash in start tag
1515     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1516     if ($self->{ct}->{attributes}) {
1517     !!!cp ('124.4');
1518     !!!parse-error (type => 'end tag attribute');
1519     } else {
1520     !!!cp ('124.5');
1521     }
1522     ## TODO: Test |<title></title/>|
1523     } else {
1524     !!!cp ('124.3');
1525     $self->{self_closing} = 1;
1526     }
1527    
1528     $self->{state} = DATA_STATE;
1529 wakaba 1.5 $self->{s_kwd} = '';
1530 wakaba 1.1 !!!next-input-character;
1531    
1532     !!!emit ($self->{ct}); # start tag or end tag
1533    
1534     redo A;
1535     } elsif ($self->{nc} == -1) {
1536     !!!parse-error (type => 'unclosed tag');
1537     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1538     !!!cp (124.7);
1539     $self->{last_stag_name} = $self->{ct}->{tag_name};
1540     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1541     if ($self->{ct}->{attributes}) {
1542     !!!cp (124.5);
1543     !!!parse-error (type => 'end tag attribute');
1544     } else {
1545     ## NOTE: This state should never be reached.
1546     !!!cp (124.6);
1547     }
1548     } else {
1549     die "$0: $self->{ct}->{type}: Unknown token type";
1550     }
1551 wakaba 1.11 ## XML5: "Tag attribute name before state".
1552 wakaba 1.1 $self->{state} = DATA_STATE;
1553 wakaba 1.5 $self->{s_kwd} = '';
1554 wakaba 1.1 ## Reconsume.
1555     !!!emit ($self->{ct}); # start tag or end tag
1556     redo A;
1557     } else {
1558     !!!cp ('124.4');
1559     !!!parse-error (type => 'nestc');
1560     ## TODO: This error type is wrong.
1561     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1562     ## Reconsume.
1563     redo A;
1564     }
1565     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1566     ## (only happens in the PCDATA state)
1567    
1568     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1569     ## consumes characters on a one-by-one basis.
1570    
1571     if ($self->{nc} == 0x003E) { # >
1572     !!!cp (124);
1573     $self->{state} = DATA_STATE;
1574 wakaba 1.5 $self->{s_kwd} = '';
1575 wakaba 1.1 !!!next-input-character;
1576    
1577     !!!emit ($self->{ct}); # comment
1578     redo A;
1579     } elsif ($self->{nc} == -1) {
1580     !!!cp (125);
1581     $self->{state} = DATA_STATE;
1582 wakaba 1.5 $self->{s_kwd} = '';
1583 wakaba 1.1 ## reconsume
1584    
1585     !!!emit ($self->{ct}); # comment
1586     redo A;
1587     } else {
1588     !!!cp (126);
1589     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1590     $self->{read_until}->($self->{ct}->{data},
1591     q[>],
1592     length $self->{ct}->{data});
1593    
1594     ## Stay in the state.
1595     !!!next-input-character;
1596     redo A;
1597     }
1598     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1599     ## (only happens in the PCDATA state)
1600    
1601     if ($self->{nc} == 0x002D) { # -
1602     !!!cp (133);
1603     $self->{state} = MD_HYPHEN_STATE;
1604     !!!next-input-character;
1605     redo A;
1606     } elsif ($self->{nc} == 0x0044 or # D
1607     $self->{nc} == 0x0064) { # d
1608     ## ASCII case-insensitive.
1609     !!!cp (130);
1610     $self->{state} = MD_DOCTYPE_STATE;
1611 wakaba 1.12 $self->{kwd} = chr $self->{nc};
1612 wakaba 1.1 !!!next-input-character;
1613     redo A;
1614 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1615     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1616     $self->{is_xml}) and
1617 wakaba 1.1 $self->{nc} == 0x005B) { # [
1618     !!!cp (135.4);
1619     $self->{state} = MD_CDATA_STATE;
1620 wakaba 1.12 $self->{kwd} = '[';
1621 wakaba 1.1 !!!next-input-character;
1622     redo A;
1623     } else {
1624     !!!cp (136);
1625     }
1626    
1627     !!!parse-error (type => 'bogus comment',
1628     line => $self->{line_prev},
1629     column => $self->{column_prev} - 1);
1630     ## Reconsume.
1631     $self->{state} = BOGUS_COMMENT_STATE;
1632     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1633     line => $self->{line_prev},
1634     column => $self->{column_prev} - 1,
1635     };
1636     redo A;
1637     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1638     if ($self->{nc} == 0x002D) { # -
1639     !!!cp (127);
1640     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1641     line => $self->{line_prev},
1642     column => $self->{column_prev} - 2,
1643     };
1644 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1645 wakaba 1.1 !!!next-input-character;
1646     redo A;
1647     } else {
1648     !!!cp (128);
1649     !!!parse-error (type => 'bogus comment',
1650     line => $self->{line_prev},
1651     column => $self->{column_prev} - 2);
1652     $self->{state} = BOGUS_COMMENT_STATE;
1653     ## Reconsume.
1654     $self->{ct} = {type => COMMENT_TOKEN,
1655     data => '-',
1656     line => $self->{line_prev},
1657     column => $self->{column_prev} - 2,
1658     };
1659     redo A;
1660     }
1661     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1662     ## ASCII case-insensitive.
1663     if ($self->{nc} == [
1664     undef,
1665     0x004F, # O
1666     0x0043, # C
1667     0x0054, # T
1668     0x0059, # Y
1669     0x0050, # P
1670 wakaba 1.12 ]->[length $self->{kwd}] or
1671 wakaba 1.1 $self->{nc} == [
1672     undef,
1673     0x006F, # o
1674     0x0063, # c
1675     0x0074, # t
1676     0x0079, # y
1677     0x0070, # p
1678 wakaba 1.12 ]->[length $self->{kwd}]) {
1679 wakaba 1.1 !!!cp (131);
1680     ## Stay in the state.
1681 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1682 wakaba 1.1 !!!next-input-character;
1683     redo A;
1684 wakaba 1.12 } elsif ((length $self->{kwd}) == 6 and
1685 wakaba 1.1 ($self->{nc} == 0x0045 or # E
1686     $self->{nc} == 0x0065)) { # e
1687 wakaba 1.12 if ($self->{is_xml} and
1688     ($self->{kwd} ne 'DOCTYP' or $self->{nc} == 0x0065)) {
1689 wakaba 1.10 !!!cp (129);
1690     ## XML5: case-sensitive.
1691     !!!parse-error (type => 'lowercase keyword', ## TODO
1692     text => 'DOCTYPE',
1693     line => $self->{line_prev},
1694     column => $self->{column_prev} - 5);
1695     } else {
1696     !!!cp (129.1);
1697     }
1698 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1699     $self->{ct} = {type => DOCTYPE_TOKEN,
1700     quirks => 1,
1701     line => $self->{line_prev},
1702     column => $self->{column_prev} - 7,
1703     };
1704     !!!next-input-character;
1705     redo A;
1706     } else {
1707     !!!cp (132);
1708     !!!parse-error (type => 'bogus comment',
1709     line => $self->{line_prev},
1710 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1711 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1712     ## Reconsume.
1713     $self->{ct} = {type => COMMENT_TOKEN,
1714 wakaba 1.12 data => $self->{kwd},
1715 wakaba 1.1 line => $self->{line_prev},
1716 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1717 wakaba 1.1 };
1718     redo A;
1719     }
1720     } elsif ($self->{state} == MD_CDATA_STATE) {
1721     if ($self->{nc} == {
1722     '[' => 0x0043, # C
1723     '[C' => 0x0044, # D
1724     '[CD' => 0x0041, # A
1725     '[CDA' => 0x0054, # T
1726     '[CDAT' => 0x0041, # A
1727 wakaba 1.12 }->{$self->{kwd}}) {
1728 wakaba 1.1 !!!cp (135.1);
1729     ## Stay in the state.
1730 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
1731 wakaba 1.1 !!!next-input-character;
1732     redo A;
1733 wakaba 1.12 } elsif ($self->{kwd} eq '[CDATA' and
1734 wakaba 1.1 $self->{nc} == 0x005B) { # [
1735 wakaba 1.6 if ($self->{is_xml} and
1736     not $self->{tainted} and
1737     @{$self->{open_elements} or []} == 0) {
1738 wakaba 1.8 !!!cp (135.2);
1739 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1740     line => $self->{line_prev},
1741     column => $self->{column_prev} - 7);
1742     $self->{tainted} = 1;
1743 wakaba 1.8 } else {
1744     !!!cp (135.21);
1745 wakaba 1.6 }
1746    
1747 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1748     data => '',
1749     line => $self->{line_prev},
1750     column => $self->{column_prev} - 7};
1751     $self->{state} = CDATA_SECTION_STATE;
1752     !!!next-input-character;
1753     redo A;
1754     } else {
1755     !!!cp (135.3);
1756     !!!parse-error (type => 'bogus comment',
1757     line => $self->{line_prev},
1758 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd});
1759 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
1760     ## Reconsume.
1761     $self->{ct} = {type => COMMENT_TOKEN,
1762 wakaba 1.12 data => $self->{kwd},
1763 wakaba 1.1 line => $self->{line_prev},
1764 wakaba 1.12 column => $self->{column_prev} - 1 - length $self->{kwd},
1765 wakaba 1.1 };
1766     redo A;
1767     }
1768     } elsif ($self->{state} == COMMENT_START_STATE) {
1769     if ($self->{nc} == 0x002D) { # -
1770     !!!cp (137);
1771     $self->{state} = COMMENT_START_DASH_STATE;
1772     !!!next-input-character;
1773     redo A;
1774     } elsif ($self->{nc} == 0x003E) { # >
1775     !!!cp (138);
1776     !!!parse-error (type => 'bogus comment');
1777     $self->{state} = DATA_STATE;
1778 wakaba 1.5 $self->{s_kwd} = '';
1779 wakaba 1.1 !!!next-input-character;
1780    
1781     !!!emit ($self->{ct}); # comment
1782    
1783     redo A;
1784     } elsif ($self->{nc} == -1) {
1785     !!!cp (139);
1786     !!!parse-error (type => 'unclosed comment');
1787     $self->{state} = DATA_STATE;
1788 wakaba 1.5 $self->{s_kwd} = '';
1789 wakaba 1.1 ## reconsume
1790    
1791     !!!emit ($self->{ct}); # comment
1792    
1793     redo A;
1794     } else {
1795     !!!cp (140);
1796     $self->{ct}->{data} # comment
1797     .= chr ($self->{nc});
1798     $self->{state} = COMMENT_STATE;
1799     !!!next-input-character;
1800     redo A;
1801     }
1802     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1803     if ($self->{nc} == 0x002D) { # -
1804     !!!cp (141);
1805     $self->{state} = COMMENT_END_STATE;
1806     !!!next-input-character;
1807     redo A;
1808     } elsif ($self->{nc} == 0x003E) { # >
1809     !!!cp (142);
1810     !!!parse-error (type => 'bogus comment');
1811     $self->{state} = DATA_STATE;
1812 wakaba 1.5 $self->{s_kwd} = '';
1813 wakaba 1.1 !!!next-input-character;
1814    
1815     !!!emit ($self->{ct}); # comment
1816    
1817     redo A;
1818     } elsif ($self->{nc} == -1) {
1819     !!!cp (143);
1820     !!!parse-error (type => 'unclosed comment');
1821     $self->{state} = DATA_STATE;
1822 wakaba 1.5 $self->{s_kwd} = '';
1823 wakaba 1.1 ## reconsume
1824    
1825     !!!emit ($self->{ct}); # comment
1826    
1827     redo A;
1828     } else {
1829     !!!cp (144);
1830     $self->{ct}->{data} # comment
1831     .= '-' . chr ($self->{nc});
1832     $self->{state} = COMMENT_STATE;
1833     !!!next-input-character;
1834     redo A;
1835     }
1836     } elsif ($self->{state} == COMMENT_STATE) {
1837     if ($self->{nc} == 0x002D) { # -
1838     !!!cp (145);
1839     $self->{state} = COMMENT_END_DASH_STATE;
1840     !!!next-input-character;
1841     redo A;
1842     } elsif ($self->{nc} == -1) {
1843     !!!cp (146);
1844     !!!parse-error (type => 'unclosed comment');
1845     $self->{state} = DATA_STATE;
1846 wakaba 1.5 $self->{s_kwd} = '';
1847 wakaba 1.1 ## reconsume
1848    
1849     !!!emit ($self->{ct}); # comment
1850    
1851     redo A;
1852     } else {
1853     !!!cp (147);
1854     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1855     $self->{read_until}->($self->{ct}->{data},
1856     q[-],
1857     length $self->{ct}->{data});
1858    
1859     ## Stay in the state
1860     !!!next-input-character;
1861     redo A;
1862     }
1863     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1864 wakaba 1.10 ## XML5: "comment dash state".
1865    
1866 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1867     !!!cp (148);
1868     $self->{state} = COMMENT_END_STATE;
1869     !!!next-input-character;
1870     redo A;
1871     } elsif ($self->{nc} == -1) {
1872     !!!cp (149);
1873     !!!parse-error (type => 'unclosed comment');
1874     $self->{state} = DATA_STATE;
1875 wakaba 1.5 $self->{s_kwd} = '';
1876 wakaba 1.1 ## reconsume
1877    
1878     !!!emit ($self->{ct}); # comment
1879    
1880     redo A;
1881     } else {
1882     !!!cp (150);
1883     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1884     $self->{state} = COMMENT_STATE;
1885     !!!next-input-character;
1886     redo A;
1887     }
1888     } elsif ($self->{state} == COMMENT_END_STATE) {
1889     if ($self->{nc} == 0x003E) { # >
1890     !!!cp (151);
1891     $self->{state} = DATA_STATE;
1892 wakaba 1.5 $self->{s_kwd} = '';
1893 wakaba 1.1 !!!next-input-character;
1894    
1895     !!!emit ($self->{ct}); # comment
1896    
1897     redo A;
1898     } elsif ($self->{nc} == 0x002D) { # -
1899     !!!cp (152);
1900 wakaba 1.10 ## XML5: Not a parse error.
1901 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1902     line => $self->{line_prev},
1903     column => $self->{column_prev});
1904     $self->{ct}->{data} .= '-'; # comment
1905     ## Stay in the state
1906     !!!next-input-character;
1907     redo A;
1908     } elsif ($self->{nc} == -1) {
1909     !!!cp (153);
1910     !!!parse-error (type => 'unclosed comment');
1911     $self->{state} = DATA_STATE;
1912 wakaba 1.5 $self->{s_kwd} = '';
1913 wakaba 1.1 ## reconsume
1914    
1915     !!!emit ($self->{ct}); # comment
1916    
1917     redo A;
1918     } else {
1919     !!!cp (154);
1920 wakaba 1.10 ## XML5: Not a parse error.
1921 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1922     line => $self->{line_prev},
1923     column => $self->{column_prev});
1924     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1925     $self->{state} = COMMENT_STATE;
1926     !!!next-input-character;
1927     redo A;
1928     }
1929     } elsif ($self->{state} == DOCTYPE_STATE) {
1930     if ($is_space->{$self->{nc}}) {
1931     !!!cp (155);
1932     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1933     !!!next-input-character;
1934     redo A;
1935     } else {
1936     !!!cp (156);
1937 wakaba 1.12 ## XML5: Unless EOF, switch to the bogus comment state.
1938 wakaba 1.1 !!!parse-error (type => 'no space before DOCTYPE name');
1939     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1940     ## reconsume
1941     redo A;
1942     }
1943     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1944 wakaba 1.12 ## XML5: "DOCTYPE root name before state".
1945    
1946 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1947     !!!cp (157);
1948     ## Stay in the state
1949     !!!next-input-character;
1950     redo A;
1951     } elsif ($self->{nc} == 0x003E) { # >
1952     !!!cp (158);
1953 wakaba 1.12 ## XML5: No parse error.
1954 wakaba 1.1 !!!parse-error (type => 'no DOCTYPE name');
1955     $self->{state} = DATA_STATE;
1956 wakaba 1.5 $self->{s_kwd} = '';
1957 wakaba 1.1 !!!next-input-character;
1958    
1959     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1960    
1961     redo A;
1962     } elsif ($self->{nc} == -1) {
1963     !!!cp (159);
1964     !!!parse-error (type => 'no DOCTYPE name');
1965     $self->{state} = DATA_STATE;
1966 wakaba 1.5 $self->{s_kwd} = '';
1967 wakaba 1.1 ## reconsume
1968    
1969     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1970    
1971     redo A;
1972 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
1973     !!!cp (159.1);
1974     !!!parse-error (type => 'no DOCTYPE name');
1975     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
1976     !!!next-input-character;
1977     redo A;
1978 wakaba 1.1 } else {
1979     !!!cp (160);
1980     $self->{ct}->{name} = chr $self->{nc};
1981     delete $self->{ct}->{quirks};
1982     $self->{state} = DOCTYPE_NAME_STATE;
1983     !!!next-input-character;
1984     redo A;
1985     }
1986     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1987 wakaba 1.12 ## XML5: "DOCTYPE root name state".
1988    
1989     ## ISSUE: Redundant "First," in the spec.
1990    
1991 wakaba 1.1 if ($is_space->{$self->{nc}}) {
1992     !!!cp (161);
1993     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1994     !!!next-input-character;
1995     redo A;
1996     } elsif ($self->{nc} == 0x003E) { # >
1997     !!!cp (162);
1998     $self->{state} = DATA_STATE;
1999 wakaba 1.5 $self->{s_kwd} = '';
2000 wakaba 1.1 !!!next-input-character;
2001    
2002     !!!emit ($self->{ct}); # DOCTYPE
2003    
2004     redo A;
2005     } elsif ($self->{nc} == -1) {
2006     !!!cp (163);
2007     !!!parse-error (type => 'unclosed DOCTYPE');
2008     $self->{state} = DATA_STATE;
2009 wakaba 1.5 $self->{s_kwd} = '';
2010 wakaba 1.1 ## reconsume
2011    
2012     $self->{ct}->{quirks} = 1;
2013     !!!emit ($self->{ct}); # DOCTYPE
2014    
2015     redo A;
2016 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2017     !!!cp (163.1);
2018     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2019     !!!next-input-character;
2020     redo A;
2021 wakaba 1.1 } else {
2022     !!!cp (164);
2023     $self->{ct}->{name}
2024     .= chr ($self->{nc}); # DOCTYPE
2025     ## Stay in the state
2026     !!!next-input-character;
2027     redo A;
2028     }
2029     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2030 wakaba 1.12 ## XML5: Corresponding to XML5's "DOCTYPE root name after
2031     ## state", but implemented differently.
2032    
2033 wakaba 1.1 if ($is_space->{$self->{nc}}) {
2034     !!!cp (165);
2035     ## Stay in the state
2036     !!!next-input-character;
2037     redo A;
2038     } elsif ($self->{nc} == 0x003E) { # >
2039     !!!cp (166);
2040     $self->{state} = DATA_STATE;
2041 wakaba 1.5 $self->{s_kwd} = '';
2042 wakaba 1.1 !!!next-input-character;
2043    
2044     !!!emit ($self->{ct}); # DOCTYPE
2045    
2046     redo A;
2047     } elsif ($self->{nc} == -1) {
2048     !!!cp (167);
2049     !!!parse-error (type => 'unclosed DOCTYPE');
2050     $self->{state} = DATA_STATE;
2051 wakaba 1.5 $self->{s_kwd} = '';
2052 wakaba 1.1 ## reconsume
2053    
2054     $self->{ct}->{quirks} = 1;
2055     !!!emit ($self->{ct}); # DOCTYPE
2056    
2057     redo A;
2058     } elsif ($self->{nc} == 0x0050 or # P
2059     $self->{nc} == 0x0070) { # p
2060 wakaba 1.12 !!!cp (167.1);
2061 wakaba 1.1 $self->{state} = PUBLIC_STATE;
2062 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2063 wakaba 1.1 !!!next-input-character;
2064     redo A;
2065     } elsif ($self->{nc} == 0x0053 or # S
2066     $self->{nc} == 0x0073) { # s
2067 wakaba 1.12 !!!cp (167.2);
2068 wakaba 1.1 $self->{state} = SYSTEM_STATE;
2069 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2070     !!!next-input-character;
2071     redo A;
2072     } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2073     !!!cp (167.3);
2074     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2075     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2076 wakaba 1.1 !!!next-input-character;
2077     redo A;
2078     } else {
2079     !!!cp (180);
2080     !!!parse-error (type => 'string after DOCTYPE name');
2081     $self->{ct}->{quirks} = 1;
2082    
2083     $self->{state} = BOGUS_DOCTYPE_STATE;
2084     !!!next-input-character;
2085     redo A;
2086     }
2087     } elsif ($self->{state} == PUBLIC_STATE) {
2088     ## ASCII case-insensitive
2089     if ($self->{nc} == [
2090     undef,
2091     0x0055, # U
2092     0x0042, # B
2093     0x004C, # L
2094     0x0049, # I
2095 wakaba 1.12 ]->[length $self->{kwd}] or
2096 wakaba 1.1 $self->{nc} == [
2097     undef,
2098     0x0075, # u
2099     0x0062, # b
2100     0x006C, # l
2101     0x0069, # i
2102 wakaba 1.12 ]->[length $self->{kwd}]) {
2103 wakaba 1.1 !!!cp (175);
2104     ## Stay in the state.
2105 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2106 wakaba 1.1 !!!next-input-character;
2107     redo A;
2108 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2109 wakaba 1.1 ($self->{nc} == 0x0043 or # C
2110     $self->{nc} == 0x0063)) { # c
2111 wakaba 1.12 if ($self->{is_xml} and
2112     ($self->{kwd} ne 'PUBLI' or $self->{nc} == 0x0063)) { # c
2113     !!!cp (168.1);
2114     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2115     text => 'PUBLIC',
2116     line => $self->{line_prev},
2117     column => $self->{column_prev} - 4);
2118     } else {
2119     !!!cp (168);
2120     }
2121 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2122     !!!next-input-character;
2123     redo A;
2124     } else {
2125     !!!cp (169);
2126     !!!parse-error (type => 'string after DOCTYPE name',
2127     line => $self->{line_prev},
2128 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2129 wakaba 1.1 $self->{ct}->{quirks} = 1;
2130    
2131     $self->{state} = BOGUS_DOCTYPE_STATE;
2132     ## Reconsume.
2133     redo A;
2134     }
2135     } elsif ($self->{state} == SYSTEM_STATE) {
2136     ## ASCII case-insensitive
2137     if ($self->{nc} == [
2138     undef,
2139     0x0059, # Y
2140     0x0053, # S
2141     0x0054, # T
2142     0x0045, # E
2143 wakaba 1.12 ]->[length $self->{kwd}] or
2144 wakaba 1.1 $self->{nc} == [
2145     undef,
2146     0x0079, # y
2147     0x0073, # s
2148     0x0074, # t
2149     0x0065, # e
2150 wakaba 1.12 ]->[length $self->{kwd}]) {
2151 wakaba 1.1 !!!cp (170);
2152     ## Stay in the state.
2153 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2154 wakaba 1.1 !!!next-input-character;
2155     redo A;
2156 wakaba 1.12 } elsif ((length $self->{kwd}) == 5 and
2157 wakaba 1.1 ($self->{nc} == 0x004D or # M
2158     $self->{nc} == 0x006D)) { # m
2159 wakaba 1.12 if ($self->{is_xml} and
2160     ($self->{kwd} ne 'SYSTE' or $self->{nc} == 0x006D)) { # m
2161     !!!cp (171.1);
2162     !!!parse-error (type => 'lowercase keyword', ## TODO: type
2163     text => 'SYSTEM',
2164     line => $self->{line_prev},
2165     column => $self->{column_prev} - 4);
2166     } else {
2167     !!!cp (171);
2168     }
2169 wakaba 1.1 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2170     !!!next-input-character;
2171     redo A;
2172     } else {
2173     !!!cp (172);
2174     !!!parse-error (type => 'string after DOCTYPE name',
2175     line => $self->{line_prev},
2176 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd});
2177 wakaba 1.1 $self->{ct}->{quirks} = 1;
2178    
2179     $self->{state} = BOGUS_DOCTYPE_STATE;
2180     ## Reconsume.
2181     redo A;
2182     }
2183     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2184     if ($is_space->{$self->{nc}}) {
2185     !!!cp (181);
2186     ## Stay in the state
2187     !!!next-input-character;
2188     redo A;
2189     } elsif ($self->{nc} eq 0x0022) { # "
2190     !!!cp (182);
2191     $self->{ct}->{pubid} = ''; # DOCTYPE
2192     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2193     !!!next-input-character;
2194     redo A;
2195     } elsif ($self->{nc} eq 0x0027) { # '
2196     !!!cp (183);
2197     $self->{ct}->{pubid} = ''; # DOCTYPE
2198     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2199     !!!next-input-character;
2200     redo A;
2201     } elsif ($self->{nc} eq 0x003E) { # >
2202     !!!cp (184);
2203     !!!parse-error (type => 'no PUBLIC literal');
2204    
2205     $self->{state} = DATA_STATE;
2206 wakaba 1.5 $self->{s_kwd} = '';
2207 wakaba 1.1 !!!next-input-character;
2208    
2209     $self->{ct}->{quirks} = 1;
2210     !!!emit ($self->{ct}); # DOCTYPE
2211    
2212     redo A;
2213     } elsif ($self->{nc} == -1) {
2214     !!!cp (185);
2215     !!!parse-error (type => 'unclosed DOCTYPE');
2216    
2217     $self->{state} = DATA_STATE;
2218 wakaba 1.5 $self->{s_kwd} = '';
2219 wakaba 1.1 ## reconsume
2220    
2221     $self->{ct}->{quirks} = 1;
2222     !!!emit ($self->{ct}); # DOCTYPE
2223    
2224     redo A;
2225 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2226     !!!cp (186.1);
2227     !!!parse-error (type => 'no PUBLIC literal');
2228     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2229     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2230     !!!next-input-character;
2231     redo A;
2232 wakaba 1.1 } else {
2233     !!!cp (186);
2234     !!!parse-error (type => 'string after PUBLIC');
2235     $self->{ct}->{quirks} = 1;
2236    
2237     $self->{state} = BOGUS_DOCTYPE_STATE;
2238     !!!next-input-character;
2239     redo A;
2240     }
2241     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2242     if ($self->{nc} == 0x0022) { # "
2243     !!!cp (187);
2244     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2245     !!!next-input-character;
2246     redo A;
2247     } elsif ($self->{nc} == 0x003E) { # >
2248     !!!cp (188);
2249     !!!parse-error (type => 'unclosed PUBLIC literal');
2250    
2251     $self->{state} = DATA_STATE;
2252 wakaba 1.5 $self->{s_kwd} = '';
2253 wakaba 1.1 !!!next-input-character;
2254    
2255     $self->{ct}->{quirks} = 1;
2256     !!!emit ($self->{ct}); # DOCTYPE
2257    
2258     redo A;
2259     } elsif ($self->{nc} == -1) {
2260     !!!cp (189);
2261     !!!parse-error (type => 'unclosed PUBLIC literal');
2262    
2263     $self->{state} = DATA_STATE;
2264 wakaba 1.5 $self->{s_kwd} = '';
2265 wakaba 1.1 ## reconsume
2266    
2267     $self->{ct}->{quirks} = 1;
2268     !!!emit ($self->{ct}); # DOCTYPE
2269    
2270     redo A;
2271     } else {
2272     !!!cp (190);
2273     $self->{ct}->{pubid} # DOCTYPE
2274     .= chr $self->{nc};
2275     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2276     length $self->{ct}->{pubid});
2277    
2278     ## Stay in the state
2279     !!!next-input-character;
2280     redo A;
2281     }
2282     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2283     if ($self->{nc} == 0x0027) { # '
2284     !!!cp (191);
2285     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2286     !!!next-input-character;
2287     redo A;
2288     } elsif ($self->{nc} == 0x003E) { # >
2289     !!!cp (192);
2290     !!!parse-error (type => 'unclosed PUBLIC literal');
2291    
2292     $self->{state} = DATA_STATE;
2293 wakaba 1.5 $self->{s_kwd} = '';
2294 wakaba 1.1 !!!next-input-character;
2295    
2296     $self->{ct}->{quirks} = 1;
2297     !!!emit ($self->{ct}); # DOCTYPE
2298    
2299     redo A;
2300     } elsif ($self->{nc} == -1) {
2301     !!!cp (193);
2302     !!!parse-error (type => 'unclosed PUBLIC literal');
2303    
2304     $self->{state} = DATA_STATE;
2305 wakaba 1.5 $self->{s_kwd} = '';
2306 wakaba 1.1 ## reconsume
2307    
2308     $self->{ct}->{quirks} = 1;
2309     !!!emit ($self->{ct}); # DOCTYPE
2310    
2311     redo A;
2312     } else {
2313     !!!cp (194);
2314     $self->{ct}->{pubid} # DOCTYPE
2315     .= chr $self->{nc};
2316     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2317     length $self->{ct}->{pubid});
2318    
2319     ## Stay in the state
2320     !!!next-input-character;
2321     redo A;
2322     }
2323     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2324     if ($is_space->{$self->{nc}}) {
2325     !!!cp (195);
2326     ## Stay in the state
2327     !!!next-input-character;
2328     redo A;
2329     } elsif ($self->{nc} == 0x0022) { # "
2330     !!!cp (196);
2331     $self->{ct}->{sysid} = ''; # DOCTYPE
2332     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2333     !!!next-input-character;
2334     redo A;
2335     } elsif ($self->{nc} == 0x0027) { # '
2336     !!!cp (197);
2337     $self->{ct}->{sysid} = ''; # DOCTYPE
2338     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2339     !!!next-input-character;
2340     redo A;
2341     } elsif ($self->{nc} == 0x003E) { # >
2342 wakaba 1.12 if ($self->{is_xml}) {
2343     !!!cp (198.1);
2344     !!!parse-error (type => 'no SYSTEM literal');
2345     } else {
2346     !!!cp (198);
2347     }
2348 wakaba 1.1 $self->{state} = DATA_STATE;
2349 wakaba 1.5 $self->{s_kwd} = '';
2350 wakaba 1.1 !!!next-input-character;
2351    
2352     !!!emit ($self->{ct}); # DOCTYPE
2353    
2354     redo A;
2355     } elsif ($self->{nc} == -1) {
2356     !!!cp (199);
2357     !!!parse-error (type => 'unclosed DOCTYPE');
2358    
2359     $self->{state} = DATA_STATE;
2360 wakaba 1.5 $self->{s_kwd} = '';
2361 wakaba 1.1 ## reconsume
2362    
2363     $self->{ct}->{quirks} = 1;
2364     !!!emit ($self->{ct}); # DOCTYPE
2365    
2366     redo A;
2367 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2368     !!!cp (200.1);
2369     !!!parse-error (type => 'no SYSTEM literal');
2370     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2371     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2372     !!!next-input-character;
2373     redo A;
2374 wakaba 1.1 } else {
2375     !!!cp (200);
2376     !!!parse-error (type => 'string after PUBLIC literal');
2377     $self->{ct}->{quirks} = 1;
2378    
2379     $self->{state} = BOGUS_DOCTYPE_STATE;
2380     !!!next-input-character;
2381     redo A;
2382     }
2383     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2384     if ($is_space->{$self->{nc}}) {
2385     !!!cp (201);
2386     ## Stay in the state
2387     !!!next-input-character;
2388     redo A;
2389     } elsif ($self->{nc} == 0x0022) { # "
2390     !!!cp (202);
2391     $self->{ct}->{sysid} = ''; # DOCTYPE
2392     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2393     !!!next-input-character;
2394     redo A;
2395     } elsif ($self->{nc} == 0x0027) { # '
2396     !!!cp (203);
2397     $self->{ct}->{sysid} = ''; # DOCTYPE
2398     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2399     !!!next-input-character;
2400     redo A;
2401     } elsif ($self->{nc} == 0x003E) { # >
2402     !!!cp (204);
2403     !!!parse-error (type => 'no SYSTEM literal');
2404     $self->{state} = DATA_STATE;
2405 wakaba 1.5 $self->{s_kwd} = '';
2406 wakaba 1.1 !!!next-input-character;
2407    
2408     $self->{ct}->{quirks} = 1;
2409     !!!emit ($self->{ct}); # DOCTYPE
2410    
2411     redo A;
2412     } elsif ($self->{nc} == -1) {
2413     !!!cp (205);
2414     !!!parse-error (type => 'unclosed DOCTYPE');
2415    
2416     $self->{state} = DATA_STATE;
2417 wakaba 1.5 $self->{s_kwd} = '';
2418 wakaba 1.1 ## reconsume
2419    
2420     $self->{ct}->{quirks} = 1;
2421     !!!emit ($self->{ct}); # DOCTYPE
2422    
2423     redo A;
2424 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2425     !!!cp (206.1);
2426     !!!parse-error (type => 'no SYSTEM literal');
2427    
2428     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2429     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2430     !!!next-input-character;
2431     redo A;
2432 wakaba 1.1 } else {
2433     !!!cp (206);
2434     !!!parse-error (type => 'string after SYSTEM');
2435     $self->{ct}->{quirks} = 1;
2436    
2437     $self->{state} = BOGUS_DOCTYPE_STATE;
2438     !!!next-input-character;
2439     redo A;
2440     }
2441     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2442     if ($self->{nc} == 0x0022) { # "
2443     !!!cp (207);
2444     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2445     !!!next-input-character;
2446     redo A;
2447 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2448 wakaba 1.1 !!!cp (208);
2449     !!!parse-error (type => 'unclosed SYSTEM literal');
2450    
2451     $self->{state} = DATA_STATE;
2452 wakaba 1.5 $self->{s_kwd} = '';
2453 wakaba 1.1 !!!next-input-character;
2454    
2455     $self->{ct}->{quirks} = 1;
2456     !!!emit ($self->{ct}); # DOCTYPE
2457    
2458     redo A;
2459     } elsif ($self->{nc} == -1) {
2460     !!!cp (209);
2461     !!!parse-error (type => 'unclosed SYSTEM literal');
2462    
2463     $self->{state} = DATA_STATE;
2464 wakaba 1.5 $self->{s_kwd} = '';
2465 wakaba 1.1 ## reconsume
2466    
2467     $self->{ct}->{quirks} = 1;
2468     !!!emit ($self->{ct}); # DOCTYPE
2469    
2470     redo A;
2471     } else {
2472     !!!cp (210);
2473     $self->{ct}->{sysid} # DOCTYPE
2474     .= chr $self->{nc};
2475     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2476     length $self->{ct}->{sysid});
2477    
2478     ## Stay in the state
2479     !!!next-input-character;
2480     redo A;
2481     }
2482     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2483     if ($self->{nc} == 0x0027) { # '
2484     !!!cp (211);
2485     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2486     !!!next-input-character;
2487     redo A;
2488 wakaba 1.12 } elsif (not $self->{is_xml} and $self->{nc} == 0x003E) { # >
2489 wakaba 1.1 !!!cp (212);
2490     !!!parse-error (type => 'unclosed SYSTEM literal');
2491    
2492     $self->{state} = DATA_STATE;
2493 wakaba 1.5 $self->{s_kwd} = '';
2494 wakaba 1.1 !!!next-input-character;
2495    
2496     $self->{ct}->{quirks} = 1;
2497     !!!emit ($self->{ct}); # DOCTYPE
2498    
2499     redo A;
2500     } elsif ($self->{nc} == -1) {
2501     !!!cp (213);
2502     !!!parse-error (type => 'unclosed SYSTEM literal');
2503    
2504     $self->{state} = DATA_STATE;
2505 wakaba 1.5 $self->{s_kwd} = '';
2506 wakaba 1.1 ## reconsume
2507    
2508     $self->{ct}->{quirks} = 1;
2509     !!!emit ($self->{ct}); # DOCTYPE
2510    
2511     redo A;
2512     } else {
2513     !!!cp (214);
2514     $self->{ct}->{sysid} # DOCTYPE
2515     .= chr $self->{nc};
2516     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2517     length $self->{ct}->{sysid});
2518    
2519     ## Stay in the state
2520     !!!next-input-character;
2521     redo A;
2522     }
2523     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2524     if ($is_space->{$self->{nc}}) {
2525     !!!cp (215);
2526     ## Stay in the state
2527     !!!next-input-character;
2528     redo A;
2529     } elsif ($self->{nc} == 0x003E) { # >
2530     !!!cp (216);
2531     $self->{state} = DATA_STATE;
2532 wakaba 1.5 $self->{s_kwd} = '';
2533 wakaba 1.1 !!!next-input-character;
2534    
2535     !!!emit ($self->{ct}); # DOCTYPE
2536    
2537     redo A;
2538     } elsif ($self->{nc} == -1) {
2539     !!!cp (217);
2540     !!!parse-error (type => 'unclosed DOCTYPE');
2541     $self->{state} = DATA_STATE;
2542 wakaba 1.5 $self->{s_kwd} = '';
2543 wakaba 1.1 ## reconsume
2544    
2545     $self->{ct}->{quirks} = 1;
2546     !!!emit ($self->{ct}); # DOCTYPE
2547    
2548     redo A;
2549 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2550     !!!cp (218.1);
2551     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2552     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2553     !!!next-input-character;
2554     redo A;
2555 wakaba 1.1 } else {
2556     !!!cp (218);
2557     !!!parse-error (type => 'string after SYSTEM literal');
2558     #$self->{ct}->{quirks} = 1;
2559    
2560     $self->{state} = BOGUS_DOCTYPE_STATE;
2561     !!!next-input-character;
2562     redo A;
2563     }
2564     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2565     if ($self->{nc} == 0x003E) { # >
2566     !!!cp (219);
2567     $self->{state} = DATA_STATE;
2568 wakaba 1.5 $self->{s_kwd} = '';
2569 wakaba 1.1 !!!next-input-character;
2570    
2571     !!!emit ($self->{ct}); # DOCTYPE
2572    
2573     redo A;
2574 wakaba 1.12 } elsif ($self->{is_xml} and $self->{nc} == 0x005B) { # [
2575     if ($self->{ct}->{has_internal_subset}) { # DOCTYPE
2576     !!!cp (220.2);
2577     ## Stay in the state.
2578     !!!next-input-character;
2579     redo A;
2580     } else {
2581     !!!cp (220.1);
2582     $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE;
2583     $self->{ct}->{has_internal_subset} = 1; # DOCTYPE
2584     !!!next-input-character;
2585     redo A;
2586     }
2587 wakaba 1.1 } elsif ($self->{nc} == -1) {
2588     !!!cp (220);
2589     $self->{state} = DATA_STATE;
2590 wakaba 1.5 $self->{s_kwd} = '';
2591 wakaba 1.1 ## reconsume
2592    
2593     !!!emit ($self->{ct}); # DOCTYPE
2594    
2595     redo A;
2596     } else {
2597     !!!cp (221);
2598     my $s = '';
2599 wakaba 1.12 $self->{read_until}->($s, q{>[}, 0);
2600 wakaba 1.1
2601     ## Stay in the state
2602     !!!next-input-character;
2603     redo A;
2604     }
2605     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2606     ## NOTE: "CDATA section state" in the spec is jointly implemented
2607     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2608     ## and |CDATA_SECTION_MSE2_STATE|.
2609 wakaba 1.10
2610     ## XML5: "CDATA state".
2611 wakaba 1.1
2612     if ($self->{nc} == 0x005D) { # ]
2613     !!!cp (221.1);
2614     $self->{state} = CDATA_SECTION_MSE1_STATE;
2615     !!!next-input-character;
2616     redo A;
2617     } elsif ($self->{nc} == -1) {
2618 wakaba 1.6 if ($self->{is_xml}) {
2619 wakaba 1.8 !!!cp (221.11);
2620 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2621 wakaba 1.8 } else {
2622     !!!cp (221.12);
2623 wakaba 1.6 }
2624    
2625 wakaba 1.1 $self->{state} = DATA_STATE;
2626 wakaba 1.5 $self->{s_kwd} = '';
2627 wakaba 1.10 ## Reconsume.
2628 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2629     !!!cp (221.2);
2630     !!!emit ($self->{ct}); # character
2631     } else {
2632     !!!cp (221.3);
2633     ## No token to emit. $self->{ct} is discarded.
2634     }
2635     redo A;
2636     } else {
2637     !!!cp (221.4);
2638     $self->{ct}->{data} .= chr $self->{nc};
2639     $self->{read_until}->($self->{ct}->{data},
2640     q<]>,
2641     length $self->{ct}->{data});
2642    
2643     ## Stay in the state.
2644     !!!next-input-character;
2645     redo A;
2646     }
2647    
2648     ## ISSUE: "text tokens" in spec.
2649     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2650 wakaba 1.10 ## XML5: "CDATA bracket state".
2651    
2652 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2653     !!!cp (221.5);
2654     $self->{state} = CDATA_SECTION_MSE2_STATE;
2655     !!!next-input-character;
2656     redo A;
2657     } else {
2658     !!!cp (221.6);
2659 wakaba 1.10 ## XML5: If EOF, "]" is not appended and the state is changed to the data state.
2660 wakaba 1.1 $self->{ct}->{data} .= ']';
2661 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2662 wakaba 1.1 ## Reconsume.
2663     redo A;
2664     }
2665     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2666 wakaba 1.10 ## XML5: "CDATA end state".
2667    
2668 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2669     $self->{state} = DATA_STATE;
2670 wakaba 1.5 $self->{s_kwd} = '';
2671 wakaba 1.1 !!!next-input-character;
2672     if (length $self->{ct}->{data}) { # character
2673     !!!cp (221.7);
2674     !!!emit ($self->{ct}); # character
2675     } else {
2676     !!!cp (221.8);
2677     ## No token to emit. $self->{ct} is discarded.
2678     }
2679     redo A;
2680     } elsif ($self->{nc} == 0x005D) { # ]
2681     !!!cp (221.9); # character
2682     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2683     ## Stay in the state.
2684     !!!next-input-character;
2685     redo A;
2686     } else {
2687     !!!cp (221.11);
2688     $self->{ct}->{data} .= ']]'; # character
2689     $self->{state} = CDATA_SECTION_STATE;
2690 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2691 wakaba 1.1 redo A;
2692     }
2693     } elsif ($self->{state} == ENTITY_STATE) {
2694     if ($is_space->{$self->{nc}} or
2695     {
2696     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2697     $self->{entity_add} => 1,
2698     }->{$self->{nc}}) {
2699     !!!cp (1001);
2700     ## Don't consume
2701     ## No error
2702     ## Return nothing.
2703     #
2704     } elsif ($self->{nc} == 0x0023) { # #
2705     !!!cp (999);
2706     $self->{state} = ENTITY_HASH_STATE;
2707 wakaba 1.12 $self->{kwd} = '#';
2708 wakaba 1.1 !!!next-input-character;
2709     redo A;
2710     } elsif ((0x0041 <= $self->{nc} and
2711     $self->{nc} <= 0x005A) or # A..Z
2712     (0x0061 <= $self->{nc} and
2713     $self->{nc} <= 0x007A)) { # a..z
2714     !!!cp (998);
2715     require Whatpm::_NamedEntityList;
2716     $self->{state} = ENTITY_NAME_STATE;
2717 wakaba 1.12 $self->{kwd} = chr $self->{nc};
2718     $self->{entity__value} = $self->{kwd};
2719 wakaba 1.1 $self->{entity__match} = 0;
2720     !!!next-input-character;
2721     redo A;
2722     } else {
2723     !!!cp (1027);
2724     !!!parse-error (type => 'bare ero');
2725     ## Return nothing.
2726     #
2727     }
2728    
2729     ## NOTE: No character is consumed by the "consume a character
2730     ## reference" algorithm. In other words, there is an "&" character
2731     ## that does not introduce a character reference, which would be
2732     ## appended to the parent element or the attribute value in later
2733     ## process of the tokenizer.
2734    
2735     if ($self->{prev_state} == DATA_STATE) {
2736     !!!cp (997);
2737     $self->{state} = $self->{prev_state};
2738 wakaba 1.5 $self->{s_kwd} = '';
2739 wakaba 1.1 ## Reconsume.
2740     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2741     line => $self->{line_prev},
2742     column => $self->{column_prev},
2743     });
2744     redo A;
2745     } else {
2746     !!!cp (996);
2747     $self->{ca}->{value} .= '&';
2748     $self->{state} = $self->{prev_state};
2749 wakaba 1.5 $self->{s_kwd} = '';
2750 wakaba 1.1 ## Reconsume.
2751     redo A;
2752     }
2753     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2754     if ($self->{nc} == 0x0078 or # x
2755     $self->{nc} == 0x0058) { # X
2756     !!!cp (995);
2757     $self->{state} = HEXREF_X_STATE;
2758 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2759 wakaba 1.1 !!!next-input-character;
2760     redo A;
2761     } elsif (0x0030 <= $self->{nc} and
2762     $self->{nc} <= 0x0039) { # 0..9
2763     !!!cp (994);
2764     $self->{state} = NCR_NUM_STATE;
2765 wakaba 1.12 $self->{kwd} = $self->{nc} - 0x0030;
2766 wakaba 1.1 !!!next-input-character;
2767     redo A;
2768     } else {
2769     !!!parse-error (type => 'bare nero',
2770     line => $self->{line_prev},
2771     column => $self->{column_prev} - 1);
2772    
2773     ## NOTE: According to the spec algorithm, nothing is returned,
2774     ## and then "&#" is appended to the parent element or the attribute
2775     ## value in the later processing.
2776    
2777     if ($self->{prev_state} == DATA_STATE) {
2778     !!!cp (1019);
2779     $self->{state} = $self->{prev_state};
2780 wakaba 1.5 $self->{s_kwd} = '';
2781 wakaba 1.1 ## Reconsume.
2782     !!!emit ({type => CHARACTER_TOKEN,
2783     data => '&#',
2784     line => $self->{line_prev},
2785     column => $self->{column_prev} - 1,
2786     });
2787     redo A;
2788     } else {
2789     !!!cp (993);
2790     $self->{ca}->{value} .= '&#';
2791     $self->{state} = $self->{prev_state};
2792 wakaba 1.5 $self->{s_kwd} = '';
2793 wakaba 1.1 ## Reconsume.
2794     redo A;
2795     }
2796     }
2797     } elsif ($self->{state} == NCR_NUM_STATE) {
2798     if (0x0030 <= $self->{nc} and
2799     $self->{nc} <= 0x0039) { # 0..9
2800     !!!cp (1012);
2801 wakaba 1.12 $self->{kwd} *= 10;
2802     $self->{kwd} += $self->{nc} - 0x0030;
2803 wakaba 1.1
2804     ## Stay in the state.
2805     !!!next-input-character;
2806     redo A;
2807     } elsif ($self->{nc} == 0x003B) { # ;
2808     !!!cp (1013);
2809     !!!next-input-character;
2810     #
2811     } else {
2812     !!!cp (1014);
2813     !!!parse-error (type => 'no refc');
2814     ## Reconsume.
2815     #
2816     }
2817    
2818 wakaba 1.12 my $code = $self->{kwd};
2819 wakaba 1.1 my $l = $self->{line_prev};
2820     my $c = $self->{column_prev};
2821     if ($charref_map->{$code}) {
2822     !!!cp (1015);
2823     !!!parse-error (type => 'invalid character reference',
2824     text => (sprintf 'U+%04X', $code),
2825     line => $l, column => $c);
2826     $code = $charref_map->{$code};
2827     } elsif ($code > 0x10FFFF) {
2828     !!!cp (1016);
2829     !!!parse-error (type => 'invalid character reference',
2830     text => (sprintf 'U-%08X', $code),
2831     line => $l, column => $c);
2832     $code = 0xFFFD;
2833     }
2834    
2835     if ($self->{prev_state} == DATA_STATE) {
2836     !!!cp (992);
2837     $self->{state} = $self->{prev_state};
2838 wakaba 1.5 $self->{s_kwd} = '';
2839 wakaba 1.1 ## Reconsume.
2840     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2841 wakaba 1.7 has_reference => 1,
2842 wakaba 1.1 line => $l, column => $c,
2843     });
2844     redo A;
2845     } else {
2846     !!!cp (991);
2847     $self->{ca}->{value} .= chr $code;
2848     $self->{ca}->{has_reference} = 1;
2849     $self->{state} = $self->{prev_state};
2850 wakaba 1.5 $self->{s_kwd} = '';
2851 wakaba 1.1 ## Reconsume.
2852     redo A;
2853     }
2854     } elsif ($self->{state} == HEXREF_X_STATE) {
2855     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2856     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2857     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2858     # 0..9, A..F, a..f
2859     !!!cp (990);
2860     $self->{state} = HEXREF_HEX_STATE;
2861 wakaba 1.12 $self->{kwd} = 0;
2862 wakaba 1.1 ## Reconsume.
2863     redo A;
2864     } else {
2865     !!!parse-error (type => 'bare hcro',
2866     line => $self->{line_prev},
2867     column => $self->{column_prev} - 2);
2868    
2869     ## NOTE: According to the spec algorithm, nothing is returned,
2870     ## and then "&#" followed by "X" or "x" is appended to the parent
2871     ## element or the attribute value in the later processing.
2872    
2873     if ($self->{prev_state} == DATA_STATE) {
2874     !!!cp (1005);
2875     $self->{state} = $self->{prev_state};
2876 wakaba 1.5 $self->{s_kwd} = '';
2877 wakaba 1.1 ## Reconsume.
2878     !!!emit ({type => CHARACTER_TOKEN,
2879 wakaba 1.12 data => '&' . $self->{kwd},
2880 wakaba 1.1 line => $self->{line_prev},
2881 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd},
2882 wakaba 1.1 });
2883     redo A;
2884     } else {
2885     !!!cp (989);
2886 wakaba 1.12 $self->{ca}->{value} .= '&' . $self->{kwd};
2887 wakaba 1.1 $self->{state} = $self->{prev_state};
2888 wakaba 1.5 $self->{s_kwd} = '';
2889 wakaba 1.1 ## Reconsume.
2890     redo A;
2891     }
2892     }
2893     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2894     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2895     # 0..9
2896     !!!cp (1002);
2897 wakaba 1.12 $self->{kwd} *= 0x10;
2898     $self->{kwd} += $self->{nc} - 0x0030;
2899 wakaba 1.1 ## Stay in the state.
2900     !!!next-input-character;
2901     redo A;
2902     } elsif (0x0061 <= $self->{nc} and
2903     $self->{nc} <= 0x0066) { # a..f
2904     !!!cp (1003);
2905 wakaba 1.12 $self->{kwd} *= 0x10;
2906     $self->{kwd} += $self->{nc} - 0x0060 + 9;
2907 wakaba 1.1 ## Stay in the state.
2908     !!!next-input-character;
2909     redo A;
2910     } elsif (0x0041 <= $self->{nc} and
2911     $self->{nc} <= 0x0046) { # A..F
2912     !!!cp (1004);
2913 wakaba 1.12 $self->{kwd} *= 0x10;
2914     $self->{kwd} += $self->{nc} - 0x0040 + 9;
2915 wakaba 1.1 ## Stay in the state.
2916     !!!next-input-character;
2917     redo A;
2918     } elsif ($self->{nc} == 0x003B) { # ;
2919     !!!cp (1006);
2920     !!!next-input-character;
2921     #
2922     } else {
2923     !!!cp (1007);
2924     !!!parse-error (type => 'no refc',
2925     line => $self->{line},
2926     column => $self->{column});
2927     ## Reconsume.
2928     #
2929     }
2930    
2931 wakaba 1.12 my $code = $self->{kwd};
2932 wakaba 1.1 my $l = $self->{line_prev};
2933     my $c = $self->{column_prev};
2934     if ($charref_map->{$code}) {
2935     !!!cp (1008);
2936     !!!parse-error (type => 'invalid character reference',
2937     text => (sprintf 'U+%04X', $code),
2938     line => $l, column => $c);
2939     $code = $charref_map->{$code};
2940     } elsif ($code > 0x10FFFF) {
2941     !!!cp (1009);
2942     !!!parse-error (type => 'invalid character reference',
2943     text => (sprintf 'U-%08X', $code),
2944     line => $l, column => $c);
2945     $code = 0xFFFD;
2946     }
2947    
2948     if ($self->{prev_state} == DATA_STATE) {
2949     !!!cp (988);
2950     $self->{state} = $self->{prev_state};
2951 wakaba 1.5 $self->{s_kwd} = '';
2952 wakaba 1.1 ## Reconsume.
2953     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2954 wakaba 1.7 has_reference => 1,
2955 wakaba 1.1 line => $l, column => $c,
2956     });
2957     redo A;
2958     } else {
2959     !!!cp (987);
2960     $self->{ca}->{value} .= chr $code;
2961     $self->{ca}->{has_reference} = 1;
2962     $self->{state} = $self->{prev_state};
2963 wakaba 1.5 $self->{s_kwd} = '';
2964 wakaba 1.1 ## Reconsume.
2965     redo A;
2966     }
2967     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2968 wakaba 1.12 if (length $self->{kwd} < 30 and
2969 wakaba 1.1 ## NOTE: Some number greater than the maximum length of entity name
2970     ((0x0041 <= $self->{nc} and # a
2971     $self->{nc} <= 0x005A) or # x
2972     (0x0061 <= $self->{nc} and # a
2973     $self->{nc} <= 0x007A) or # z
2974     (0x0030 <= $self->{nc} and # 0
2975     $self->{nc} <= 0x0039) or # 9
2976     $self->{nc} == 0x003B)) { # ;
2977     our $EntityChar;
2978 wakaba 1.12 $self->{kwd} .= chr $self->{nc};
2979     if (defined $EntityChar->{$self->{kwd}}) {
2980 wakaba 1.1 if ($self->{nc} == 0x003B) { # ;
2981     !!!cp (1020);
2982 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
2983 wakaba 1.1 $self->{entity__match} = 1;
2984     !!!next-input-character;
2985     #
2986     } else {
2987     !!!cp (1021);
2988 wakaba 1.12 $self->{entity__value} = $EntityChar->{$self->{kwd}};
2989 wakaba 1.1 $self->{entity__match} = -1;
2990     ## Stay in the state.
2991     !!!next-input-character;
2992     redo A;
2993     }
2994     } else {
2995     !!!cp (1022);
2996     $self->{entity__value} .= chr $self->{nc};
2997     $self->{entity__match} *= 2;
2998     ## Stay in the state.
2999     !!!next-input-character;
3000     redo A;
3001     }
3002     }
3003    
3004     my $data;
3005     my $has_ref;
3006     if ($self->{entity__match} > 0) {
3007     !!!cp (1023);
3008     $data = $self->{entity__value};
3009     $has_ref = 1;
3010     #
3011     } elsif ($self->{entity__match} < 0) {
3012     !!!parse-error (type => 'no refc');
3013     if ($self->{prev_state} != DATA_STATE and # in attribute
3014     $self->{entity__match} < -1) {
3015     !!!cp (1024);
3016 wakaba 1.12 $data = '&' . $self->{kwd};
3017 wakaba 1.1 #
3018     } else {
3019     !!!cp (1025);
3020     $data = $self->{entity__value};
3021     $has_ref = 1;
3022     #
3023     }
3024     } else {
3025     !!!cp (1026);
3026     !!!parse-error (type => 'bare ero',
3027     line => $self->{line_prev},
3028 wakaba 1.12 column => $self->{column_prev} - length $self->{kwd});
3029     $data = '&' . $self->{kwd};
3030 wakaba 1.1 #
3031     }
3032    
3033     ## NOTE: In these cases, according to the spec algorithm, when a
3034     ## character reference is found, it is consumed and a character token
3035     ## is returned; otherwise, nothing is consumed or returned.
3036     ## In this implementation, anything that has been examined by the
3037     ## tokenizer is appended to the parent element or the attribute value
3038     ## as a string at this stage (the literal string when there is no
3039     ## character reference, or the entity-replaced string otherwise), since
3040     ## any characters that would not be consumed are appended in the data
3041     ## state or in an appropriate attribute value state anyway.
3042    
3043     if ($self->{prev_state} == DATA_STATE) {
3044     !!!cp (986);
3045     $self->{state} = $self->{prev_state};
3046 wakaba 1.5 $self->{s_kwd} = '';
3047 wakaba 1.1 ## Reconsume.
3048     !!!emit ({type => CHARACTER_TOKEN,
3049     data => $data,
3050 wakaba 1.7 has_reference => $has_ref,
3051 wakaba 1.1 line => $self->{line_prev},
3052 wakaba 1.12 column => $self->{column_prev} + 1 - length $self->{kwd},
3053 wakaba 1.1 });
3054     redo A;
3055     } else {
3056     !!!cp (985);
3057     $self->{ca}->{value} .= $data;
3058     $self->{ca}->{has_reference} = 1 if $has_ref;
3059     $self->{state} = $self->{prev_state};
3060 wakaba 1.5 $self->{s_kwd} = '';
3061 wakaba 1.1 ## Reconsume.
3062     redo A;
3063     }
3064 wakaba 1.8
3065     ## XML-only states
3066    
3067     } elsif ($self->{state} == PI_STATE) {
3068     if ($is_space->{$self->{nc}} or
3069     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
3070     $self->{nc} == -1) {
3071     !!!parse-error (type => 'bare pio', ## TODO: type
3072     line => $self->{line_prev},
3073     column => $self->{column_prev}
3074     - 1 * ($self->{nc} != -1));
3075     $self->{state} = BOGUS_COMMENT_STATE;
3076     ## Reconsume.
3077     $self->{ct} = {type => COMMENT_TOKEN,
3078     data => '?',
3079     line => $self->{line_prev},
3080     column => $self->{column_prev}
3081     - 1 * ($self->{nc} != -1),
3082     };
3083     redo A;
3084     } else {
3085     $self->{ct} = {type => PI_TOKEN,
3086     target => chr $self->{nc},
3087     data => '',
3088     line => $self->{line_prev},
3089     column => $self->{column_prev} - 1,
3090     };
3091     $self->{state} = PI_TARGET_STATE;
3092     !!!next-input-character;
3093     redo A;
3094     }
3095     } elsif ($self->{state} == PI_TARGET_STATE) {
3096     if ($is_space->{$self->{nc}}) {
3097     $self->{state} = PI_TARGET_AFTER_STATE;
3098     !!!next-input-character;
3099     redo A;
3100     } elsif ($self->{nc} == -1) {
3101     !!!parse-error (type => 'no pic'); ## TODO: type
3102     $self->{state} = DATA_STATE;
3103     $self->{s_kwd} = '';
3104     ## Reconsume.
3105     !!!emit ($self->{ct}); # pi
3106     redo A;
3107     } elsif ($self->{nc} == 0x003F) { # ?
3108     $self->{state} = PI_AFTER_STATE;
3109     !!!next-input-character;
3110     redo A;
3111     } else {
3112     ## XML5: typo ("tag name" -> "target")
3113     $self->{ct}->{target} .= chr $self->{nc}; # pi
3114     !!!next-input-character;
3115     redo A;
3116     }
3117     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
3118     if ($is_space->{$self->{nc}}) {
3119     ## Stay in the state.
3120     !!!next-input-character;
3121     redo A;
3122     } else {
3123     $self->{state} = PI_DATA_STATE;
3124     ## Reprocess.
3125     redo A;
3126     }
3127     } elsif ($self->{state} == PI_DATA_STATE) {
3128     if ($self->{nc} == 0x003F) { # ?
3129     $self->{state} = PI_DATA_AFTER_STATE;
3130     !!!next-input-character;
3131     redo A;
3132     } elsif ($self->{nc} == -1) {
3133     !!!parse-error (type => 'no pic'); ## TODO: type
3134     $self->{state} = DATA_STATE;
3135     $self->{s_kwd} = '';
3136     ## Reprocess.
3137     !!!emit ($self->{ct}); # pi
3138     redo A;
3139     } else {
3140     $self->{ct}->{data} .= chr $self->{nc}; # pi
3141     $self->{read_until}->($self->{ct}->{data}, q[?],
3142     length $self->{ct}->{data});
3143     ## Stay in the state.
3144     !!!next-input-character;
3145     ## Reprocess.
3146     redo A;
3147     }
3148     } elsif ($self->{state} == PI_AFTER_STATE) {
3149     if ($self->{nc} == 0x003E) { # >
3150     $self->{state} = DATA_STATE;
3151     $self->{s_kwd} = '';
3152     !!!next-input-character;
3153     !!!emit ($self->{ct}); # pi
3154     redo A;
3155     } elsif ($self->{nc} == 0x003F) { # ?
3156     !!!parse-error (type => 'no s after target', ## TODO: type
3157     line => $self->{line_prev},
3158     column => $self->{column_prev}); ## XML5: no error
3159     $self->{ct}->{data} .= '?';
3160     $self->{state} = PI_DATA_AFTER_STATE;
3161     !!!next-input-character;
3162     redo A;
3163     } else {
3164     !!!parse-error (type => 'no s after target', ## TODO: type
3165     line => $self->{line_prev},
3166     column => $self->{column_prev}
3167     + 1 * ($self->{nc} == -1)); ## XML5: no error
3168     $self->{ct}->{data} .= '?'; ## XML5: not appended
3169     $self->{state} = PI_DATA_STATE;
3170     ## Reprocess.
3171     redo A;
3172     }
3173     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
3174     ## XML5: Same as "pi after state" in XML5
3175     if ($self->{nc} == 0x003E) { # >
3176     $self->{state} = DATA_STATE;
3177     $self->{s_kwd} = '';
3178     !!!next-input-character;
3179     !!!emit ($self->{ct}); # pi
3180     redo A;
3181     } elsif ($self->{nc} == 0x003F) { # ?
3182     $self->{ct}->{data} .= '?';
3183     ## Stay in the state.
3184     !!!next-input-character;
3185     redo A;
3186     } else {
3187     $self->{ct}->{data} .= '?'; ## XML5: not appended
3188     $self->{state} = PI_DATA_STATE;
3189     ## Reprocess.
3190     redo A;
3191     }
3192 wakaba 1.12
3193     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_STATE) {
3194     if ($self->{nc} == 0x003C) { # <
3195     ## TODO:
3196     !!!next-input-character;
3197     redo A;
3198     } elsif ($self->{nc} == 0x0025) { # %
3199     ## XML5: Not defined yet.
3200    
3201     ## TODO:
3202     !!!next-input-character;
3203     redo A;
3204     } elsif ($self->{nc} == 0x005D) { # ]
3205     $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE;
3206     !!!next-input-character;
3207     redo A;
3208     } elsif ($is_space->{$self->{nc}}) {
3209     ## Stay in the state.
3210     !!!next-input-character;
3211     redo A;
3212     } elsif ($self->{nc} == -1) {
3213     !!!parse-error (type => 'unclosed internal subset'); ## TODO: type
3214     $self->{state} = DATA_STATE;
3215     $self->{s_kwd} = '';
3216     ## Reconsume.
3217     !!!emit ($self->{ct}); # DOCTYPE
3218     redo A;
3219     } else {
3220     unless ($self->{internal_subset_tainted}) {
3221     ## XML5: No parse error.
3222     !!!parse-error (type => 'string in internal subset');
3223     $self->{internal_subset_tainted} = 1;
3224     }
3225     ## Stay in the state.
3226     !!!next-input-character;
3227     redo A;
3228     }
3229     } elsif ($self->{state} == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) {
3230     if ($self->{nc} == 0x003E) { # >
3231     $self->{state} = DATA_STATE;
3232     $self->{s_kwd} = '';
3233     !!!next-input-character;
3234     !!!emit ($self->{ct}); # DOCTYPE
3235     redo A;
3236     } elsif ($self->{nc} == -1) {
3237     !!!parse-error (type => 'unclosed DOCTYPE');
3238     $self->{state} = DATA_STATE;
3239     $self->{s_kwd} = '';
3240     ## Reconsume.
3241     !!!emit ($self->{ct}); # DOCTYPE
3242     redo A;
3243     } else {
3244     ## XML5: No parse error and stay in the state.
3245     !!!parse-error (type => 'string after internal subset'); ## TODO: type
3246    
3247     $self->{state} = BOGUS_DOCTYPE_STATE;
3248     !!!next-input-character;
3249     redo A;
3250     }
3251 wakaba 1.8
3252 wakaba 1.1 } else {
3253     die "$0: $self->{state}: Unknown state";
3254     }
3255     } # A
3256    
3257     die "$0: _get_next_token: unexpected case";
3258     } # _get_next_token
3259    
3260     1;
3261 wakaba 1.12 ## $Date: 2008/10/15 10:50:38 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24