/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (hide annotations) (download) (as text)
Wed Oct 15 08:51:02 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.9: +64 -11 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	15 Oct 2008 08:50:58 -0000
	* doctypes-1.dat: Lowercase <!doctype> test added.

	* elements-1.dat: End tag tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 08:50:06 -0000
	* Tokenizer.pm.src: XML tag name start character support for end
	tags.  Support for the short end tag syntax of XML5.  Raise a
	parse error for a lowercase <!doctype> in XML.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: XML tag name start character support for start

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.10 our $VERSION=do{my @r=(q$Revision: 1.9 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
## Compile-time Exporter setup for |Whatpm::HTML::Tokenizer|: makes this
## package inherit from |Exporter| and declares which token type
## constants may be imported by other packages.
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
## Names that may be imported individually on request.
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
## The |:token| tag groups the same eight names so that they can be
## imported together (package |Whatpm::HTML| below does exactly that).
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
## Integer codes stored in the |type| field of a token hash.  Each is
## declared with an empty prototype and a constant body.
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5 (processing instruction)
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
## Bit flags describing which kinds of markup are recognized in
## character data.  The tokenizer tests them with |&| against
## |$self->{content_model}|.
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
## Content models, each expressed as a combination of the flags above.
55     sub PLAINTEXT_CONTENT_MODEL () { 0 } # no markup recognized at all
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
## Numeric identifiers for the tokenizer states.  |_get_next_token|
## dispatches on them by comparing against |$self->{state}|.  The
## values 1 and 12 are unassigned: their constants are commented out.
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
## The states below subdivide a single state of the spec; the trailing
## comment on each line names the spec state it belongs to.
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
## Processing instruction states.  |TAG_OPEN_STATE| switches to
## |PI_STATE| on "<?" only when |$self->{is_xml}| is true.
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
## NOTE(review): both literals below consist of a single 1 bit followed
## by eleven 0 bits, so the two constants evaluate to the same number.
## Neither is referenced in the portion of the file shown here;
## presumably they belong to separate flag sets in Whatpm::HTML -- confirm.
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
132    
## Replacement table keyed by code point: each key maps to the code
## point that is to be used in its place.  Presumably consulted when a
## numeric character reference is resolved -- the lookup itself is not
## in the portion of the file shown here.
133     my $charref_map = {
134     0x0D => 0x000A, # CR is replaced by LF
## 0x80..0x9F: replaced by the code points listed, or by U+FFFD where
## no specific replacement is given.
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
## Every code point in the list below is replaced by U+FFFD: control
## characters (0x09, 0x0A, 0x0C and 0x0D are not in the list), 0x7F,
## the range 0xD800..0xDFFF, 0xFDD0..0xFDDF, and the last two code
## points (xFFFE, xFFFF) of each plane up to 0x10FFFF.
## NOTE(review): the ISSUE marker below questions whether the range
## should end at 0xFDEF rather than 0xFDDF -- unresolved here.
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if they used the state machine in the spec
178    
## Resets the tokenizer fields of |$self| to their initial values.
## Takes the object as its only argument.  Fields that are listed but
## commented out are either provided by the |new| constructor or are
## created later, when first needed.
179     sub _initialize_tokenizer ($) {
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
## The flag is removed (not merely set to false) so that no
## |self_closing| key remains.
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
## |!!!next-input-character| is a macro of the .src template, not
## Perl; presumably it is expanded into code that loads the first input
## character into |$self->{nc}| -- confirm against the preprocessor.
202     !!!next-input-character;
## Queue of pending tokens: |_get_next_token| returns entries from the
## front of this array before it tokenizes any further input.
203     $self->{token} = [];
204     # $self->{escape}
205     } # _initialize_tokenizer
206    
207     ## A token has:
208     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
209     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
210     ## ->{name} (DOCTYPE_TOKEN)
211     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
212     ## ->{pubid} (DOCTYPE_TOKEN)
213     ## ->{sysid} (DOCTYPE_TOKEN)
214     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
215     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
216     ## ->{name}
217     ## ->{value}
218     ## ->{has_reference} == 1 or 0
219     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
220 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
221 wakaba 1.1 ## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
222     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
223     ## while the token is pushed back to the stack.
224    
225     ## Emitted token MUST immediately be handled by the tree construction state.
226    
227     ## Before each step, UA MAY check to see if either one of the scripts in
228     ## "list of scripts that will execute as soon as possible" or the first
229     ## script in the "list of scripts that will execute asynchronously",
230     ## has completed loading. If one has, then it MUST be executed
231     ## and removed from the list.
232    
233     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
234     ## (This requirement was dropped from HTML5 spec, unfortunately.)
235    
## Lookup table of the code points that the tokenizer treats as white
## space; it is queried as |$is_space->{$self->{nc}}|.  Only HT, LF, FF
## and SP are present: the VT and CR entries are commented out, so
## looking either of them up yields a false value.
236     my $is_space = {
237     0x0009 => 1, # CHARACTER TABULATION (HT)
238     0x000A => 1, # LINE FEED (LF)
239     #0x000B => 0, # LINE TABULATION (VT)
240     0x000C => 1, # FORM FEED (FF)
241     #0x000D => 1, # CARRIAGE RETURN (CR)
242     0x0020 => 1, # SPACE (SP)
243     };
244    
245     sub _get_next_token ($) {
246     my $self = shift;
247    
248     if ($self->{self_closing}) {
249     !!!parse-error (type => 'nestc', token => $self->{ct});
250     ## NOTE: The |self_closing| flag is only set by start tag token.
251     ## In addition, when a start tag token is emitted, it is always set to
252     ## |ct|.
253     delete $self->{self_closing};
254     }
255    
256     if (@{$self->{token}}) {
257     $self->{self_closing} = $self->{token}->[0]->{self_closing};
258     return shift @{$self->{token}};
259     }
260    
261     A: {
262     if ($self->{state} == PCDATA_STATE) {
263     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
264    
265     if ($self->{nc} == 0x0026) { # &
266     !!!cp (0.1);
267     ## NOTE: In the spec, the tokenizer is switched to the
268     ## "entity data state". In this implementation, the tokenizer
269     ## is switched to the |ENTITY_STATE|, which is an implementation
270     ## of the "consume a character reference" algorithm.
271     $self->{entity_add} = -1;
272     $self->{prev_state} = DATA_STATE;
273     $self->{state} = ENTITY_STATE;
274     !!!next-input-character;
275     redo A;
276     } elsif ($self->{nc} == 0x003C) { # <
277     !!!cp (0.2);
278     $self->{state} = TAG_OPEN_STATE;
279     !!!next-input-character;
280     redo A;
281     } elsif ($self->{nc} == -1) {
282     !!!cp (0.3);
283     !!!emit ({type => END_OF_FILE_TOKEN,
284     line => $self->{line}, column => $self->{column}});
285     last A; ## TODO: ok?
286     } else {
287     !!!cp (0.4);
288     #
289     }
290    
291     # Anything else
292     my $token = {type => CHARACTER_TOKEN,
293     data => chr $self->{nc},
294     line => $self->{line}, column => $self->{column},
295     };
296     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
297    
298     ## Stay in the state.
299     !!!next-input-character;
300     !!!emit ($token);
301     redo A;
302     } elsif ($self->{state} == DATA_STATE) {
303     $self->{s_kwd} = '' unless defined $self->{s_kwd};
304     if ($self->{nc} == 0x0026) { # &
305     $self->{s_kwd} = '';
306     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
307     not $self->{escape}) {
308     !!!cp (1);
309     ## NOTE: In the spec, the tokenizer is switched to the
310     ## "entity data state". In this implementation, the tokenizer
311     ## is switched to the |ENTITY_STATE|, which is an implementation
312     ## of the "consume a character reference" algorithm.
313     $self->{entity_add} = -1;
314     $self->{prev_state} = DATA_STATE;
315     $self->{state} = ENTITY_STATE;
316     !!!next-input-character;
317     redo A;
318     } else {
319     !!!cp (2);
320     #
321     }
322     } elsif ($self->{nc} == 0x002D) { # -
323     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
324 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
325 wakaba 1.1 !!!cp (3);
326     $self->{escape} = 1; # unless $self->{escape};
327     $self->{s_kwd} = '--';
328     #
329 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
330 wakaba 1.1 !!!cp (4);
331     $self->{s_kwd} = '--';
332     #
333 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
334     !!!cp (4.1);
335     $self->{s_kwd} .= '-';
336     #
337 wakaba 1.1 } else {
338     !!!cp (5);
339 wakaba 1.5 $self->{s_kwd} = '-';
340 wakaba 1.1 #
341     }
342     }
343    
344     #
345     } elsif ($self->{nc} == 0x0021) { # !
346     if (length $self->{s_kwd}) {
347     !!!cp (5.1);
348     $self->{s_kwd} .= '!';
349     #
350     } else {
351     !!!cp (5.2);
352     #$self->{s_kwd} = '';
353     #
354     }
355     #
356     } elsif ($self->{nc} == 0x003C) { # <
357     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
358     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
359     not $self->{escape})) {
360     !!!cp (6);
361     $self->{state} = TAG_OPEN_STATE;
362     !!!next-input-character;
363     redo A;
364     } else {
365     !!!cp (7);
366     $self->{s_kwd} = '';
367     #
368     }
369     } elsif ($self->{nc} == 0x003E) { # >
370     if ($self->{escape} and
371     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
372     if ($self->{s_kwd} eq '--') {
373     !!!cp (8);
374     delete $self->{escape};
375 wakaba 1.5 #
376 wakaba 1.1 } else {
377     !!!cp (9);
378 wakaba 1.5 #
379 wakaba 1.1 }
380 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
381     !!!cp (9.1);
382     !!!parse-error (type => 'unmatched mse', ## TODO: type
383     line => $self->{line_prev},
384     column => $self->{column_prev} - 1);
385     #
386 wakaba 1.1 } else {
387     !!!cp (10);
388 wakaba 1.5 #
389 wakaba 1.1 }
390    
391     $self->{s_kwd} = '';
392     #
393 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
394     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
395     !!!cp (10.1);
396     $self->{s_kwd} .= ']';
397     } elsif ($self->{s_kwd} eq ']]') {
398     !!!cp (10.2);
399     #
400     } else {
401     !!!cp (10.3);
402     $self->{s_kwd} = '';
403     }
404     #
405 wakaba 1.1 } elsif ($self->{nc} == -1) {
406     !!!cp (11);
407     $self->{s_kwd} = '';
408     !!!emit ({type => END_OF_FILE_TOKEN,
409     line => $self->{line}, column => $self->{column}});
410     last A; ## TODO: ok?
411     } else {
412     !!!cp (12);
413     $self->{s_kwd} = '';
414     #
415     }
416    
417     # Anything else
418     my $token = {type => CHARACTER_TOKEN,
419     data => chr $self->{nc},
420     line => $self->{line}, column => $self->{column},
421     };
422 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
423 wakaba 1.1 length $token->{data})) {
424     $self->{s_kwd} = '';
425     }
426    
427     ## Stay in the data state.
428 wakaba 1.5 if (not $self->{is_xml} and
429     $self->{content_model} == PCDATA_CONTENT_MODEL) {
430 wakaba 1.1 !!!cp (13);
431     $self->{state} = PCDATA_STATE;
432     } else {
433     !!!cp (14);
434     ## Stay in the state.
435     }
436     !!!next-input-character;
437     !!!emit ($token);
438     redo A;
439     } elsif ($self->{state} == TAG_OPEN_STATE) {
440 wakaba 1.10 ## XML5: "tag state".
441    
442 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443     if ($self->{nc} == 0x002F) { # /
444     !!!cp (15);
445     !!!next-input-character;
446     $self->{state} = CLOSE_TAG_OPEN_STATE;
447     redo A;
448     } elsif ($self->{nc} == 0x0021) { # !
449     !!!cp (15.1);
450     $self->{s_kwd} = '<' unless $self->{escape};
451     #
452     } else {
453     !!!cp (16);
454     #
455     }
456    
457     ## reconsume
458     $self->{state} = DATA_STATE;
459 wakaba 1.5 $self->{s_kwd} = '';
460 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN, data => '<',
461     line => $self->{line_prev},
462     column => $self->{column_prev},
463     });
464     redo A;
465     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
466     if ($self->{nc} == 0x0021) { # !
467     !!!cp (17);
468     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
469     !!!next-input-character;
470     redo A;
471     } elsif ($self->{nc} == 0x002F) { # /
472     !!!cp (18);
473     $self->{state} = CLOSE_TAG_OPEN_STATE;
474     !!!next-input-character;
475     redo A;
476     } elsif (0x0041 <= $self->{nc} and
477     $self->{nc} <= 0x005A) { # A..Z
478     !!!cp (19);
479     $self->{ct}
480     = {type => START_TAG_TOKEN,
481 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
482 wakaba 1.1 line => $self->{line_prev},
483     column => $self->{column_prev}};
484     $self->{state} = TAG_NAME_STATE;
485     !!!next-input-character;
486     redo A;
487     } elsif (0x0061 <= $self->{nc} and
488     $self->{nc} <= 0x007A) { # a..z
489     !!!cp (20);
490     $self->{ct} = {type => START_TAG_TOKEN,
491     tag_name => chr ($self->{nc}),
492     line => $self->{line_prev},
493     column => $self->{column_prev}};
494     $self->{state} = TAG_NAME_STATE;
495     !!!next-input-character;
496     redo A;
497     } elsif ($self->{nc} == 0x003E) { # >
498     !!!cp (21);
499     !!!parse-error (type => 'empty start tag',
500     line => $self->{line_prev},
501     column => $self->{column_prev});
502     $self->{state} = DATA_STATE;
503 wakaba 1.5 $self->{s_kwd} = '';
504 wakaba 1.1 !!!next-input-character;
505    
506     !!!emit ({type => CHARACTER_TOKEN, data => '<>',
507     line => $self->{line_prev},
508     column => $self->{column_prev},
509     });
510    
511     redo A;
512     } elsif ($self->{nc} == 0x003F) { # ?
513 wakaba 1.8 if ($self->{is_xml}) {
514     !!!cp (22.1);
515     $self->{state} = PI_STATE;
516     !!!next-input-character;
517     redo A;
518     } else {
519     !!!cp (22);
520     !!!parse-error (type => 'pio',
521     line => $self->{line_prev},
522     column => $self->{column_prev});
523     $self->{state} = BOGUS_COMMENT_STATE;
524     $self->{ct} = {type => COMMENT_TOKEN, data => '',
525     line => $self->{line_prev},
526     column => $self->{column_prev},
527     };
528     ## $self->{nc} is intentionally left as is
529     redo A;
530     }
531 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
532 wakaba 1.1 !!!cp (23);
533     !!!parse-error (type => 'bare stago',
534     line => $self->{line_prev},
535     column => $self->{column_prev});
536     $self->{state} = DATA_STATE;
537 wakaba 1.5 $self->{s_kwd} = '';
538 wakaba 1.1 ## reconsume
539    
540     !!!emit ({type => CHARACTER_TOKEN, data => '<',
541     line => $self->{line_prev},
542     column => $self->{column_prev},
543     });
544    
545     redo A;
546 wakaba 1.9 } else {
547     ## XML5: "<:" is a parse error.
548     !!!cp (23.1);
549     $self->{ct} = {type => START_TAG_TOKEN,
550     tag_name => chr ($self->{nc}),
551     line => $self->{line_prev},
552     column => $self->{column_prev}};
553     $self->{state} = TAG_NAME_STATE;
554     !!!next-input-character;
555     redo A;
556 wakaba 1.1 }
557     } else {
558     die "$0: $self->{content_model} in tag open";
559     }
560     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
561     ## NOTE: The "close tag open state" in the spec is implemented as
562     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
563    
564 wakaba 1.10 ## XML5: "end tag state".
565    
566 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
567     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
568     if (defined $self->{last_stag_name}) {
569     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
570     $self->{s_kwd} = '';
571     ## Reconsume.
572     redo A;
573     } else {
574     ## No start tag token has ever been emitted
575     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
576     !!!cp (28);
577     $self->{state} = DATA_STATE;
578 wakaba 1.5 $self->{s_kwd} = '';
579 wakaba 1.1 ## Reconsume.
580     !!!emit ({type => CHARACTER_TOKEN, data => '</',
581     line => $l, column => $c,
582     });
583     redo A;
584     }
585     }
586    
587     if (0x0041 <= $self->{nc} and
588     $self->{nc} <= 0x005A) { # A..Z
589     !!!cp (29);
590     $self->{ct}
591     = {type => END_TAG_TOKEN,
592 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
593 wakaba 1.1 line => $l, column => $c};
594     $self->{state} = TAG_NAME_STATE;
595     !!!next-input-character;
596     redo A;
597     } elsif (0x0061 <= $self->{nc} and
598     $self->{nc} <= 0x007A) { # a..z
599     !!!cp (30);
600     $self->{ct} = {type => END_TAG_TOKEN,
601     tag_name => chr ($self->{nc}),
602     line => $l, column => $c};
603     $self->{state} = TAG_NAME_STATE;
604     !!!next-input-character;
605     redo A;
606     } elsif ($self->{nc} == 0x003E) { # >
607     !!!parse-error (type => 'empty end tag',
608     line => $self->{line_prev}, ## "<" in "</>"
609     column => $self->{column_prev} - 1);
610     $self->{state} = DATA_STATE;
611 wakaba 1.5 $self->{s_kwd} = '';
612 wakaba 1.10 if ($self->{is_xml}) {
613     !!!cp (31);
614     ## XML5: No parse error.
615    
616     ## NOTE: This parser raises a parse error, since it supports
617     ## XML1, not XML5.
618    
619     ## NOTE: A short end tag token.
620     my $ct = {type => END_TAG_TOKEN,
621     tag_name => '',
622     line => $self->{line_prev},
623     column => $self->{column_prev} - 1,
624     };
625     !!!next-input-character;
626     !!!emit ($ct);
627     } else {
628     !!!cp (31.1);
629     !!!next-input-character;
630     }
631 wakaba 1.1 redo A;
632     } elsif ($self->{nc} == -1) {
633     !!!cp (32);
634     !!!parse-error (type => 'bare etago');
635 wakaba 1.5 $self->{s_kwd} = '';
636 wakaba 1.1 $self->{state} = DATA_STATE;
637     # reconsume
638    
639     !!!emit ({type => CHARACTER_TOKEN, data => '</',
640     line => $l, column => $c,
641     });
642    
643     redo A;
644 wakaba 1.10 } elsif (not $self->{is_xml} or
645     $is_space->{$self->{nc}}) {
646 wakaba 1.1 !!!cp (33);
647 wakaba 1.10 !!!parse-error (type => 'bogus end tag',
648     line => $self->{line_prev}, # "<" of "</"
649     column => $self->{column_prev} - 1);
650 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
651     $self->{ct} = {type => COMMENT_TOKEN, data => '',
652     line => $self->{line_prev}, # "<" of "</"
653     column => $self->{column_prev} - 1,
654     };
655     ## NOTE: $self->{nc} is intentionally left as is.
656     ## Although the "anything else" case of the spec does not explicitly
657     ## state that the next input character is to be reconsumed,
658     ## it will be included in the |data| of the comment token
659     ## generated from the bogus end tag, as defined in the
660     ## "bogus comment state" entry.
661     redo A;
662 wakaba 1.10 } else {
663     ## XML5: "</:" is a parse error.
664     !!!cp (30.1);
665     $self->{ct} = {type => END_TAG_TOKEN,
666     tag_name => chr ($self->{nc}),
667     line => $l, column => $c};
668     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
669     !!!next-input-character;
670     redo A;
671 wakaba 1.1 }
672     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
673     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
674     if (length $ch) {
675     my $CH = $ch;
676     $ch =~ tr/a-z/A-Z/;
677     my $nch = chr $self->{nc};
678     if ($nch eq $ch or $nch eq $CH) {
679     !!!cp (24);
680     ## Stay in the state.
681     $self->{s_kwd} .= $nch;
682     !!!next-input-character;
683     redo A;
684     } else {
685     !!!cp (25);
686     $self->{state} = DATA_STATE;
687 wakaba 1.5 $self->{s_kwd} = '';
688 wakaba 1.1 ## Reconsume.
689     !!!emit ({type => CHARACTER_TOKEN,
690     data => '</' . $self->{s_kwd},
691     line => $self->{line_prev},
692     column => $self->{column_prev} - 1 - length $self->{s_kwd},
693     });
694     redo A;
695     }
696     } else { # after "<{tag-name}"
697     unless ($is_space->{$self->{nc}} or
698     {
699     0x003E => 1, # >
700     0x002F => 1, # /
701     -1 => 1, # EOF
702     }->{$self->{nc}}) {
703     !!!cp (26);
704     ## Reconsume.
705     $self->{state} = DATA_STATE;
706 wakaba 1.5 $self->{s_kwd} = '';
707 wakaba 1.1 !!!emit ({type => CHARACTER_TOKEN,
708     data => '</' . $self->{s_kwd},
709     line => $self->{line_prev},
710     column => $self->{column_prev} - 1 - length $self->{s_kwd},
711     });
712     redo A;
713     } else {
714     !!!cp (27);
715     $self->{ct}
716     = {type => END_TAG_TOKEN,
717     tag_name => $self->{last_stag_name},
718     line => $self->{line_prev},
719     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
720     $self->{state} = TAG_NAME_STATE;
721     ## Reconsume.
722     redo A;
723     }
724     }
725     } elsif ($self->{state} == TAG_NAME_STATE) {
726     if ($is_space->{$self->{nc}}) {
727     !!!cp (34);
728     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
729     !!!next-input-character;
730     redo A;
731     } elsif ($self->{nc} == 0x003E) { # >
732     if ($self->{ct}->{type} == START_TAG_TOKEN) {
733     !!!cp (35);
734     $self->{last_stag_name} = $self->{ct}->{tag_name};
735     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
736     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
737     #if ($self->{ct}->{attributes}) {
738     # ## NOTE: This should never be reached.
739     # !!! cp (36);
740     # !!! parse-error (type => 'end tag attribute');
741     #} else {
742     !!!cp (37);
743     #}
744     } else {
745     die "$0: $self->{ct}->{type}: Unknown token type";
746     }
747     $self->{state} = DATA_STATE;
748 wakaba 1.5 $self->{s_kwd} = '';
749 wakaba 1.1 !!!next-input-character;
750    
751     !!!emit ($self->{ct}); # start tag or end tag
752    
753     redo A;
754     } elsif (0x0041 <= $self->{nc} and
755     $self->{nc} <= 0x005A) { # A..Z
756     !!!cp (38);
757 wakaba 1.4 $self->{ct}->{tag_name}
758     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
759 wakaba 1.1 # start tag or end tag
760     ## Stay in this state
761     !!!next-input-character;
762     redo A;
763     } elsif ($self->{nc} == -1) {
764     !!!parse-error (type => 'unclosed tag');
765     if ($self->{ct}->{type} == START_TAG_TOKEN) {
766     !!!cp (39);
767     $self->{last_stag_name} = $self->{ct}->{tag_name};
768     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
769     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
770     #if ($self->{ct}->{attributes}) {
771     # ## NOTE: This state should never be reached.
772     # !!! cp (40);
773     # !!! parse-error (type => 'end tag attribute');
774     #} else {
775     !!!cp (41);
776     #}
777     } else {
778     die "$0: $self->{ct}->{type}: Unknown token type";
779     }
780     $self->{state} = DATA_STATE;
781 wakaba 1.5 $self->{s_kwd} = '';
782 wakaba 1.1 # reconsume
783    
784     !!!emit ($self->{ct}); # start tag or end tag
785    
786     redo A;
787     } elsif ($self->{nc} == 0x002F) { # /
788     !!!cp (42);
789     $self->{state} = SELF_CLOSING_START_TAG_STATE;
790     !!!next-input-character;
791     redo A;
792     } else {
793     !!!cp (44);
794     $self->{ct}->{tag_name} .= chr $self->{nc};
795     # start tag or end tag
796     ## Stay in the state
797     !!!next-input-character;
798     redo A;
799     }
800     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
801     if ($is_space->{$self->{nc}}) {
802     !!!cp (45);
803     ## Stay in the state
804     !!!next-input-character;
805     redo A;
806     } elsif ($self->{nc} == 0x003E) { # >
807     if ($self->{ct}->{type} == START_TAG_TOKEN) {
808     !!!cp (46);
809     $self->{last_stag_name} = $self->{ct}->{tag_name};
810     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
811     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
812     if ($self->{ct}->{attributes}) {
813     !!!cp (47);
814     !!!parse-error (type => 'end tag attribute');
815     } else {
816     !!!cp (48);
817     }
818     } else {
819     die "$0: $self->{ct}->{type}: Unknown token type";
820     }
821     $self->{state} = DATA_STATE;
822 wakaba 1.5 $self->{s_kwd} = '';
823 wakaba 1.1 !!!next-input-character;
824    
825     !!!emit ($self->{ct}); # start tag or end tag
826    
827     redo A;
828     } elsif (0x0041 <= $self->{nc} and
829     $self->{nc} <= 0x005A) { # A..Z
830     !!!cp (49);
831     $self->{ca}
832 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
833 wakaba 1.1 value => '',
834     line => $self->{line}, column => $self->{column}};
835     $self->{state} = ATTRIBUTE_NAME_STATE;
836     !!!next-input-character;
837     redo A;
838     } elsif ($self->{nc} == 0x002F) { # /
839     !!!cp (50);
840     $self->{state} = SELF_CLOSING_START_TAG_STATE;
841     !!!next-input-character;
842     redo A;
843     } elsif ($self->{nc} == -1) {
844     !!!parse-error (type => 'unclosed tag');
845     if ($self->{ct}->{type} == START_TAG_TOKEN) {
846     !!!cp (52);
847     $self->{last_stag_name} = $self->{ct}->{tag_name};
848     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
849     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
850     if ($self->{ct}->{attributes}) {
851     !!!cp (53);
852     !!!parse-error (type => 'end tag attribute');
853     } else {
854     !!!cp (54);
855     }
856     } else {
857     die "$0: $self->{ct}->{type}: Unknown token type";
858     }
859     $self->{state} = DATA_STATE;
860 wakaba 1.5 $self->{s_kwd} = '';
861 wakaba 1.1 # reconsume
862    
863     !!!emit ($self->{ct}); # start tag or end tag
864    
865     redo A;
866     } else {
867     if ({
868     0x0022 => 1, # "
869     0x0027 => 1, # '
870     0x003D => 1, # =
871     }->{$self->{nc}}) {
872     !!!cp (55);
873     !!!parse-error (type => 'bad attribute name');
874     } else {
875     !!!cp (56);
876     }
877     $self->{ca}
878     = {name => chr ($self->{nc}),
879     value => '',
880     line => $self->{line}, column => $self->{column}};
881     $self->{state} = ATTRIBUTE_NAME_STATE;
882     !!!next-input-character;
883     redo A;
884     }
885     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
886     my $before_leave = sub {
887     if (exists $self->{ct}->{attributes} # start tag or end tag
888     ->{$self->{ca}->{name}}) { # MUST
889     !!!cp (57);
890     !!!parse-error (type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
891     ## Discard $self->{ca} # MUST
892     } else {
893     !!!cp (58);
894     $self->{ct}->{attributes}->{$self->{ca}->{name}}
895     = $self->{ca};
896     }
897     }; # $before_leave
898    
899     if ($is_space->{$self->{nc}}) {
900     !!!cp (59);
901     $before_leave->();
902     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
903     !!!next-input-character;
904     redo A;
905     } elsif ($self->{nc} == 0x003D) { # =
906     !!!cp (60);
907     $before_leave->();
908     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
909     !!!next-input-character;
910     redo A;
911     } elsif ($self->{nc} == 0x003E) { # >
912     $before_leave->();
913     if ($self->{ct}->{type} == START_TAG_TOKEN) {
914     !!!cp (61);
915     $self->{last_stag_name} = $self->{ct}->{tag_name};
916     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
917     !!!cp (62);
918     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
919     if ($self->{ct}->{attributes}) {
920     !!!parse-error (type => 'end tag attribute');
921     }
922     } else {
923     die "$0: $self->{ct}->{type}: Unknown token type";
924     }
925     $self->{state} = DATA_STATE;
926 wakaba 1.5 $self->{s_kwd} = '';
927 wakaba 1.1 !!!next-input-character;
928    
929     !!!emit ($self->{ct}); # start tag or end tag
930    
931     redo A;
932     } elsif (0x0041 <= $self->{nc} and
933     $self->{nc} <= 0x005A) { # A..Z
934     !!!cp (63);
935 wakaba 1.4 $self->{ca}->{name}
936     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
937 wakaba 1.1 ## Stay in the state
938     !!!next-input-character;
939     redo A;
940     } elsif ($self->{nc} == 0x002F) { # /
941     !!!cp (64);
942     $before_leave->();
943     $self->{state} = SELF_CLOSING_START_TAG_STATE;
944     !!!next-input-character;
945     redo A;
946     } elsif ($self->{nc} == -1) {
947     !!!parse-error (type => 'unclosed tag');
948     $before_leave->();
949     if ($self->{ct}->{type} == START_TAG_TOKEN) {
950     !!!cp (66);
951     $self->{last_stag_name} = $self->{ct}->{tag_name};
952     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
953     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
954     if ($self->{ct}->{attributes}) {
955     !!!cp (67);
956     !!!parse-error (type => 'end tag attribute');
957     } else {
958     ## NOTE: This state should never be reached.
959     !!!cp (68);
960     }
961     } else {
962     die "$0: $self->{ct}->{type}: Unknown token type";
963     }
964     $self->{state} = DATA_STATE;
965 wakaba 1.5 $self->{s_kwd} = '';
966 wakaba 1.1 # reconsume
967    
968     !!!emit ($self->{ct}); # start tag or end tag
969    
970     redo A;
971     } else {
972     if ($self->{nc} == 0x0022 or # "
973     $self->{nc} == 0x0027) { # '
974     !!!cp (69);
975     !!!parse-error (type => 'bad attribute name');
976     } else {
977     !!!cp (70);
978     }
979     $self->{ca}->{name} .= chr ($self->{nc});
980     ## Stay in the state
981     !!!next-input-character;
982     redo A;
983     }
984     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
985     if ($is_space->{$self->{nc}}) {
986     !!!cp (71);
987     ## Stay in the state
988     !!!next-input-character;
989     redo A;
990     } elsif ($self->{nc} == 0x003D) { # =
991     !!!cp (72);
992     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
993     !!!next-input-character;
994     redo A;
995     } elsif ($self->{nc} == 0x003E) { # >
996     if ($self->{ct}->{type} == START_TAG_TOKEN) {
997     !!!cp (73);
998     $self->{last_stag_name} = $self->{ct}->{tag_name};
999     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1000     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1001     if ($self->{ct}->{attributes}) {
1002     !!!cp (74);
1003     !!!parse-error (type => 'end tag attribute');
1004     } else {
1005     ## NOTE: This state should never be reached.
1006     !!!cp (75);
1007     }
1008     } else {
1009     die "$0: $self->{ct}->{type}: Unknown token type";
1010     }
1011     $self->{state} = DATA_STATE;
1012 wakaba 1.5 $self->{s_kwd} = '';
1013 wakaba 1.1 !!!next-input-character;
1014    
1015     !!!emit ($self->{ct}); # start tag or end tag
1016    
1017     redo A;
1018     } elsif (0x0041 <= $self->{nc} and
1019     $self->{nc} <= 0x005A) { # A..Z
1020     !!!cp (76);
1021     $self->{ca}
1022 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1023 wakaba 1.1 value => '',
1024     line => $self->{line}, column => $self->{column}};
1025     $self->{state} = ATTRIBUTE_NAME_STATE;
1026     !!!next-input-character;
1027     redo A;
1028     } elsif ($self->{nc} == 0x002F) { # /
1029     !!!cp (77);
1030     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1031     !!!next-input-character;
1032     redo A;
1033     } elsif ($self->{nc} == -1) {
1034     !!!parse-error (type => 'unclosed tag');
1035     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1036     !!!cp (79);
1037     $self->{last_stag_name} = $self->{ct}->{tag_name};
1038     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1039     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1040     if ($self->{ct}->{attributes}) {
1041     !!!cp (80);
1042     !!!parse-error (type => 'end tag attribute');
1043     } else {
1044     ## NOTE: This state should never be reached.
1045     !!!cp (81);
1046     }
1047     } else {
1048     die "$0: $self->{ct}->{type}: Unknown token type";
1049     }
1050 wakaba 1.5 $self->{s_kwd} = '';
1051 wakaba 1.1 $self->{state} = DATA_STATE;
1052     # reconsume
1053    
1054     !!!emit ($self->{ct}); # start tag or end tag
1055    
1056     redo A;
1057     } else {
1058     if ($self->{nc} == 0x0022 or # "
1059     $self->{nc} == 0x0027) { # '
1060     !!!cp (78);
1061     !!!parse-error (type => 'bad attribute name');
1062     } else {
1063     !!!cp (82);
1064     }
1065     $self->{ca}
1066     = {name => chr ($self->{nc}),
1067     value => '',
1068     line => $self->{line}, column => $self->{column}};
1069     $self->{state} = ATTRIBUTE_NAME_STATE;
1070     !!!next-input-character;
1071     redo A;
1072     }
1073     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1074     if ($is_space->{$self->{nc}}) {
1075     !!!cp (83);
1076     ## Stay in the state
1077     !!!next-input-character;
1078     redo A;
1079     } elsif ($self->{nc} == 0x0022) { # "
1080     !!!cp (84);
1081     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1082     !!!next-input-character;
1083     redo A;
1084     } elsif ($self->{nc} == 0x0026) { # &
1085     !!!cp (85);
1086     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1087     ## reconsume
1088     redo A;
1089     } elsif ($self->{nc} == 0x0027) { # '
1090     !!!cp (86);
1091     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1092     !!!next-input-character;
1093     redo A;
1094     } elsif ($self->{nc} == 0x003E) { # >
1095     !!!parse-error (type => 'empty unquoted attribute value');
1096     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1097     !!!cp (87);
1098     $self->{last_stag_name} = $self->{ct}->{tag_name};
1099     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1100     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1101     if ($self->{ct}->{attributes}) {
1102     !!!cp (88);
1103     !!!parse-error (type => 'end tag attribute');
1104     } else {
1105     ## NOTE: This state should never be reached.
1106     !!!cp (89);
1107     }
1108     } else {
1109     die "$0: $self->{ct}->{type}: Unknown token type";
1110     }
1111     $self->{state} = DATA_STATE;
1112 wakaba 1.5 $self->{s_kwd} = '';
1113 wakaba 1.1 !!!next-input-character;
1114    
1115     !!!emit ($self->{ct}); # start tag or end tag
1116    
1117     redo A;
1118     } elsif ($self->{nc} == -1) {
1119     !!!parse-error (type => 'unclosed tag');
1120     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1121     !!!cp (90);
1122     $self->{last_stag_name} = $self->{ct}->{tag_name};
1123     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1124     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1125     if ($self->{ct}->{attributes}) {
1126     !!!cp (91);
1127     !!!parse-error (type => 'end tag attribute');
1128     } else {
1129     ## NOTE: This state should never be reached.
1130     !!!cp (92);
1131     }
1132     } else {
1133     die "$0: $self->{ct}->{type}: Unknown token type";
1134     }
1135     $self->{state} = DATA_STATE;
1136 wakaba 1.5 $self->{s_kwd} = '';
1137 wakaba 1.1 ## reconsume
1138    
1139     !!!emit ($self->{ct}); # start tag or end tag
1140    
1141     redo A;
1142     } else {
1143     if ($self->{nc} == 0x003D) { # =
1144     !!!cp (93);
1145     !!!parse-error (type => 'bad attribute value');
1146     } else {
1147     !!!cp (94);
1148     }
1149     $self->{ca}->{value} .= chr ($self->{nc});
1150     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1151     !!!next-input-character;
1152     redo A;
1153     }
1154     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1155     if ($self->{nc} == 0x0022) { # "
1156     !!!cp (95);
1157     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1158     !!!next-input-character;
1159     redo A;
1160     } elsif ($self->{nc} == 0x0026) { # &
1161     !!!cp (96);
1162     ## NOTE: In the spec, the tokenizer is switched to the
1163     ## "entity in attribute value state". In this implementation, the
1164     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1165     ## implementation of the "consume a character reference" algorithm.
1166     $self->{prev_state} = $self->{state};
1167     $self->{entity_add} = 0x0022; # "
1168     $self->{state} = ENTITY_STATE;
1169     !!!next-input-character;
1170     redo A;
1171     } elsif ($self->{nc} == -1) {
1172     !!!parse-error (type => 'unclosed attribute value');
1173     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1174     !!!cp (97);
1175     $self->{last_stag_name} = $self->{ct}->{tag_name};
1176     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1177     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1178     if ($self->{ct}->{attributes}) {
1179     !!!cp (98);
1180     !!!parse-error (type => 'end tag attribute');
1181     } else {
1182     ## NOTE: This state should never be reached.
1183     !!!cp (99);
1184     }
1185     } else {
1186     die "$0: $self->{ct}->{type}: Unknown token type";
1187     }
1188     $self->{state} = DATA_STATE;
1189 wakaba 1.5 $self->{s_kwd} = '';
1190 wakaba 1.1 ## reconsume
1191    
1192     !!!emit ($self->{ct}); # start tag or end tag
1193    
1194     redo A;
1195     } else {
1196     !!!cp (100);
1197     $self->{ca}->{value} .= chr ($self->{nc});
1198     $self->{read_until}->($self->{ca}->{value},
1199     q["&],
1200     length $self->{ca}->{value});
1201    
1202     ## Stay in the state
1203     !!!next-input-character;
1204     redo A;
1205     }
1206     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1207     if ($self->{nc} == 0x0027) { # '
1208     !!!cp (101);
1209     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1210     !!!next-input-character;
1211     redo A;
1212     } elsif ($self->{nc} == 0x0026) { # &
1213     !!!cp (102);
1214     ## NOTE: In the spec, the tokenizer is switched to the
1215     ## "entity in attribute value state". In this implementation, the
1216     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1217     ## implementation of the "consume a character reference" algorithm.
1218     $self->{entity_add} = 0x0027; # '
1219     $self->{prev_state} = $self->{state};
1220     $self->{state} = ENTITY_STATE;
1221     !!!next-input-character;
1222     redo A;
1223     } elsif ($self->{nc} == -1) {
1224     !!!parse-error (type => 'unclosed attribute value');
1225     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1226     !!!cp (103);
1227     $self->{last_stag_name} = $self->{ct}->{tag_name};
1228     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1229     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1230     if ($self->{ct}->{attributes}) {
1231     !!!cp (104);
1232     !!!parse-error (type => 'end tag attribute');
1233     } else {
1234     ## NOTE: This state should never be reached.
1235     !!!cp (105);
1236     }
1237     } else {
1238     die "$0: $self->{ct}->{type}: Unknown token type";
1239     }
1240     $self->{state} = DATA_STATE;
1241 wakaba 1.5 $self->{s_kwd} = '';
1242 wakaba 1.1 ## reconsume
1243    
1244     !!!emit ($self->{ct}); # start tag or end tag
1245    
1246     redo A;
1247     } else {
1248     !!!cp (106);
1249     $self->{ca}->{value} .= chr ($self->{nc});
1250     $self->{read_until}->($self->{ca}->{value},
1251     q['&],
1252     length $self->{ca}->{value});
1253    
1254     ## Stay in the state
1255     !!!next-input-character;
1256     redo A;
1257     }
1258     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1259     if ($is_space->{$self->{nc}}) {
1260     !!!cp (107);
1261     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1262     !!!next-input-character;
1263     redo A;
1264     } elsif ($self->{nc} == 0x0026) { # &
1265     !!!cp (108);
1266     ## NOTE: In the spec, the tokenizer is switched to the
1267     ## "entity in attribute value state". In this implementation, the
1268     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1269     ## implementation of the "consume a character reference" algorithm.
1270     $self->{entity_add} = -1;
1271     $self->{prev_state} = $self->{state};
1272     $self->{state} = ENTITY_STATE;
1273     !!!next-input-character;
1274     redo A;
1275     } elsif ($self->{nc} == 0x003E) { # >
1276     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1277     !!!cp (109);
1278     $self->{last_stag_name} = $self->{ct}->{tag_name};
1279     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1280     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1281     if ($self->{ct}->{attributes}) {
1282     !!!cp (110);
1283     !!!parse-error (type => 'end tag attribute');
1284     } else {
1285     ## NOTE: This state should never be reached.
1286     !!!cp (111);
1287     }
1288     } else {
1289     die "$0: $self->{ct}->{type}: Unknown token type";
1290     }
1291     $self->{state} = DATA_STATE;
1292 wakaba 1.5 $self->{s_kwd} = '';
1293 wakaba 1.1 !!!next-input-character;
1294    
1295     !!!emit ($self->{ct}); # start tag or end tag
1296    
1297     redo A;
1298     } elsif ($self->{nc} == -1) {
1299     !!!parse-error (type => 'unclosed tag');
1300     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1301     !!!cp (112);
1302     $self->{last_stag_name} = $self->{ct}->{tag_name};
1303     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1304     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1305     if ($self->{ct}->{attributes}) {
1306     !!!cp (113);
1307     !!!parse-error (type => 'end tag attribute');
1308     } else {
1309     ## NOTE: This state should never be reached.
1310     !!!cp (114);
1311     }
1312     } else {
1313     die "$0: $self->{ct}->{type}: Unknown token type";
1314     }
1315     $self->{state} = DATA_STATE;
1316 wakaba 1.5 $self->{s_kwd} = '';
1317 wakaba 1.1 ## reconsume
1318    
1319     !!!emit ($self->{ct}); # start tag or end tag
1320    
1321     redo A;
1322     } else {
1323     if ({
1324     0x0022 => 1, # "
1325     0x0027 => 1, # '
1326     0x003D => 1, # =
1327     }->{$self->{nc}}) {
1328     !!!cp (115);
1329     !!!parse-error (type => 'bad attribute value');
1330     } else {
1331     !!!cp (116);
1332     }
1333     $self->{ca}->{value} .= chr ($self->{nc});
1334     $self->{read_until}->($self->{ca}->{value},
1335     q["'=& >],
1336     length $self->{ca}->{value});
1337    
1338     ## Stay in the state
1339     !!!next-input-character;
1340     redo A;
1341     }
1342     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1343     if ($is_space->{$self->{nc}}) {
1344     !!!cp (118);
1345     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1346     !!!next-input-character;
1347     redo A;
1348     } elsif ($self->{nc} == 0x003E) { # >
1349     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1350     !!!cp (119);
1351     $self->{last_stag_name} = $self->{ct}->{tag_name};
1352     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1353     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1354     if ($self->{ct}->{attributes}) {
1355     !!!cp (120);
1356     !!!parse-error (type => 'end tag attribute');
1357     } else {
1358     ## NOTE: This state should never be reached.
1359     !!!cp (121);
1360     }
1361     } else {
1362     die "$0: $self->{ct}->{type}: Unknown token type";
1363     }
1364     $self->{state} = DATA_STATE;
1365 wakaba 1.5 $self->{s_kwd} = '';
1366 wakaba 1.1 !!!next-input-character;
1367    
1368     !!!emit ($self->{ct}); # start tag or end tag
1369    
1370     redo A;
1371     } elsif ($self->{nc} == 0x002F) { # /
1372     !!!cp (122);
1373     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1374     !!!next-input-character;
1375     redo A;
1376     } elsif ($self->{nc} == -1) {
1377     !!!parse-error (type => 'unclosed tag');
1378     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1379     !!!cp (122.3);
1380     $self->{last_stag_name} = $self->{ct}->{tag_name};
1381     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1382     if ($self->{ct}->{attributes}) {
1383     !!!cp (122.1);
1384     !!!parse-error (type => 'end tag attribute');
1385     } else {
1386     ## NOTE: This state should never be reached.
1387     !!!cp (122.2);
1388     }
1389     } else {
1390     die "$0: $self->{ct}->{type}: Unknown token type";
1391     }
1392     $self->{state} = DATA_STATE;
1393 wakaba 1.5 $self->{s_kwd} = '';
1394 wakaba 1.1 ## Reconsume.
1395     !!!emit ($self->{ct}); # start tag or end tag
1396     redo A;
1397     } else {
1398     !!!cp ('124.1');
1399     !!!parse-error (type => 'no space between attributes');
1400     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1401     ## reconsume
1402     redo A;
1403     }
1404     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1405     if ($self->{nc} == 0x003E) { # >
1406     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1407     !!!cp ('124.2');
1408     !!!parse-error (type => 'nestc', token => $self->{ct});
1409     ## TODO: Different type than slash in start tag
1410     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1411     if ($self->{ct}->{attributes}) {
1412     !!!cp ('124.4');
1413     !!!parse-error (type => 'end tag attribute');
1414     } else {
1415     !!!cp ('124.5');
1416     }
1417     ## TODO: Test |<title></title/>|
1418     } else {
1419     !!!cp ('124.3');
1420     $self->{self_closing} = 1;
1421     }
1422    
1423     $self->{state} = DATA_STATE;
1424 wakaba 1.5 $self->{s_kwd} = '';
1425 wakaba 1.1 !!!next-input-character;
1426    
1427     !!!emit ($self->{ct}); # start tag or end tag
1428    
1429     redo A;
1430     } elsif ($self->{nc} == -1) {
1431     !!!parse-error (type => 'unclosed tag');
1432     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1433     !!!cp (124.7);
1434     $self->{last_stag_name} = $self->{ct}->{tag_name};
1435     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1436     if ($self->{ct}->{attributes}) {
1437     !!!cp (124.5);
1438     !!!parse-error (type => 'end tag attribute');
1439     } else {
1440     ## NOTE: This state should never be reached.
1441     !!!cp (124.6);
1442     }
1443     } else {
1444     die "$0: $self->{ct}->{type}: Unknown token type";
1445     }
1446     $self->{state} = DATA_STATE;
1447 wakaba 1.5 $self->{s_kwd} = '';
1448 wakaba 1.1 ## Reconsume.
1449     !!!emit ($self->{ct}); # start tag or end tag
1450     redo A;
1451     } else {
1452     !!!cp ('124.4');
1453     !!!parse-error (type => 'nestc');
1454     ## TODO: This error type is wrong.
1455     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1456     ## Reconsume.
1457     redo A;
1458     }
1459     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1460     ## (only happens in the PCDATA state)
1461    
1462     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1463     ## consumes characters on a one-by-one basis.
1464    
1465     if ($self->{nc} == 0x003E) { # >
1466     !!!cp (124);
1467     $self->{state} = DATA_STATE;
1468 wakaba 1.5 $self->{s_kwd} = '';
1469 wakaba 1.1 !!!next-input-character;
1470    
1471     !!!emit ($self->{ct}); # comment
1472     redo A;
1473     } elsif ($self->{nc} == -1) {
1474     !!!cp (125);
1475     $self->{state} = DATA_STATE;
1476 wakaba 1.5 $self->{s_kwd} = '';
1477 wakaba 1.1 ## reconsume
1478    
1479     !!!emit ($self->{ct}); # comment
1480     redo A;
1481     } else {
1482     !!!cp (126);
1483     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1484     $self->{read_until}->($self->{ct}->{data},
1485     q[>],
1486     length $self->{ct}->{data});
1487    
1488     ## Stay in the state.
1489     !!!next-input-character;
1490     redo A;
1491     }
1492     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1493     ## (only happens in the PCDATA state)
1494    
1495     if ($self->{nc} == 0x002D) { # -
1496     !!!cp (133);
1497     $self->{state} = MD_HYPHEN_STATE;
1498     !!!next-input-character;
1499     redo A;
1500     } elsif ($self->{nc} == 0x0044 or # D
1501     $self->{nc} == 0x0064) { # d
1502     ## ASCII case-insensitive.
1503     !!!cp (130);
1504     $self->{state} = MD_DOCTYPE_STATE;
1505     $self->{s_kwd} = chr $self->{nc};
1506     !!!next-input-character;
1507     redo A;
1508 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1509     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
1510     $self->{is_xml}) and
1511 wakaba 1.1 $self->{nc} == 0x005B) { # [
1512     !!!cp (135.4);
1513     $self->{state} = MD_CDATA_STATE;
1514     $self->{s_kwd} = '[';
1515     !!!next-input-character;
1516     redo A;
1517     } else {
1518     !!!cp (136);
1519     }
1520    
1521     !!!parse-error (type => 'bogus comment',
1522     line => $self->{line_prev},
1523     column => $self->{column_prev} - 1);
1524     ## Reconsume.
1525     $self->{state} = BOGUS_COMMENT_STATE;
1526     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1527     line => $self->{line_prev},
1528     column => $self->{column_prev} - 1,
1529     };
1530     redo A;
1531     } elsif ($self->{state} == MD_HYPHEN_STATE) {
1532     if ($self->{nc} == 0x002D) { # -
1533     !!!cp (127);
1534     $self->{ct} = {type => COMMENT_TOKEN, data => '',
1535     line => $self->{line_prev},
1536     column => $self->{column_prev} - 2,
1537     };
1538 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
1539 wakaba 1.1 !!!next-input-character;
1540     redo A;
1541     } else {
1542     !!!cp (128);
1543     !!!parse-error (type => 'bogus comment',
1544     line => $self->{line_prev},
1545     column => $self->{column_prev} - 2);
1546     $self->{state} = BOGUS_COMMENT_STATE;
1547     ## Reconsume.
1548     $self->{ct} = {type => COMMENT_TOKEN,
1549     data => '-',
1550     line => $self->{line_prev},
1551     column => $self->{column_prev} - 2,
1552     };
1553     redo A;
1554     }
1555     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
1556     ## ASCII case-insensitive.
1557     if ($self->{nc} == [
1558     undef,
1559     0x004F, # O
1560     0x0043, # C
1561     0x0054, # T
1562     0x0059, # Y
1563     0x0050, # P
1564     ]->[length $self->{s_kwd}] or
1565     $self->{nc} == [
1566     undef,
1567     0x006F, # o
1568     0x0063, # c
1569     0x0074, # t
1570     0x0079, # y
1571     0x0070, # p
1572     ]->[length $self->{s_kwd}]) {
1573     !!!cp (131);
1574     ## Stay in the state.
1575     $self->{s_kwd} .= chr $self->{nc};
1576     !!!next-input-character;
1577     redo A;
1578     } elsif ((length $self->{s_kwd}) == 6 and
1579     ($self->{nc} == 0x0045 or # E
1580     $self->{nc} == 0x0065)) { # e
1581 wakaba 1.10 if ($self->{s_kwd} ne 'DOCTYP') {
1582     !!!cp (129);
1583     ## XML5: case-sensitive.
1584     !!!parse-error (type => 'lowercase keyword', ## TODO
1585     text => 'DOCTYPE',
1586     line => $self->{line_prev},
1587     column => $self->{column_prev} - 5);
1588     } else {
1589     !!!cp (129.1);
1590     }
1591 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
1592     $self->{ct} = {type => DOCTYPE_TOKEN,
1593     quirks => 1,
1594     line => $self->{line_prev},
1595     column => $self->{column_prev} - 7,
1596     };
1597     !!!next-input-character;
1598     redo A;
1599     } else {
1600     !!!cp (132);
1601     !!!parse-error (type => 'bogus comment',
1602     line => $self->{line_prev},
1603     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1604     $self->{state} = BOGUS_COMMENT_STATE;
1605     ## Reconsume.
1606     $self->{ct} = {type => COMMENT_TOKEN,
1607     data => $self->{s_kwd},
1608     line => $self->{line_prev},
1609     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1610     };
1611     redo A;
1612     }
1613     } elsif ($self->{state} == MD_CDATA_STATE) {
1614     if ($self->{nc} == {
1615     '[' => 0x0043, # C
1616     '[C' => 0x0044, # D
1617     '[CD' => 0x0041, # A
1618     '[CDA' => 0x0054, # T
1619     '[CDAT' => 0x0041, # A
1620     }->{$self->{s_kwd}}) {
1621     !!!cp (135.1);
1622     ## Stay in the state.
1623     $self->{s_kwd} .= chr $self->{nc};
1624     !!!next-input-character;
1625     redo A;
1626     } elsif ($self->{s_kwd} eq '[CDATA' and
1627     $self->{nc} == 0x005B) { # [
1628 wakaba 1.6 if ($self->{is_xml} and
1629     not $self->{tainted} and
1630     @{$self->{open_elements} or []} == 0) {
1631 wakaba 1.8 !!!cp (135.2);
1632 wakaba 1.6 !!!parse-error (type => 'cdata outside of root element',
1633     line => $self->{line_prev},
1634     column => $self->{column_prev} - 7);
1635     $self->{tainted} = 1;
1636 wakaba 1.8 } else {
1637     !!!cp (135.21);
1638 wakaba 1.6 }
1639    
1640 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
1641     data => '',
1642     line => $self->{line_prev},
1643     column => $self->{column_prev} - 7};
1644     $self->{state} = CDATA_SECTION_STATE;
1645     !!!next-input-character;
1646     redo A;
1647     } else {
1648     !!!cp (135.3);
1649     !!!parse-error (type => 'bogus comment',
1650     line => $self->{line_prev},
1651     column => $self->{column_prev} - 1 - length $self->{s_kwd});
1652     $self->{state} = BOGUS_COMMENT_STATE;
1653     ## Reconsume.
1654     $self->{ct} = {type => COMMENT_TOKEN,
1655     data => $self->{s_kwd},
1656     line => $self->{line_prev},
1657     column => $self->{column_prev} - 1 - length $self->{s_kwd},
1658     };
1659     redo A;
1660     }
1661     } elsif ($self->{state} == COMMENT_START_STATE) {
1662     if ($self->{nc} == 0x002D) { # -
1663     !!!cp (137);
1664     $self->{state} = COMMENT_START_DASH_STATE;
1665     !!!next-input-character;
1666     redo A;
1667     } elsif ($self->{nc} == 0x003E) { # >
1668     !!!cp (138);
1669     !!!parse-error (type => 'bogus comment');
1670     $self->{state} = DATA_STATE;
1671 wakaba 1.5 $self->{s_kwd} = '';
1672 wakaba 1.1 !!!next-input-character;
1673    
1674     !!!emit ($self->{ct}); # comment
1675    
1676     redo A;
1677     } elsif ($self->{nc} == -1) {
1678     !!!cp (139);
1679     !!!parse-error (type => 'unclosed comment');
1680     $self->{state} = DATA_STATE;
1681 wakaba 1.5 $self->{s_kwd} = '';
1682 wakaba 1.1 ## reconsume
1683    
1684     !!!emit ($self->{ct}); # comment
1685    
1686     redo A;
1687     } else {
1688     !!!cp (140);
1689     $self->{ct}->{data} # comment
1690     .= chr ($self->{nc});
1691     $self->{state} = COMMENT_STATE;
1692     !!!next-input-character;
1693     redo A;
1694     }
1695     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
1696     if ($self->{nc} == 0x002D) { # -
1697     !!!cp (141);
1698     $self->{state} = COMMENT_END_STATE;
1699     !!!next-input-character;
1700     redo A;
1701     } elsif ($self->{nc} == 0x003E) { # >
1702     !!!cp (142);
1703     !!!parse-error (type => 'bogus comment');
1704     $self->{state} = DATA_STATE;
1705 wakaba 1.5 $self->{s_kwd} = '';
1706 wakaba 1.1 !!!next-input-character;
1707    
1708     !!!emit ($self->{ct}); # comment
1709    
1710     redo A;
1711     } elsif ($self->{nc} == -1) {
1712     !!!cp (143);
1713     !!!parse-error (type => 'unclosed comment');
1714     $self->{state} = DATA_STATE;
1715 wakaba 1.5 $self->{s_kwd} = '';
1716 wakaba 1.1 ## reconsume
1717    
1718     !!!emit ($self->{ct}); # comment
1719    
1720     redo A;
1721     } else {
1722     !!!cp (144);
1723     $self->{ct}->{data} # comment
1724     .= '-' . chr ($self->{nc});
1725     $self->{state} = COMMENT_STATE;
1726     !!!next-input-character;
1727     redo A;
1728     }
1729     } elsif ($self->{state} == COMMENT_STATE) {
1730     if ($self->{nc} == 0x002D) { # -
1731     !!!cp (145);
1732     $self->{state} = COMMENT_END_DASH_STATE;
1733     !!!next-input-character;
1734     redo A;
1735     } elsif ($self->{nc} == -1) {
1736     !!!cp (146);
1737     !!!parse-error (type => 'unclosed comment');
1738     $self->{state} = DATA_STATE;
1739 wakaba 1.5 $self->{s_kwd} = '';
1740 wakaba 1.1 ## reconsume
1741    
1742     !!!emit ($self->{ct}); # comment
1743    
1744     redo A;
1745     } else {
1746     !!!cp (147);
1747     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1748     $self->{read_until}->($self->{ct}->{data},
1749     q[-],
1750     length $self->{ct}->{data});
1751    
1752     ## Stay in the state
1753     !!!next-input-character;
1754     redo A;
1755     }
1756     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
1757 wakaba 1.10 ## XML5: "comment dash state".
1758    
1759 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
1760     !!!cp (148);
1761     $self->{state} = COMMENT_END_STATE;
1762     !!!next-input-character;
1763     redo A;
1764     } elsif ($self->{nc} == -1) {
1765     !!!cp (149);
1766     !!!parse-error (type => 'unclosed comment');
1767 wakaba 1.5 $self->{s_kwd} = '';
1768 wakaba 1.1 $self->{state} = DATA_STATE;
1769 wakaba 1.5 $self->{s_kwd} = '';
1770 wakaba 1.1 ## reconsume
1771    
1772     !!!emit ($self->{ct}); # comment
1773    
1774     redo A;
1775     } else {
1776     !!!cp (150);
1777     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
1778     $self->{state} = COMMENT_STATE;
1779     !!!next-input-character;
1780     redo A;
1781     }
1782     } elsif ($self->{state} == COMMENT_END_STATE) {
1783     if ($self->{nc} == 0x003E) { # >
1784     !!!cp (151);
1785     $self->{state} = DATA_STATE;
1786 wakaba 1.5 $self->{s_kwd} = '';
1787 wakaba 1.1 !!!next-input-character;
1788    
1789     !!!emit ($self->{ct}); # comment
1790    
1791     redo A;
1792     } elsif ($self->{nc} == 0x002D) { # -
1793     !!!cp (152);
1794 wakaba 1.10 ## XML5: Not a parse error.
1795 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1796     line => $self->{line_prev},
1797     column => $self->{column_prev});
1798     $self->{ct}->{data} .= '-'; # comment
1799     ## Stay in the state
1800     !!!next-input-character;
1801     redo A;
1802     } elsif ($self->{nc} == -1) {
1803     !!!cp (153);
1804     !!!parse-error (type => 'unclosed comment');
1805     $self->{state} = DATA_STATE;
1806 wakaba 1.5 $self->{s_kwd} = '';
1807 wakaba 1.1 ## reconsume
1808    
1809     !!!emit ($self->{ct}); # comment
1810    
1811     redo A;
1812     } else {
1813     !!!cp (154);
1814 wakaba 1.10 ## XML5: Not a parse error.
1815 wakaba 1.1 !!!parse-error (type => 'dash in comment',
1816     line => $self->{line_prev},
1817     column => $self->{column_prev});
1818     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
1819     $self->{state} = COMMENT_STATE;
1820     !!!next-input-character;
1821     redo A;
1822     }
1823     } elsif ($self->{state} == DOCTYPE_STATE) {
1824     if ($is_space->{$self->{nc}}) {
1825     !!!cp (155);
1826     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1827     !!!next-input-character;
1828     redo A;
1829     } else {
1830     !!!cp (156);
1831     !!!parse-error (type => 'no space before DOCTYPE name');
1832     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
1833     ## reconsume
1834     redo A;
1835     }
1836     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
1837     if ($is_space->{$self->{nc}}) {
1838     !!!cp (157);
1839     ## Stay in the state
1840     !!!next-input-character;
1841     redo A;
1842     } elsif ($self->{nc} == 0x003E) { # >
1843     !!!cp (158);
1844     !!!parse-error (type => 'no DOCTYPE name');
1845     $self->{state} = DATA_STATE;
1846 wakaba 1.5 $self->{s_kwd} = '';
1847 wakaba 1.1 !!!next-input-character;
1848    
1849     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1850    
1851     redo A;
1852     } elsif ($self->{nc} == -1) {
1853     !!!cp (159);
1854     !!!parse-error (type => 'no DOCTYPE name');
1855     $self->{state} = DATA_STATE;
1856 wakaba 1.5 $self->{s_kwd} = '';
1857 wakaba 1.1 ## reconsume
1858    
1859     !!!emit ($self->{ct}); # DOCTYPE (quirks)
1860    
1861     redo A;
1862     } else {
1863     !!!cp (160);
1864     $self->{ct}->{name} = chr $self->{nc};
1865     delete $self->{ct}->{quirks};
1866     $self->{state} = DOCTYPE_NAME_STATE;
1867     !!!next-input-character;
1868     redo A;
1869     }
1870     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
1871     ## ISSUE: Redundant "First," in the spec.
1872     if ($is_space->{$self->{nc}}) {
1873     !!!cp (161);
1874     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
1875     !!!next-input-character;
1876     redo A;
1877     } elsif ($self->{nc} == 0x003E) { # >
1878     !!!cp (162);
1879     $self->{state} = DATA_STATE;
1880 wakaba 1.5 $self->{s_kwd} = '';
1881 wakaba 1.1 !!!next-input-character;
1882    
1883     !!!emit ($self->{ct}); # DOCTYPE
1884    
1885     redo A;
1886     } elsif ($self->{nc} == -1) {
1887     !!!cp (163);
1888     !!!parse-error (type => 'unclosed DOCTYPE');
1889     $self->{state} = DATA_STATE;
1890 wakaba 1.5 $self->{s_kwd} = '';
1891 wakaba 1.1 ## reconsume
1892    
1893     $self->{ct}->{quirks} = 1;
1894     !!!emit ($self->{ct}); # DOCTYPE
1895    
1896     redo A;
1897     } else {
1898     !!!cp (164);
1899     $self->{ct}->{name}
1900     .= chr ($self->{nc}); # DOCTYPE
1901     ## Stay in the state
1902     !!!next-input-character;
1903     redo A;
1904     }
1905     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
1906     if ($is_space->{$self->{nc}}) {
1907     !!!cp (165);
1908     ## Stay in the state
1909     !!!next-input-character;
1910     redo A;
1911     } elsif ($self->{nc} == 0x003E) { # >
1912     !!!cp (166);
1913     $self->{state} = DATA_STATE;
1914 wakaba 1.5 $self->{s_kwd} = '';
1915 wakaba 1.1 !!!next-input-character;
1916    
1917     !!!emit ($self->{ct}); # DOCTYPE
1918    
1919     redo A;
1920     } elsif ($self->{nc} == -1) {
1921     !!!cp (167);
1922     !!!parse-error (type => 'unclosed DOCTYPE');
1923     $self->{state} = DATA_STATE;
1924 wakaba 1.5 $self->{s_kwd} = '';
1925 wakaba 1.1 ## reconsume
1926    
1927     $self->{ct}->{quirks} = 1;
1928     !!!emit ($self->{ct}); # DOCTYPE
1929    
1930     redo A;
1931     } elsif ($self->{nc} == 0x0050 or # P
1932     $self->{nc} == 0x0070) { # p
1933     $self->{state} = PUBLIC_STATE;
1934     $self->{s_kwd} = chr $self->{nc};
1935     !!!next-input-character;
1936     redo A;
1937     } elsif ($self->{nc} == 0x0053 or # S
1938     $self->{nc} == 0x0073) { # s
1939     $self->{state} = SYSTEM_STATE;
1940     $self->{s_kwd} = chr $self->{nc};
1941     !!!next-input-character;
1942     redo A;
1943     } else {
1944     !!!cp (180);
1945     !!!parse-error (type => 'string after DOCTYPE name');
1946     $self->{ct}->{quirks} = 1;
1947    
1948     $self->{state} = BOGUS_DOCTYPE_STATE;
1949     !!!next-input-character;
1950     redo A;
1951     }
1952     } elsif ($self->{state} == PUBLIC_STATE) {
1953     ## ASCII case-insensitive
1954     if ($self->{nc} == [
1955     undef,
1956     0x0055, # U
1957     0x0042, # B
1958     0x004C, # L
1959     0x0049, # I
1960     ]->[length $self->{s_kwd}] or
1961     $self->{nc} == [
1962     undef,
1963     0x0075, # u
1964     0x0062, # b
1965     0x006C, # l
1966     0x0069, # i
1967     ]->[length $self->{s_kwd}]) {
1968     !!!cp (175);
1969     ## Stay in the state.
1970     $self->{s_kwd} .= chr $self->{nc};
1971     !!!next-input-character;
1972     redo A;
1973     } elsif ((length $self->{s_kwd}) == 5 and
1974     ($self->{nc} == 0x0043 or # C
1975     $self->{nc} == 0x0063)) { # c
1976     !!!cp (168);
1977     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
1978     !!!next-input-character;
1979     redo A;
1980     } else {
1981     !!!cp (169);
1982     !!!parse-error (type => 'string after DOCTYPE name',
1983     line => $self->{line_prev},
1984     column => $self->{column_prev} + 1 - length $self->{s_kwd});
1985     $self->{ct}->{quirks} = 1;
1986    
1987     $self->{state} = BOGUS_DOCTYPE_STATE;
1988     ## Reconsume.
1989     redo A;
1990     }
1991     } elsif ($self->{state} == SYSTEM_STATE) {
1992     ## ASCII case-insensitive
1993     if ($self->{nc} == [
1994     undef,
1995     0x0059, # Y
1996     0x0053, # S
1997     0x0054, # T
1998     0x0045, # E
1999     ]->[length $self->{s_kwd}] or
2000     $self->{nc} == [
2001     undef,
2002     0x0079, # y
2003     0x0073, # s
2004     0x0074, # t
2005     0x0065, # e
2006     ]->[length $self->{s_kwd}]) {
2007     !!!cp (170);
2008     ## Stay in the state.
2009     $self->{s_kwd} .= chr $self->{nc};
2010     !!!next-input-character;
2011     redo A;
2012     } elsif ((length $self->{s_kwd}) == 5 and
2013     ($self->{nc} == 0x004D or # M
2014     $self->{nc} == 0x006D)) { # m
2015     !!!cp (171);
2016     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2017     !!!next-input-character;
2018     redo A;
2019     } else {
2020     !!!cp (172);
2021     !!!parse-error (type => 'string after DOCTYPE name',
2022     line => $self->{line_prev},
2023     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2024     $self->{ct}->{quirks} = 1;
2025    
2026     $self->{state} = BOGUS_DOCTYPE_STATE;
2027     ## Reconsume.
2028     redo A;
2029     }
2030     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2031     if ($is_space->{$self->{nc}}) {
2032     !!!cp (181);
2033     ## Stay in the state
2034     !!!next-input-character;
2035     redo A;
2036     } elsif ($self->{nc} eq 0x0022) { # "
2037     !!!cp (182);
2038     $self->{ct}->{pubid} = ''; # DOCTYPE
2039     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2040     !!!next-input-character;
2041     redo A;
2042     } elsif ($self->{nc} eq 0x0027) { # '
2043     !!!cp (183);
2044     $self->{ct}->{pubid} = ''; # DOCTYPE
2045     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2046     !!!next-input-character;
2047     redo A;
2048     } elsif ($self->{nc} eq 0x003E) { # >
2049     !!!cp (184);
2050     !!!parse-error (type => 'no PUBLIC literal');
2051    
2052     $self->{state} = DATA_STATE;
2053 wakaba 1.5 $self->{s_kwd} = '';
2054 wakaba 1.1 !!!next-input-character;
2055    
2056     $self->{ct}->{quirks} = 1;
2057     !!!emit ($self->{ct}); # DOCTYPE
2058    
2059     redo A;
2060     } elsif ($self->{nc} == -1) {
2061     !!!cp (185);
2062     !!!parse-error (type => 'unclosed DOCTYPE');
2063    
2064     $self->{state} = DATA_STATE;
2065 wakaba 1.5 $self->{s_kwd} = '';
2066 wakaba 1.1 ## reconsume
2067    
2068     $self->{ct}->{quirks} = 1;
2069     !!!emit ($self->{ct}); # DOCTYPE
2070    
2071     redo A;
2072     } else {
2073     !!!cp (186);
2074     !!!parse-error (type => 'string after PUBLIC');
2075     $self->{ct}->{quirks} = 1;
2076    
2077     $self->{state} = BOGUS_DOCTYPE_STATE;
2078     !!!next-input-character;
2079     redo A;
2080     }
2081     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2082     if ($self->{nc} == 0x0022) { # "
2083     !!!cp (187);
2084     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2085     !!!next-input-character;
2086     redo A;
2087     } elsif ($self->{nc} == 0x003E) { # >
2088     !!!cp (188);
2089     !!!parse-error (type => 'unclosed PUBLIC literal');
2090    
2091     $self->{state} = DATA_STATE;
2092 wakaba 1.5 $self->{s_kwd} = '';
2093 wakaba 1.1 !!!next-input-character;
2094    
2095     $self->{ct}->{quirks} = 1;
2096     !!!emit ($self->{ct}); # DOCTYPE
2097    
2098     redo A;
2099     } elsif ($self->{nc} == -1) {
2100     !!!cp (189);
2101     !!!parse-error (type => 'unclosed PUBLIC literal');
2102    
2103     $self->{state} = DATA_STATE;
2104 wakaba 1.5 $self->{s_kwd} = '';
2105 wakaba 1.1 ## reconsume
2106    
2107     $self->{ct}->{quirks} = 1;
2108     !!!emit ($self->{ct}); # DOCTYPE
2109    
2110     redo A;
2111     } else {
2112     !!!cp (190);
2113     $self->{ct}->{pubid} # DOCTYPE
2114     .= chr $self->{nc};
2115     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2116     length $self->{ct}->{pubid});
2117    
2118     ## Stay in the state
2119     !!!next-input-character;
2120     redo A;
2121     }
2122     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2123     if ($self->{nc} == 0x0027) { # '
2124     !!!cp (191);
2125     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2126     !!!next-input-character;
2127     redo A;
2128     } elsif ($self->{nc} == 0x003E) { # >
2129     !!!cp (192);
2130     !!!parse-error (type => 'unclosed PUBLIC literal');
2131    
2132     $self->{state} = DATA_STATE;
2133 wakaba 1.5 $self->{s_kwd} = '';
2134 wakaba 1.1 !!!next-input-character;
2135    
2136     $self->{ct}->{quirks} = 1;
2137     !!!emit ($self->{ct}); # DOCTYPE
2138    
2139     redo A;
2140     } elsif ($self->{nc} == -1) {
2141     !!!cp (193);
2142     !!!parse-error (type => 'unclosed PUBLIC literal');
2143    
2144     $self->{state} = DATA_STATE;
2145 wakaba 1.5 $self->{s_kwd} = '';
2146 wakaba 1.1 ## reconsume
2147    
2148     $self->{ct}->{quirks} = 1;
2149     !!!emit ($self->{ct}); # DOCTYPE
2150    
2151     redo A;
2152     } else {
2153     !!!cp (194);
2154     $self->{ct}->{pubid} # DOCTYPE
2155     .= chr $self->{nc};
2156     $self->{read_until}->($self->{ct}->{pubid}, q['>],
2157     length $self->{ct}->{pubid});
2158    
2159     ## Stay in the state
2160     !!!next-input-character;
2161     redo A;
2162     }
2163     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2164     if ($is_space->{$self->{nc}}) {
2165     !!!cp (195);
2166     ## Stay in the state
2167     !!!next-input-character;
2168     redo A;
2169     } elsif ($self->{nc} == 0x0022) { # "
2170     !!!cp (196);
2171     $self->{ct}->{sysid} = ''; # DOCTYPE
2172     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2173     !!!next-input-character;
2174     redo A;
2175     } elsif ($self->{nc} == 0x0027) { # '
2176     !!!cp (197);
2177     $self->{ct}->{sysid} = ''; # DOCTYPE
2178     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2179     !!!next-input-character;
2180     redo A;
2181     } elsif ($self->{nc} == 0x003E) { # >
2182     !!!cp (198);
2183     $self->{state} = DATA_STATE;
2184 wakaba 1.5 $self->{s_kwd} = '';
2185 wakaba 1.1 !!!next-input-character;
2186    
2187     !!!emit ($self->{ct}); # DOCTYPE
2188    
2189     redo A;
2190     } elsif ($self->{nc} == -1) {
2191     !!!cp (199);
2192     !!!parse-error (type => 'unclosed DOCTYPE');
2193    
2194     $self->{state} = DATA_STATE;
2195 wakaba 1.5 $self->{s_kwd} = '';
2196 wakaba 1.1 ## reconsume
2197    
2198     $self->{ct}->{quirks} = 1;
2199     !!!emit ($self->{ct}); # DOCTYPE
2200    
2201     redo A;
2202     } else {
2203     !!!cp (200);
2204     !!!parse-error (type => 'string after PUBLIC literal');
2205     $self->{ct}->{quirks} = 1;
2206    
2207     $self->{state} = BOGUS_DOCTYPE_STATE;
2208     !!!next-input-character;
2209     redo A;
2210     }
2211     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2212     if ($is_space->{$self->{nc}}) {
2213     !!!cp (201);
2214     ## Stay in the state
2215     !!!next-input-character;
2216     redo A;
2217     } elsif ($self->{nc} == 0x0022) { # "
2218     !!!cp (202);
2219     $self->{ct}->{sysid} = ''; # DOCTYPE
2220     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
2221     !!!next-input-character;
2222     redo A;
2223     } elsif ($self->{nc} == 0x0027) { # '
2224     !!!cp (203);
2225     $self->{ct}->{sysid} = ''; # DOCTYPE
2226     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
2227     !!!next-input-character;
2228     redo A;
2229     } elsif ($self->{nc} == 0x003E) { # >
2230     !!!cp (204);
2231     !!!parse-error (type => 'no SYSTEM literal');
2232     $self->{state} = DATA_STATE;
2233 wakaba 1.5 $self->{s_kwd} = '';
2234 wakaba 1.1 !!!next-input-character;
2235    
2236     $self->{ct}->{quirks} = 1;
2237     !!!emit ($self->{ct}); # DOCTYPE
2238    
2239     redo A;
2240     } elsif ($self->{nc} == -1) {
2241     !!!cp (205);
2242     !!!parse-error (type => 'unclosed DOCTYPE');
2243    
2244     $self->{state} = DATA_STATE;
2245 wakaba 1.5 $self->{s_kwd} = '';
2246 wakaba 1.1 ## reconsume
2247    
2248     $self->{ct}->{quirks} = 1;
2249     !!!emit ($self->{ct}); # DOCTYPE
2250    
2251     redo A;
2252     } else {
2253     !!!cp (206);
2254     !!!parse-error (type => 'string after SYSTEM');
2255     $self->{ct}->{quirks} = 1;
2256    
2257     $self->{state} = BOGUS_DOCTYPE_STATE;
2258     !!!next-input-character;
2259     redo A;
2260     }
2261     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2262     if ($self->{nc} == 0x0022) { # "
2263     !!!cp (207);
2264     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2265     !!!next-input-character;
2266     redo A;
2267     } elsif ($self->{nc} == 0x003E) { # >
2268     !!!cp (208);
2269     !!!parse-error (type => 'unclosed SYSTEM literal');
2270    
2271     $self->{state} = DATA_STATE;
2272 wakaba 1.5 $self->{s_kwd} = '';
2273 wakaba 1.1 !!!next-input-character;
2274    
2275     $self->{ct}->{quirks} = 1;
2276     !!!emit ($self->{ct}); # DOCTYPE
2277    
2278     redo A;
2279     } elsif ($self->{nc} == -1) {
2280     !!!cp (209);
2281     !!!parse-error (type => 'unclosed SYSTEM literal');
2282    
2283     $self->{state} = DATA_STATE;
2284 wakaba 1.5 $self->{s_kwd} = '';
2285 wakaba 1.1 ## reconsume
2286    
2287     $self->{ct}->{quirks} = 1;
2288     !!!emit ($self->{ct}); # DOCTYPE
2289    
2290     redo A;
2291     } else {
2292     !!!cp (210);
2293     $self->{ct}->{sysid} # DOCTYPE
2294     .= chr $self->{nc};
2295     $self->{read_until}->($self->{ct}->{sysid}, q[">],
2296     length $self->{ct}->{sysid});
2297    
2298     ## Stay in the state
2299     !!!next-input-character;
2300     redo A;
2301     }
2302     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
2303     if ($self->{nc} == 0x0027) { # '
2304     !!!cp (211);
2305     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2306     !!!next-input-character;
2307     redo A;
2308     } elsif ($self->{nc} == 0x003E) { # >
2309     !!!cp (212);
2310     !!!parse-error (type => 'unclosed SYSTEM literal');
2311    
2312     $self->{state} = DATA_STATE;
2313 wakaba 1.5 $self->{s_kwd} = '';
2314 wakaba 1.1 !!!next-input-character;
2315    
2316     $self->{ct}->{quirks} = 1;
2317     !!!emit ($self->{ct}); # DOCTYPE
2318    
2319     redo A;
2320     } elsif ($self->{nc} == -1) {
2321     !!!cp (213);
2322     !!!parse-error (type => 'unclosed SYSTEM literal');
2323    
2324     $self->{state} = DATA_STATE;
2325 wakaba 1.5 $self->{s_kwd} = '';
2326 wakaba 1.1 ## reconsume
2327    
2328     $self->{ct}->{quirks} = 1;
2329     !!!emit ($self->{ct}); # DOCTYPE
2330    
2331     redo A;
2332     } else {
2333     !!!cp (214);
2334     $self->{ct}->{sysid} # DOCTYPE
2335     .= chr $self->{nc};
2336     $self->{read_until}->($self->{ct}->{sysid}, q['>],
2337     length $self->{ct}->{sysid});
2338    
2339     ## Stay in the state
2340     !!!next-input-character;
2341     redo A;
2342     }
2343     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
2344     if ($is_space->{$self->{nc}}) {
2345     !!!cp (215);
2346     ## Stay in the state
2347     !!!next-input-character;
2348     redo A;
2349     } elsif ($self->{nc} == 0x003E) { # >
2350     !!!cp (216);
2351     $self->{state} = DATA_STATE;
2352 wakaba 1.5 $self->{s_kwd} = '';
2353 wakaba 1.1 !!!next-input-character;
2354    
2355     !!!emit ($self->{ct}); # DOCTYPE
2356    
2357     redo A;
2358     } elsif ($self->{nc} == -1) {
2359     !!!cp (217);
2360     !!!parse-error (type => 'unclosed DOCTYPE');
2361     $self->{state} = DATA_STATE;
2362 wakaba 1.5 $self->{s_kwd} = '';
2363 wakaba 1.1 ## reconsume
2364    
2365     $self->{ct}->{quirks} = 1;
2366     !!!emit ($self->{ct}); # DOCTYPE
2367    
2368     redo A;
2369     } else {
2370     !!!cp (218);
2371     !!!parse-error (type => 'string after SYSTEM literal');
2372     #$self->{ct}->{quirks} = 1;
2373    
2374     $self->{state} = BOGUS_DOCTYPE_STATE;
2375     !!!next-input-character;
2376     redo A;
2377     }
2378     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
2379     if ($self->{nc} == 0x003E) { # >
2380     !!!cp (219);
2381     $self->{state} = DATA_STATE;
2382 wakaba 1.5 $self->{s_kwd} = '';
2383 wakaba 1.1 !!!next-input-character;
2384    
2385     !!!emit ($self->{ct}); # DOCTYPE
2386    
2387     redo A;
2388     } elsif ($self->{nc} == -1) {
2389     !!!cp (220);
2390     $self->{state} = DATA_STATE;
2391 wakaba 1.5 $self->{s_kwd} = '';
2392 wakaba 1.1 ## reconsume
2393    
2394     !!!emit ($self->{ct}); # DOCTYPE
2395    
2396     redo A;
2397     } else {
2398     !!!cp (221);
2399     my $s = '';
2400     $self->{read_until}->($s, q[>], 0);
2401    
2402     ## Stay in the state
2403     !!!next-input-character;
2404     redo A;
2405     }
2406     } elsif ($self->{state} == CDATA_SECTION_STATE) {
2407     ## NOTE: "CDATA section state" in the spec is jointly implemented
2408     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
2409     ## and |CDATA_SECTION_MSE2_STATE|.
2410 wakaba 1.10
2411     ## XML5: "CDATA state".
2412 wakaba 1.1
2413     if ($self->{nc} == 0x005D) { # ]
2414     !!!cp (221.1);
2415     $self->{state} = CDATA_SECTION_MSE1_STATE;
2416     !!!next-input-character;
2417     redo A;
2418     } elsif ($self->{nc} == -1) {
2419 wakaba 1.6 if ($self->{is_xml}) {
2420 wakaba 1.8 !!!cp (221.11);
2421 wakaba 1.6 !!!parse-error (type => 'no mse'); ## TODO: type
2422 wakaba 1.8 } else {
2423     !!!cp (221.12);
2424 wakaba 1.6 }
2425    
2426 wakaba 1.1 $self->{state} = DATA_STATE;
2427 wakaba 1.5 $self->{s_kwd} = '';
2428 wakaba 1.10 ## Reconsume.
2429 wakaba 1.1 if (length $self->{ct}->{data}) { # character
2430     !!!cp (221.2);
2431     !!!emit ($self->{ct}); # character
2432     } else {
2433     !!!cp (221.3);
2434     ## No token to emit. $self->{ct} is discarded.
2435     }
2436     redo A;
2437     } else {
2438     !!!cp (221.4);
2439     $self->{ct}->{data} .= chr $self->{nc};
2440     $self->{read_until}->($self->{ct}->{data},
2441     q<]>,
2442     length $self->{ct}->{data});
2443    
2444     ## Stay in the state.
2445     !!!next-input-character;
2446     redo A;
2447     }
2448    
2449     ## ISSUE: "text tokens" in spec.
2450     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
2451 wakaba 1.10 ## XML5: "CDATA bracket state".
2452    
2453 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
2454     !!!cp (221.5);
2455     $self->{state} = CDATA_SECTION_MSE2_STATE;
2456     !!!next-input-character;
2457     redo A;
2458     } else {
2459     !!!cp (221.6);
2460 wakaba 1.10 ## XML5: At EOF, "]" is not appended and the state is changed to the data state.
2461 wakaba 1.1 $self->{ct}->{data} .= ']';
2462 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
2463 wakaba 1.1 ## Reconsume.
2464     redo A;
2465     }
2466     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
2467 wakaba 1.10 ## XML5: "CDATA end state".
2468    
2469 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
2470     $self->{state} = DATA_STATE;
2471 wakaba 1.5 $self->{s_kwd} = '';
2472 wakaba 1.1 !!!next-input-character;
2473     if (length $self->{ct}->{data}) { # character
2474     !!!cp (221.7);
2475     !!!emit ($self->{ct}); # character
2476     } else {
2477     !!!cp (221.8);
2478     ## No token to emit. $self->{ct} is discarded.
2479     }
2480     redo A;
2481     } elsif ($self->{nc} == 0x005D) { # ]
2482     !!!cp (221.9); # character
2483     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
2484     ## Stay in the state.
2485     !!!next-input-character;
2486     redo A;
2487     } else {
2488     !!!cp (221.11);
2489     $self->{ct}->{data} .= ']]'; # character
2490     $self->{state} = CDATA_SECTION_STATE;
2491 wakaba 1.10 ## Reconsume. ## XML5: Emit.
2492 wakaba 1.1 redo A;
2493     }
2494     } elsif ($self->{state} == ENTITY_STATE) {
2495     if ($is_space->{$self->{nc}} or
2496     {
2497     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
2498     $self->{entity_add} => 1,
2499     }->{$self->{nc}}) {
2500     !!!cp (1001);
2501     ## Don't consume
2502     ## No error
2503     ## Return nothing.
2504     #
2505     } elsif ($self->{nc} == 0x0023) { # #
2506     !!!cp (999);
2507     $self->{state} = ENTITY_HASH_STATE;
2508     $self->{s_kwd} = '#';
2509     !!!next-input-character;
2510     redo A;
2511     } elsif ((0x0041 <= $self->{nc} and
2512     $self->{nc} <= 0x005A) or # A..Z
2513     (0x0061 <= $self->{nc} and
2514     $self->{nc} <= 0x007A)) { # a..z
2515     !!!cp (998);
2516     require Whatpm::_NamedEntityList;
2517     $self->{state} = ENTITY_NAME_STATE;
2518     $self->{s_kwd} = chr $self->{nc};
2519     $self->{entity__value} = $self->{s_kwd};
2520     $self->{entity__match} = 0;
2521     !!!next-input-character;
2522     redo A;
2523     } else {
2524     !!!cp (1027);
2525     !!!parse-error (type => 'bare ero');
2526     ## Return nothing.
2527     #
2528     }
2529    
2530     ## NOTE: No character is consumed by the "consume a character
2531     ## reference" algorithm. In other words, there is an "&" character
2532     ## that does not introduce a character reference, which would be
2533     ## appended to the parent element or the attribute value in the
2534     ## later processing of the tokenizer.
2535    
2536     if ($self->{prev_state} == DATA_STATE) {
2537     !!!cp (997);
2538     $self->{state} = $self->{prev_state};
2539 wakaba 1.5 $self->{s_kwd} = '';
2540 wakaba 1.1 ## Reconsume.
2541     !!!emit ({type => CHARACTER_TOKEN, data => '&',
2542     line => $self->{line_prev},
2543     column => $self->{column_prev},
2544     });
2545     redo A;
2546     } else {
2547     !!!cp (996);
2548     $self->{ca}->{value} .= '&';
2549     $self->{state} = $self->{prev_state};
2550 wakaba 1.5 $self->{s_kwd} = '';
2551 wakaba 1.1 ## Reconsume.
2552     redo A;
2553     }
2554     } elsif ($self->{state} == ENTITY_HASH_STATE) {
2555     if ($self->{nc} == 0x0078 or # x
2556     $self->{nc} == 0x0058) { # X
2557     !!!cp (995);
2558     $self->{state} = HEXREF_X_STATE;
2559     $self->{s_kwd} .= chr $self->{nc};
2560     !!!next-input-character;
2561     redo A;
2562     } elsif (0x0030 <= $self->{nc} and
2563     $self->{nc} <= 0x0039) { # 0..9
2564     !!!cp (994);
2565     $self->{state} = NCR_NUM_STATE;
2566     $self->{s_kwd} = $self->{nc} - 0x0030;
2567     !!!next-input-character;
2568     redo A;
2569     } else {
2570     !!!parse-error (type => 'bare nero',
2571     line => $self->{line_prev},
2572     column => $self->{column_prev} - 1);
2573    
2574     ## NOTE: According to the spec algorithm, nothing is returned,
2575     ## and then "&#" is appended to the parent element or the attribute
2576     ## value in the later processing.
2577    
2578     if ($self->{prev_state} == DATA_STATE) {
2579     !!!cp (1019);
2580     $self->{state} = $self->{prev_state};
2581 wakaba 1.5 $self->{s_kwd} = '';
2582 wakaba 1.1 ## Reconsume.
2583     !!!emit ({type => CHARACTER_TOKEN,
2584     data => '&#',
2585     line => $self->{line_prev},
2586     column => $self->{column_prev} - 1,
2587     });
2588     redo A;
2589     } else {
2590     !!!cp (993);
2591     $self->{ca}->{value} .= '&#';
2592     $self->{state} = $self->{prev_state};
2593 wakaba 1.5 $self->{s_kwd} = '';
2594 wakaba 1.1 ## Reconsume.
2595     redo A;
2596     }
2597     }
2598     } elsif ($self->{state} == NCR_NUM_STATE) {
2599     if (0x0030 <= $self->{nc} and
2600     $self->{nc} <= 0x0039) { # 0..9
2601     !!!cp (1012);
2602     $self->{s_kwd} *= 10;
2603     $self->{s_kwd} += $self->{nc} - 0x0030;
2604    
2605     ## Stay in the state.
2606     !!!next-input-character;
2607     redo A;
2608     } elsif ($self->{nc} == 0x003B) { # ;
2609     !!!cp (1013);
2610     !!!next-input-character;
2611     #
2612     } else {
2613     !!!cp (1014);
2614     !!!parse-error (type => 'no refc');
2615     ## Reconsume.
2616     #
2617     }
2618    
2619     my $code = $self->{s_kwd};
2620     my $l = $self->{line_prev};
2621     my $c = $self->{column_prev};
2622     if ($charref_map->{$code}) {
2623     !!!cp (1015);
2624     !!!parse-error (type => 'invalid character reference',
2625     text => (sprintf 'U+%04X', $code),
2626     line => $l, column => $c);
2627     $code = $charref_map->{$code};
2628     } elsif ($code > 0x10FFFF) {
2629     !!!cp (1016);
2630     !!!parse-error (type => 'invalid character reference',
2631     text => (sprintf 'U-%08X', $code),
2632     line => $l, column => $c);
2633     $code = 0xFFFD;
2634     }
2635    
2636     if ($self->{prev_state} == DATA_STATE) {
2637     !!!cp (992);
2638     $self->{state} = $self->{prev_state};
2639 wakaba 1.5 $self->{s_kwd} = '';
2640 wakaba 1.1 ## Reconsume.
2641     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2642 wakaba 1.7 has_reference => 1,
2643 wakaba 1.1 line => $l, column => $c,
2644     });
2645     redo A;
2646     } else {
2647     !!!cp (991);
2648     $self->{ca}->{value} .= chr $code;
2649     $self->{ca}->{has_reference} = 1;
2650     $self->{state} = $self->{prev_state};
2651 wakaba 1.5 $self->{s_kwd} = '';
2652 wakaba 1.1 ## Reconsume.
2653     redo A;
2654     }
2655     } elsif ($self->{state} == HEXREF_X_STATE) {
2656     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
2657     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
2658     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
2659     # 0..9, A..F, a..f
2660     !!!cp (990);
2661     $self->{state} = HEXREF_HEX_STATE;
2662     $self->{s_kwd} = 0;
2663     ## Reconsume.
2664     redo A;
2665     } else {
2666     !!!parse-error (type => 'bare hcro',
2667     line => $self->{line_prev},
2668     column => $self->{column_prev} - 2);
2669    
2670     ## NOTE: According to the spec algorithm, nothing is returned,
2671     ## and then "&#" followed by "X" or "x" is appended to the parent
2672     ## element or the attribute value in the later processing.
2673    
2674     if ($self->{prev_state} == DATA_STATE) {
2675     !!!cp (1005);
2676     $self->{state} = $self->{prev_state};
2677 wakaba 1.5 $self->{s_kwd} = '';
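2678 wakaba 1.1 ## Reconsume.  FIXME: |s_kwd| was cleared just above, so the token emitted below carries only "&" (not "&#x"/"&#X") and its column offset is 0; the attribute-value branch appends before clearing.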
2679     !!!emit ({type => CHARACTER_TOKEN,
2680     data => '&' . $self->{s_kwd},
2681     line => $self->{line_prev},
2682     column => $self->{column_prev} - length $self->{s_kwd},
2683     });
2684     redo A;
2685     } else {
2686     !!!cp (989);
2687     $self->{ca}->{value} .= '&' . $self->{s_kwd};
2688     $self->{state} = $self->{prev_state};
2689 wakaba 1.5 $self->{s_kwd} = '';
2690 wakaba 1.1 ## Reconsume.
2691     redo A;
2692     }
2693     }
2694     } elsif ($self->{state} == HEXREF_HEX_STATE) {
2695     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
2696     # 0..9
2697     !!!cp (1002);
2698     $self->{s_kwd} *= 0x10;
2699     $self->{s_kwd} += $self->{nc} - 0x0030;
2700     ## Stay in the state.
2701     !!!next-input-character;
2702     redo A;
2703     } elsif (0x0061 <= $self->{nc} and
2704     $self->{nc} <= 0x0066) { # a..f
2705     !!!cp (1003);
2706     $self->{s_kwd} *= 0x10;
2707     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
2708     ## Stay in the state.
2709     !!!next-input-character;
2710     redo A;
2711     } elsif (0x0041 <= $self->{nc} and
2712     $self->{nc} <= 0x0046) { # A..F
2713     !!!cp (1004);
2714     $self->{s_kwd} *= 0x10;
2715     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
2716     ## Stay in the state.
2717     !!!next-input-character;
2718     redo A;
2719     } elsif ($self->{nc} == 0x003B) { # ;
2720     !!!cp (1006);
2721     !!!next-input-character;
2722     #
2723     } else {
2724     !!!cp (1007);
2725     !!!parse-error (type => 'no refc',
2726     line => $self->{line},
2727     column => $self->{column});
2728     ## Reconsume.
2729     #
2730     }
2731    
2732     my $code = $self->{s_kwd};
2733     my $l = $self->{line_prev};
2734     my $c = $self->{column_prev};
2735     if ($charref_map->{$code}) {
2736     !!!cp (1008);
2737     !!!parse-error (type => 'invalid character reference',
2738     text => (sprintf 'U+%04X', $code),
2739     line => $l, column => $c);
2740     $code = $charref_map->{$code};
2741     } elsif ($code > 0x10FFFF) {
2742     !!!cp (1009);
2743     !!!parse-error (type => 'invalid character reference',
2744     text => (sprintf 'U-%08X', $code),
2745     line => $l, column => $c);
2746     $code = 0xFFFD;
2747     }
2748    
2749     if ($self->{prev_state} == DATA_STATE) {
2750     !!!cp (988);
2751     $self->{state} = $self->{prev_state};
2752 wakaba 1.5 $self->{s_kwd} = '';
2753 wakaba 1.1 ## Reconsume.
2754     !!!emit ({type => CHARACTER_TOKEN, data => chr $code,
2755 wakaba 1.7 has_reference => 1,
2756 wakaba 1.1 line => $l, column => $c,
2757     });
2758     redo A;
2759     } else {
2760     !!!cp (987);
2761     $self->{ca}->{value} .= chr $code;
2762     $self->{ca}->{has_reference} = 1;
2763     $self->{state} = $self->{prev_state};
2764 wakaba 1.5 $self->{s_kwd} = '';
2765 wakaba 1.1 ## Reconsume.
2766     redo A;
2767     }
2768     } elsif ($self->{state} == ENTITY_NAME_STATE) {
2769     if (length $self->{s_kwd} < 30 and
2770     ## NOTE: Some number greater than the maximum length of entity name
2771     ((0x0041 <= $self->{nc} and # A
2772     $self->{nc} <= 0x005A) or # Z
2773     (0x0061 <= $self->{nc} and # a
2774     $self->{nc} <= 0x007A) or # z
2775     (0x0030 <= $self->{nc} and # 0
2776     $self->{nc} <= 0x0039) or # 9
2777     $self->{nc} == 0x003B)) { # ;
2778     our $EntityChar;
2779     $self->{s_kwd} .= chr $self->{nc};
2780     if (defined $EntityChar->{$self->{s_kwd}}) {
2781     if ($self->{nc} == 0x003B) { # ;
2782     !!!cp (1020);
2783     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2784     $self->{entity__match} = 1;
2785     !!!next-input-character;
2786     #
2787     } else {
2788     !!!cp (1021);
2789     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
2790     $self->{entity__match} = -1;
2791     ## Stay in the state.
2792     !!!next-input-character;
2793     redo A;
2794     }
2795     } else {
2796     !!!cp (1022);
2797     $self->{entity__value} .= chr $self->{nc};
2798     $self->{entity__match} *= 2;
2799     ## Stay in the state.
2800     !!!next-input-character;
2801     redo A;
2802     }
2803     }
2804    
2805     my $data;
2806     my $has_ref;
2807     if ($self->{entity__match} > 0) {
2808     !!!cp (1023);
2809     $data = $self->{entity__value};
2810     $has_ref = 1;
2811     #
2812     } elsif ($self->{entity__match} < 0) {
2813     !!!parse-error (type => 'no refc');
2814     if ($self->{prev_state} != DATA_STATE and # in attribute
2815     $self->{entity__match} < -1) {
2816     !!!cp (1024);
2817     $data = '&' . $self->{s_kwd};
2818     #
2819     } else {
2820     !!!cp (1025);
2821     $data = $self->{entity__value};
2822     $has_ref = 1;
2823     #
2824     }
2825     } else {
2826     !!!cp (1026);
2827     !!!parse-error (type => 'bare ero',
2828     line => $self->{line_prev},
2829     column => $self->{column_prev} - length $self->{s_kwd});
2830     $data = '&' . $self->{s_kwd};
2831     #
2832     }
2833    
2834     ## NOTE: In these cases, when a character reference is found,
2835     ## it is consumed and a character token is returned, or, otherwise,
2836     ## nothing is consumed and returned, according to the spec algorithm.
2837     ## In this implementation, anything that has been examined by the
2838     ## tokenizer is appended to the parent element or the attribute value
2839     ## as string, either literal string when no character reference or
2840     ## entity-replaced string otherwise, in this stage, since any characters
2841     ## that would not be consumed are appended in the data state or in an
2842     ## appropriate attribute value state anyway.
2843    
2844     if ($self->{prev_state} == DATA_STATE) {
2845     !!!cp (986);
2846     $self->{state} = $self->{prev_state};
2847 wakaba 1.5 $self->{s_kwd} = '';
2848 wakaba 1.1 ## Reconsume.
2849     !!!emit ({type => CHARACTER_TOKEN,
2850     data => $data,
2851 wakaba 1.7 has_reference => $has_ref,
2852 wakaba 1.1 line => $self->{line_prev},
2853     column => $self->{column_prev} + 1 - length $self->{s_kwd},
2854     });
2855     redo A;
2856     } else {
2857     !!!cp (985);
2858     $self->{ca}->{value} .= $data;
2859     $self->{ca}->{has_reference} = 1 if $has_ref;
2860     $self->{state} = $self->{prev_state};
2861 wakaba 1.5 $self->{s_kwd} = '';
2862 wakaba 1.1 ## Reconsume.
2863     redo A;
2864     }
2865 wakaba 1.8
2866     ## XML-only states
2867    
2868     } elsif ($self->{state} == PI_STATE) {
2869     if ($is_space->{$self->{nc}} or
2870     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
2871     $self->{nc} == -1) {
2872     !!!parse-error (type => 'bare pio', ## TODO: type
2873     line => $self->{line_prev},
2874     column => $self->{column_prev}
2875     - 1 * ($self->{nc} != -1));
2876     $self->{state} = BOGUS_COMMENT_STATE;
2877     ## Reconsume.
2878     $self->{ct} = {type => COMMENT_TOKEN,
2879     data => '?',
2880     line => $self->{line_prev},
2881     column => $self->{column_prev}
2882     - 1 * ($self->{nc} != -1),
2883     };
2884     redo A;
2885     } else {
2886     $self->{ct} = {type => PI_TOKEN,
2887     target => chr $self->{nc},
2888     data => '',
2889     line => $self->{line_prev},
2890     column => $self->{column_prev} - 1,
2891     };
2892     $self->{state} = PI_TARGET_STATE;
2893     !!!next-input-character;
2894     redo A;
2895     }
2896     } elsif ($self->{state} == PI_TARGET_STATE) {
2897     if ($is_space->{$self->{nc}}) {
2898     $self->{state} = PI_TARGET_AFTER_STATE;
2899     !!!next-input-character;
2900     redo A;
2901     } elsif ($self->{nc} == -1) {
2902     !!!parse-error (type => 'no pic'); ## TODO: type
2903     $self->{state} = DATA_STATE;
2904     $self->{s_kwd} = '';
2905     ## Reconsume.
2906     !!!emit ($self->{ct}); # pi
2907     redo A;
2908     } elsif ($self->{nc} == 0x003F) { # ?
2909     $self->{state} = PI_AFTER_STATE;
2910     !!!next-input-character;
2911     redo A;
2912     } else {
2913     ## XML5: typo ("tag name" -> "target")
2914     $self->{ct}->{target} .= chr $self->{nc}; # pi
2915     !!!next-input-character;
2916     redo A;
2917     }
2918     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
2919     if ($is_space->{$self->{nc}}) {
2920     ## Stay in the state.
2921     !!!next-input-character;
2922     redo A;
2923     } else {
2924     $self->{state} = PI_DATA_STATE;
2925     ## Reprocess.
2926     redo A;
2927     }
2928     } elsif ($self->{state} == PI_DATA_STATE) {
2929     if ($self->{nc} == 0x003F) { # ?
2930     $self->{state} = PI_DATA_AFTER_STATE;
2931     !!!next-input-character;
2932     redo A;
2933     } elsif ($self->{nc} == -1) {
2934     !!!parse-error (type => 'no pic'); ## TODO: type
2935     $self->{state} = DATA_STATE;
2936     $self->{s_kwd} = '';
2937     ## Reprocess.
2938     !!!emit ($self->{ct}); # pi
2939     redo A;
2940     } else {
2941     $self->{ct}->{data} .= chr $self->{nc}; # pi
2942     $self->{read_until}->($self->{ct}->{data}, q[?],
2943     length $self->{ct}->{data});
2944     ## Stay in the state.
2945     !!!next-input-character;
2946     ## Reprocess.
2947     redo A;
2948     }
2949     } elsif ($self->{state} == PI_AFTER_STATE) {
2950     if ($self->{nc} == 0x003E) { # >
2951     $self->{state} = DATA_STATE;
2952     $self->{s_kwd} = '';
2953     !!!next-input-character;
2954     !!!emit ($self->{ct}); # pi
2955     redo A;
2956     } elsif ($self->{nc} == 0x003F) { # ?
2957     !!!parse-error (type => 'no s after target', ## TODO: type
2958     line => $self->{line_prev},
2959     column => $self->{column_prev}); ## XML5: no error
2960     $self->{ct}->{data} .= '?';
2961     $self->{state} = PI_DATA_AFTER_STATE;
2962     !!!next-input-character;
2963     redo A;
2964     } else {
2965     !!!parse-error (type => 'no s after target', ## TODO: type
2966     line => $self->{line_prev},
2967     column => $self->{column_prev}
2968     + 1 * ($self->{nc} == -1)); ## XML5: no error
2969     $self->{ct}->{data} .= '?'; ## XML5: not appended
2970     $self->{state} = PI_DATA_STATE;
2971     ## Reprocess.
2972     redo A;
2973     }
2974     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
2975     ## XML5: Same as "pi after state" in XML5
2976     if ($self->{nc} == 0x003E) { # >
2977     $self->{state} = DATA_STATE;
2978     $self->{s_kwd} = '';
2979     !!!next-input-character;
2980     !!!emit ($self->{ct}); # pi
2981     redo A;
2982     } elsif ($self->{nc} == 0x003F) { # ?
2983     $self->{ct}->{data} .= '?';
2984     ## Stay in the state.
2985     !!!next-input-character;
2986     redo A;
2987     } else {
2988     $self->{ct}->{data} .= '?'; ## XML5: not appended
2989     $self->{state} = PI_DATA_STATE;
2990     ## Reprocess.
2991     redo A;
2992     }
2993    
2994 wakaba 1.1 } else {
2995     die "$0: $self->{state}: Unknown state";
2996     }
2997     } # A
2998    
2999     die "$0: _get_next_token: unexpected case";
3000     } # _get_next_token
3001    
3002     1;
3003 wakaba 1.10 ## $Date: 2008/10/15 08:05:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24