/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (hide annotations) (download)
Wed Oct 15 04:38:22 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.7: +283 -15 lines
++ whatpm/t/ChangeLog	15 Oct 2008 04:37:36 -0000
	* XML-Parser.t: "xml/pis-1.dat" and "xml/xmldecls-1.dat" added.
	Test directifes "#xml-version", "#xml-encoding", and
	"#xml-standalone" are added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 04:37:54 -0000
	* pis-1.dat, xmldecls-1.dat: New test data files.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	15 Oct 2008 04:33:34 -0000
2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (create_processing_instruction): New method.
	(xml_version, xml_encoding, xml_standalone): New attributes.
	(ProcessingInstruction): New class.

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 04:34:03 -0000
	* Tokenizer.pm.src: Support for XML processing instructions.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 04:34:57 -0000
	* Parser.pm.src: Support for XML declarations.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.8 our $VERSION=do{my @r=(q$Revision: 1.7 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
132    
133     my $charref_map = {
134     0x0D => 0x000A,
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if state machine in the spec
178    
179     sub _initialize_tokenizer ($) {
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
202    
203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204     $self->{line_prev} = $self->{line};
205     $self->{column_prev} = $self->{column};
206     $self->{column}++;
207     $self->{nc}
208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209     } else {
210     $self->{set_nc}->($self);
211     }
212    
213     $self->{token} = [];
214     # $self->{escape}
215     } # _initialize_tokenizer
216    
217     ## A token has:
218     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
220     ## ->{name} (DOCTYPE_TOKEN)
221     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222     ## ->{pubid} (DOCTYPE_TOKEN)
223     ## ->{sysid} (DOCTYPE_TOKEN)
224     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
225     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
226     ## ->{name}
227     ## ->{value}
228     ## ->{has_reference} == 1 or 0
229     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
232     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
233     ## while the token is pushed back to the stack.
234    
235     ## Emitted token MUST immediately be handled by the tree construction state.
236    
237     ## Before each step, UA MAY check to see if either one of the scripts in
238     ## "list of scripts that will execute as soon as possible" or the first
239     ## script in the "list of scripts that will execute asynchronously",
240     ## has completed loading. If one has, then it MUST be executed
241     ## and removed from the list.
242    
243     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
244     ## (This requirement was dropped from HTML5 spec, unfortunately.)
245    
246     my $is_space = {
247     0x0009 => 1, # CHARACTER TABULATION (HT)
248     0x000A => 1, # LINE FEED (LF)
249     #0x000B => 0, # LINE TABULATION (VT)
250     0x000C => 1, # FORM FEED (FF)
251     #0x000D => 1, # CARRIAGE RETURN (CR)
252     0x0020 => 1, # SPACE (SP)
253     };
254    
255     sub _get_next_token ($) {
256     my $self = shift;
257    
258     if ($self->{self_closing}) {
259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
260     ## NOTE: The |self_closing| flag is only set by start tag token.
261     ## In addition, when a start tag token is emitted, it is always set to
262     ## |ct|.
263     delete $self->{self_closing};
264     }
265    
266     if (@{$self->{token}}) {
267     $self->{self_closing} = $self->{token}->[0]->{self_closing};
268     return shift @{$self->{token}};
269     }
270    
271     A: {
272     if ($self->{state} == PCDATA_STATE) {
273     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
274    
275     if ($self->{nc} == 0x0026) { # &
276    
277     ## NOTE: In the spec, the tokenizer is switched to the
278     ## "entity data state". In this implementation, the tokenizer
279     ## is switched to the |ENTITY_STATE|, which is an implementation
280     ## of the "consume a character reference" algorithm.
281     $self->{entity_add} = -1;
282     $self->{prev_state} = DATA_STATE;
283     $self->{state} = ENTITY_STATE;
284    
285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
286     $self->{line_prev} = $self->{line};
287     $self->{column_prev} = $self->{column};
288     $self->{column}++;
289     $self->{nc}
290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
291     } else {
292     $self->{set_nc}->($self);
293     }
294    
295     redo A;
296     } elsif ($self->{nc} == 0x003C) { # <
297    
298     $self->{state} = TAG_OPEN_STATE;
299    
300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
301     $self->{line_prev} = $self->{line};
302     $self->{column_prev} = $self->{column};
303     $self->{column}++;
304     $self->{nc}
305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
306     } else {
307     $self->{set_nc}->($self);
308     }
309    
310     redo A;
311     } elsif ($self->{nc} == -1) {
312    
313     return ({type => END_OF_FILE_TOKEN,
314     line => $self->{line}, column => $self->{column}});
315     last A; ## TODO: ok?
316     } else {
317    
318     #
319     }
320    
321     # Anything else
322     my $token = {type => CHARACTER_TOKEN,
323     data => chr $self->{nc},
324     line => $self->{line}, column => $self->{column},
325     };
326     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
327    
328     ## Stay in the state.
329    
330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
331     $self->{line_prev} = $self->{line};
332     $self->{column_prev} = $self->{column};
333     $self->{column}++;
334     $self->{nc}
335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
336     } else {
337     $self->{set_nc}->($self);
338     }
339    
340     return ($token);
341     redo A;
342     } elsif ($self->{state} == DATA_STATE) {
343     $self->{s_kwd} = '' unless defined $self->{s_kwd};
344     if ($self->{nc} == 0x0026) { # &
345     $self->{s_kwd} = '';
346     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
347     not $self->{escape}) {
348    
349     ## NOTE: In the spec, the tokenizer is switched to the
350     ## "entity data state". In this implementation, the tokenizer
351     ## is switched to the |ENTITY_STATE|, which is an implementation
352     ## of the "consume a character reference" algorithm.
353     $self->{entity_add} = -1;
354     $self->{prev_state} = DATA_STATE;
355     $self->{state} = ENTITY_STATE;
356    
357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
358     $self->{line_prev} = $self->{line};
359     $self->{column_prev} = $self->{column};
360     $self->{column}++;
361     $self->{nc}
362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
363     } else {
364     $self->{set_nc}->($self);
365     }
366    
367     redo A;
368     } else {
369    
370     #
371     }
372     } elsif ($self->{nc} == 0x002D) { # -
373     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
375 wakaba 1.1
376     $self->{escape} = 1; # unless $self->{escape};
377     $self->{s_kwd} = '--';
378     #
379 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
380 wakaba 1.1
381     $self->{s_kwd} = '--';
382     #
383 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384    
385     $self->{s_kwd} .= '-';
386     #
387 wakaba 1.1 } else {
388    
389 wakaba 1.5 $self->{s_kwd} = '-';
390 wakaba 1.1 #
391     }
392     }
393    
394     #
395     } elsif ($self->{nc} == 0x0021) { # !
396     if (length $self->{s_kwd}) {
397    
398     $self->{s_kwd} .= '!';
399     #
400     } else {
401    
402     #$self->{s_kwd} = '';
403     #
404     }
405     #
406     } elsif ($self->{nc} == 0x003C) { # <
407     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
408     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
409     not $self->{escape})) {
410    
411     $self->{state} = TAG_OPEN_STATE;
412    
413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
414     $self->{line_prev} = $self->{line};
415     $self->{column_prev} = $self->{column};
416     $self->{column}++;
417     $self->{nc}
418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
419     } else {
420     $self->{set_nc}->($self);
421     }
422    
423     redo A;
424     } else {
425    
426     $self->{s_kwd} = '';
427     #
428     }
429     } elsif ($self->{nc} == 0x003E) { # >
430     if ($self->{escape} and
431     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
432     if ($self->{s_kwd} eq '--') {
433    
434     delete $self->{escape};
435 wakaba 1.5 #
436 wakaba 1.1 } else {
437    
438 wakaba 1.5 #
439 wakaba 1.1 }
440 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441    
442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443     line => $self->{line_prev},
444     column => $self->{column_prev} - 1);
445     #
446 wakaba 1.1 } else {
447    
448 wakaba 1.5 #
449 wakaba 1.1 }
450    
451     $self->{s_kwd} = '';
452     #
453 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
454     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455    
456     $self->{s_kwd} .= ']';
457     } elsif ($self->{s_kwd} eq ']]') {
458    
459     #
460     } else {
461    
462     $self->{s_kwd} = '';
463     }
464     #
465 wakaba 1.1 } elsif ($self->{nc} == -1) {
466    
467     $self->{s_kwd} = '';
468     return ({type => END_OF_FILE_TOKEN,
469     line => $self->{line}, column => $self->{column}});
470     last A; ## TODO: ok?
471     } else {
472    
473     $self->{s_kwd} = '';
474     #
475     }
476    
477     # Anything else
478     my $token = {type => CHARACTER_TOKEN,
479     data => chr $self->{nc},
480     line => $self->{line}, column => $self->{column},
481     };
482 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483 wakaba 1.1 length $token->{data})) {
484     $self->{s_kwd} = '';
485     }
486    
487     ## Stay in the data state.
488 wakaba 1.5 if (not $self->{is_xml} and
489     $self->{content_model} == PCDATA_CONTENT_MODEL) {
490 wakaba 1.1
491     $self->{state} = PCDATA_STATE;
492     } else {
493    
494     ## Stay in the state.
495     }
496    
497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
498     $self->{line_prev} = $self->{line};
499     $self->{column_prev} = $self->{column};
500     $self->{column}++;
501     $self->{nc}
502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
503     } else {
504     $self->{set_nc}->($self);
505     }
506    
507     return ($token);
508     redo A;
509     } elsif ($self->{state} == TAG_OPEN_STATE) {
510     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
511     if ($self->{nc} == 0x002F) { # /
512    
513    
514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
515     $self->{line_prev} = $self->{line};
516     $self->{column_prev} = $self->{column};
517     $self->{column}++;
518     $self->{nc}
519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
520     } else {
521     $self->{set_nc}->($self);
522     }
523    
524     $self->{state} = CLOSE_TAG_OPEN_STATE;
525     redo A;
526     } elsif ($self->{nc} == 0x0021) { # !
527    
528     $self->{s_kwd} = '<' unless $self->{escape};
529     #
530     } else {
531    
532     #
533     }
534    
535     ## reconsume
536     $self->{state} = DATA_STATE;
537 wakaba 1.5 $self->{s_kwd} = '';
538 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
539     line => $self->{line_prev},
540     column => $self->{column_prev},
541     });
542     redo A;
543     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
544     if ($self->{nc} == 0x0021) { # !
545    
546     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
547    
548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
549     $self->{line_prev} = $self->{line};
550     $self->{column_prev} = $self->{column};
551     $self->{column}++;
552     $self->{nc}
553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
554     } else {
555     $self->{set_nc}->($self);
556     }
557    
558     redo A;
559     } elsif ($self->{nc} == 0x002F) { # /
560    
561     $self->{state} = CLOSE_TAG_OPEN_STATE;
562    
563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
564     $self->{line_prev} = $self->{line};
565     $self->{column_prev} = $self->{column};
566     $self->{column}++;
567     $self->{nc}
568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
569     } else {
570     $self->{set_nc}->($self);
571     }
572    
573     redo A;
574     } elsif (0x0041 <= $self->{nc} and
575     $self->{nc} <= 0x005A) { # A..Z
576    
577     $self->{ct}
578     = {type => START_TAG_TOKEN,
579 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
580 wakaba 1.1 line => $self->{line_prev},
581     column => $self->{column_prev}};
582     $self->{state} = TAG_NAME_STATE;
583    
584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
585     $self->{line_prev} = $self->{line};
586     $self->{column_prev} = $self->{column};
587     $self->{column}++;
588     $self->{nc}
589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
590     } else {
591     $self->{set_nc}->($self);
592     }
593    
594     redo A;
595     } elsif (0x0061 <= $self->{nc} and
596     $self->{nc} <= 0x007A) { # a..z
597    
598     $self->{ct} = {type => START_TAG_TOKEN,
599     tag_name => chr ($self->{nc}),
600     line => $self->{line_prev},
601     column => $self->{column_prev}};
602     $self->{state} = TAG_NAME_STATE;
603    
604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
605     $self->{line_prev} = $self->{line};
606     $self->{column_prev} = $self->{column};
607     $self->{column}++;
608     $self->{nc}
609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
610     } else {
611     $self->{set_nc}->($self);
612     }
613    
614     redo A;
615     } elsif ($self->{nc} == 0x003E) { # >
616    
617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
618     line => $self->{line_prev},
619     column => $self->{column_prev});
620     $self->{state} = DATA_STATE;
621 wakaba 1.5 $self->{s_kwd} = '';
622 wakaba 1.1
623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
624     $self->{line_prev} = $self->{line};
625     $self->{column_prev} = $self->{column};
626     $self->{column}++;
627     $self->{nc}
628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
629     } else {
630     $self->{set_nc}->($self);
631     }
632    
633    
634     return ({type => CHARACTER_TOKEN, data => '<>',
635     line => $self->{line_prev},
636     column => $self->{column_prev},
637     });
638    
639     redo A;
640     } elsif ($self->{nc} == 0x003F) { # ?
641 wakaba 1.8 if ($self->{is_xml}) {
642    
643     $self->{state} = PI_STATE;
644    
645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646     $self->{line_prev} = $self->{line};
647     $self->{column_prev} = $self->{column};
648     $self->{column}++;
649     $self->{nc}
650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651     } else {
652     $self->{set_nc}->($self);
653     }
654    
655     redo A;
656     } else {
657    
658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
659     line => $self->{line_prev},
660     column => $self->{column_prev});
661     $self->{state} = BOGUS_COMMENT_STATE;
662     $self->{ct} = {type => COMMENT_TOKEN, data => '',
663     line => $self->{line_prev},
664     column => $self->{column_prev},
665     };
666     ## $self->{nc} is intentionally left as is
667     redo A;
668     }
669 wakaba 1.1 } else {
670    
671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
672     line => $self->{line_prev},
673     column => $self->{column_prev});
674     $self->{state} = DATA_STATE;
675 wakaba 1.5 $self->{s_kwd} = '';
676 wakaba 1.1 ## reconsume
677    
678     return ({type => CHARACTER_TOKEN, data => '<',
679     line => $self->{line_prev},
680     column => $self->{column_prev},
681     });
682    
683     redo A;
684     }
685     } else {
686     die "$0: $self->{content_model} in tag open";
687     }
688     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
689     ## NOTE: The "close tag open state" in the spec is implemented as
690     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
691    
692     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
693     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
694     if (defined $self->{last_stag_name}) {
695     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
696     $self->{s_kwd} = '';
697     ## Reconsume.
698     redo A;
699     } else {
700     ## No start tag token has ever been emitted
701     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
702    
703     $self->{state} = DATA_STATE;
704 wakaba 1.5 $self->{s_kwd} = '';
705 wakaba 1.1 ## Reconsume.
706     return ({type => CHARACTER_TOKEN, data => '</',
707     line => $l, column => $c,
708     });
709     redo A;
710     }
711     }
712    
713     if (0x0041 <= $self->{nc} and
714     $self->{nc} <= 0x005A) { # A..Z
715    
716     $self->{ct}
717     = {type => END_TAG_TOKEN,
718 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
719 wakaba 1.1 line => $l, column => $c};
720     $self->{state} = TAG_NAME_STATE;
721    
722     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
723     $self->{line_prev} = $self->{line};
724     $self->{column_prev} = $self->{column};
725     $self->{column}++;
726     $self->{nc}
727     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
728     } else {
729     $self->{set_nc}->($self);
730     }
731    
732     redo A;
733     } elsif (0x0061 <= $self->{nc} and
734     $self->{nc} <= 0x007A) { # a..z
735    
736     $self->{ct} = {type => END_TAG_TOKEN,
737     tag_name => chr ($self->{nc}),
738     line => $l, column => $c};
739     $self->{state} = TAG_NAME_STATE;
740    
741     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
742     $self->{line_prev} = $self->{line};
743     $self->{column_prev} = $self->{column};
744     $self->{column}++;
745     $self->{nc}
746     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
747     } else {
748     $self->{set_nc}->($self);
749     }
750    
751     redo A;
752     } elsif ($self->{nc} == 0x003E) { # >
753    
754     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
755     line => $self->{line_prev}, ## "<" in "</>"
756     column => $self->{column_prev} - 1);
757     $self->{state} = DATA_STATE;
758 wakaba 1.5 $self->{s_kwd} = '';
759 wakaba 1.1
760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
761     $self->{line_prev} = $self->{line};
762     $self->{column_prev} = $self->{column};
763     $self->{column}++;
764     $self->{nc}
765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
766     } else {
767     $self->{set_nc}->($self);
768     }
769    
770     redo A;
771     } elsif ($self->{nc} == -1) {
772    
773     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
774 wakaba 1.5 $self->{s_kwd} = '';
775 wakaba 1.1 $self->{state} = DATA_STATE;
776     # reconsume
777    
778     return ({type => CHARACTER_TOKEN, data => '</',
779     line => $l, column => $c,
780     });
781    
782     redo A;
783     } else {
784    
785     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
786     $self->{state} = BOGUS_COMMENT_STATE;
787     $self->{ct} = {type => COMMENT_TOKEN, data => '',
788     line => $self->{line_prev}, # "<" of "</"
789     column => $self->{column_prev} - 1,
790     };
791     ## NOTE: $self->{nc} is intentionally left as is.
792     ## Although the "anything else" case of the spec not explicitly
793     ## states that the next input character is to be reconsumed,
794     ## it will be included to the |data| of the comment token
795     ## generated from the bogus end tag, as defined in the
796     ## "bogus comment state" entry.
797     redo A;
798     }
799     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
800     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
801     if (length $ch) {
802     my $CH = $ch;
803     $ch =~ tr/a-z/A-Z/;
804     my $nch = chr $self->{nc};
805     if ($nch eq $ch or $nch eq $CH) {
806    
807     ## Stay in the state.
808     $self->{s_kwd} .= $nch;
809    
810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
811     $self->{line_prev} = $self->{line};
812     $self->{column_prev} = $self->{column};
813     $self->{column}++;
814     $self->{nc}
815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
816     } else {
817     $self->{set_nc}->($self);
818     }
819    
820     redo A;
821     } else {
822    
823     $self->{state} = DATA_STATE;
824 wakaba 1.5 $self->{s_kwd} = '';
825 wakaba 1.1 ## Reconsume.
826     return ({type => CHARACTER_TOKEN,
827     data => '</' . $self->{s_kwd},
828     line => $self->{line_prev},
829     column => $self->{column_prev} - 1 - length $self->{s_kwd},
830     });
831     redo A;
832     }
833     } else { # after "<{tag-name}"
834     unless ($is_space->{$self->{nc}} or
835     {
836     0x003E => 1, # >
837     0x002F => 1, # /
838     -1 => 1, # EOF
839     }->{$self->{nc}}) {
840    
841     ## Reconsume.
842     $self->{state} = DATA_STATE;
843 wakaba 1.5 $self->{s_kwd} = '';
844 wakaba 1.1 return ({type => CHARACTER_TOKEN,
845     data => '</' . $self->{s_kwd},
846     line => $self->{line_prev},
847     column => $self->{column_prev} - 1 - length $self->{s_kwd},
848     });
849     redo A;
850     } else {
851    
852     $self->{ct}
853     = {type => END_TAG_TOKEN,
854     tag_name => $self->{last_stag_name},
855     line => $self->{line_prev},
856     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
857     $self->{state} = TAG_NAME_STATE;
858     ## Reconsume.
859     redo A;
860     }
861     }
862     } elsif ($self->{state} == TAG_NAME_STATE) {
863     if ($is_space->{$self->{nc}}) {
864    
865     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
866    
867     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
868     $self->{line_prev} = $self->{line};
869     $self->{column_prev} = $self->{column};
870     $self->{column}++;
871     $self->{nc}
872     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
873     } else {
874     $self->{set_nc}->($self);
875     }
876    
877     redo A;
878     } elsif ($self->{nc} == 0x003E) { # >
879     if ($self->{ct}->{type} == START_TAG_TOKEN) {
880    
881     $self->{last_stag_name} = $self->{ct}->{tag_name};
882     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
883     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
884     #if ($self->{ct}->{attributes}) {
885     # ## NOTE: This should never be reached.
886     # !!! cp (36);
887     # !!! parse-error (type => 'end tag attribute');
888     #} else {
889    
890     #}
891     } else {
892     die "$0: $self->{ct}->{type}: Unknown token type";
893     }
894     $self->{state} = DATA_STATE;
895 wakaba 1.5 $self->{s_kwd} = '';
896 wakaba 1.1
897     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
898     $self->{line_prev} = $self->{line};
899     $self->{column_prev} = $self->{column};
900     $self->{column}++;
901     $self->{nc}
902     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
903     } else {
904     $self->{set_nc}->($self);
905     }
906    
907    
908     return ($self->{ct}); # start tag or end tag
909    
910     redo A;
911     } elsif (0x0041 <= $self->{nc} and
912     $self->{nc} <= 0x005A) { # A..Z
913    
914 wakaba 1.4 $self->{ct}->{tag_name}
915     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
916 wakaba 1.1 # start tag or end tag
917     ## Stay in this state
918    
919     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
920     $self->{line_prev} = $self->{line};
921     $self->{column_prev} = $self->{column};
922     $self->{column}++;
923     $self->{nc}
924     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
925     } else {
926     $self->{set_nc}->($self);
927     }
928    
929     redo A;
930     } elsif ($self->{nc} == -1) {
931     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
932     if ($self->{ct}->{type} == START_TAG_TOKEN) {
933    
934     $self->{last_stag_name} = $self->{ct}->{tag_name};
935     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
936     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
937     #if ($self->{ct}->{attributes}) {
938     # ## NOTE: This state should never be reached.
939     # !!! cp (40);
940     # !!! parse-error (type => 'end tag attribute');
941     #} else {
942    
943     #}
944     } else {
945     die "$0: $self->{ct}->{type}: Unknown token type";
946     }
947     $self->{state} = DATA_STATE;
948 wakaba 1.5 $self->{s_kwd} = '';
949 wakaba 1.1 # reconsume
950    
951     return ($self->{ct}); # start tag or end tag
952    
953     redo A;
954     } elsif ($self->{nc} == 0x002F) { # /
955    
956     $self->{state} = SELF_CLOSING_START_TAG_STATE;
957    
958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
959     $self->{line_prev} = $self->{line};
960     $self->{column_prev} = $self->{column};
961     $self->{column}++;
962     $self->{nc}
963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
964     } else {
965     $self->{set_nc}->($self);
966     }
967    
968     redo A;
969     } else {
970    
971     $self->{ct}->{tag_name} .= chr $self->{nc};
972     # start tag or end tag
973     ## Stay in the state
974    
975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
976     $self->{line_prev} = $self->{line};
977     $self->{column_prev} = $self->{column};
978     $self->{column}++;
979     $self->{nc}
980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
981     } else {
982     $self->{set_nc}->($self);
983     }
984    
985     redo A;
986     }
987     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
988     if ($is_space->{$self->{nc}}) {
989    
990     ## Stay in the state
991    
992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
993     $self->{line_prev} = $self->{line};
994     $self->{column_prev} = $self->{column};
995     $self->{column}++;
996     $self->{nc}
997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
998     } else {
999     $self->{set_nc}->($self);
1000     }
1001    
1002     redo A;
1003     } elsif ($self->{nc} == 0x003E) { # >
1004     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1005    
1006     $self->{last_stag_name} = $self->{ct}->{tag_name};
1007     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1008     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1009     if ($self->{ct}->{attributes}) {
1010    
1011     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1012     } else {
1013    
1014     }
1015     } else {
1016     die "$0: $self->{ct}->{type}: Unknown token type";
1017     }
1018     $self->{state} = DATA_STATE;
1019 wakaba 1.5 $self->{s_kwd} = '';
1020 wakaba 1.1
1021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1022     $self->{line_prev} = $self->{line};
1023     $self->{column_prev} = $self->{column};
1024     $self->{column}++;
1025     $self->{nc}
1026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1027     } else {
1028     $self->{set_nc}->($self);
1029     }
1030    
1031    
1032     return ($self->{ct}); # start tag or end tag
1033    
1034     redo A;
1035     } elsif (0x0041 <= $self->{nc} and
1036     $self->{nc} <= 0x005A) { # A..Z
1037    
1038     $self->{ca}
1039 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1040 wakaba 1.1 value => '',
1041     line => $self->{line}, column => $self->{column}};
1042     $self->{state} = ATTRIBUTE_NAME_STATE;
1043    
1044     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1045     $self->{line_prev} = $self->{line};
1046     $self->{column_prev} = $self->{column};
1047     $self->{column}++;
1048     $self->{nc}
1049     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1050     } else {
1051     $self->{set_nc}->($self);
1052     }
1053    
1054     redo A;
1055     } elsif ($self->{nc} == 0x002F) { # /
1056    
1057     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1058    
1059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1060     $self->{line_prev} = $self->{line};
1061     $self->{column_prev} = $self->{column};
1062     $self->{column}++;
1063     $self->{nc}
1064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1065     } else {
1066     $self->{set_nc}->($self);
1067     }
1068    
1069     redo A;
1070     } elsif ($self->{nc} == -1) {
1071     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1072     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1073    
1074     $self->{last_stag_name} = $self->{ct}->{tag_name};
1075     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1076     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1077     if ($self->{ct}->{attributes}) {
1078    
1079     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1080     } else {
1081    
1082     }
1083     } else {
1084     die "$0: $self->{ct}->{type}: Unknown token type";
1085     }
1086     $self->{state} = DATA_STATE;
1087 wakaba 1.5 $self->{s_kwd} = '';
1088 wakaba 1.1 # reconsume
1089    
1090     return ($self->{ct}); # start tag or end tag
1091    
1092     redo A;
1093     } else {
1094     if ({
1095     0x0022 => 1, # "
1096     0x0027 => 1, # '
1097     0x003D => 1, # =
1098     }->{$self->{nc}}) {
1099    
1100     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1101     } else {
1102    
1103     }
1104     $self->{ca}
1105     = {name => chr ($self->{nc}),
1106     value => '',
1107     line => $self->{line}, column => $self->{column}};
1108     $self->{state} = ATTRIBUTE_NAME_STATE;
1109    
1110     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1111     $self->{line_prev} = $self->{line};
1112     $self->{column_prev} = $self->{column};
1113     $self->{column}++;
1114     $self->{nc}
1115     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1116     } else {
1117     $self->{set_nc}->($self);
1118     }
1119    
1120     redo A;
1121     }
1122     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1123     my $before_leave = sub {
1124     if (exists $self->{ct}->{attributes} # start tag or end tag
1125     ->{$self->{ca}->{name}}) { # MUST
1126    
1127     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1128     ## Discard $self->{ca} # MUST
1129     } else {
1130    
1131     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1132     = $self->{ca};
1133     }
1134     }; # $before_leave
1135    
1136     if ($is_space->{$self->{nc}}) {
1137    
1138     $before_leave->();
1139     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1140    
1141     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1142     $self->{line_prev} = $self->{line};
1143     $self->{column_prev} = $self->{column};
1144     $self->{column}++;
1145     $self->{nc}
1146     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1147     } else {
1148     $self->{set_nc}->($self);
1149     }
1150    
1151     redo A;
1152     } elsif ($self->{nc} == 0x003D) { # =
1153    
1154     $before_leave->();
1155     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1156    
1157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1158     $self->{line_prev} = $self->{line};
1159     $self->{column_prev} = $self->{column};
1160     $self->{column}++;
1161     $self->{nc}
1162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1163     } else {
1164     $self->{set_nc}->($self);
1165     }
1166    
1167     redo A;
1168     } elsif ($self->{nc} == 0x003E) { # >
1169     $before_leave->();
1170     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171    
1172     $self->{last_stag_name} = $self->{ct}->{tag_name};
1173     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174    
1175     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1176     if ($self->{ct}->{attributes}) {
1177     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1178     }
1179     } else {
1180     die "$0: $self->{ct}->{type}: Unknown token type";
1181     }
1182     $self->{state} = DATA_STATE;
1183 wakaba 1.5 $self->{s_kwd} = '';
1184 wakaba 1.1
1185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1186     $self->{line_prev} = $self->{line};
1187     $self->{column_prev} = $self->{column};
1188     $self->{column}++;
1189     $self->{nc}
1190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1191     } else {
1192     $self->{set_nc}->($self);
1193     }
1194    
1195    
1196     return ($self->{ct}); # start tag or end tag
1197    
1198     redo A;
1199     } elsif (0x0041 <= $self->{nc} and
1200     $self->{nc} <= 0x005A) { # A..Z
1201    
1202 wakaba 1.4 $self->{ca}->{name}
1203     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1204 wakaba 1.1 ## Stay in the state
1205    
1206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1207     $self->{line_prev} = $self->{line};
1208     $self->{column_prev} = $self->{column};
1209     $self->{column}++;
1210     $self->{nc}
1211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1212     } else {
1213     $self->{set_nc}->($self);
1214     }
1215    
1216     redo A;
1217     } elsif ($self->{nc} == 0x002F) { # /
1218    
1219     $before_leave->();
1220     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1221    
1222     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1223     $self->{line_prev} = $self->{line};
1224     $self->{column_prev} = $self->{column};
1225     $self->{column}++;
1226     $self->{nc}
1227     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1228     } else {
1229     $self->{set_nc}->($self);
1230     }
1231    
1232     redo A;
1233     } elsif ($self->{nc} == -1) {
1234     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1235     $before_leave->();
1236     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1237    
1238     $self->{last_stag_name} = $self->{ct}->{tag_name};
1239     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1240     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1241     if ($self->{ct}->{attributes}) {
1242    
1243     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1244     } else {
1245     ## NOTE: This state should never be reached.
1246    
1247     }
1248     } else {
1249     die "$0: $self->{ct}->{type}: Unknown token type";
1250     }
1251     $self->{state} = DATA_STATE;
1252 wakaba 1.5 $self->{s_kwd} = '';
1253 wakaba 1.1 # reconsume
1254    
1255     return ($self->{ct}); # start tag or end tag
1256    
1257     redo A;
1258     } else {
1259     if ($self->{nc} == 0x0022 or # "
1260     $self->{nc} == 0x0027) { # '
1261    
1262     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1263     } else {
1264    
1265     }
1266     $self->{ca}->{name} .= chr ($self->{nc});
1267     ## Stay in the state
1268    
1269     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1270     $self->{line_prev} = $self->{line};
1271     $self->{column_prev} = $self->{column};
1272     $self->{column}++;
1273     $self->{nc}
1274     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1275     } else {
1276     $self->{set_nc}->($self);
1277     }
1278    
1279     redo A;
1280     }
1281     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1282     if ($is_space->{$self->{nc}}) {
1283    
1284     ## Stay in the state
1285    
1286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1287     $self->{line_prev} = $self->{line};
1288     $self->{column_prev} = $self->{column};
1289     $self->{column}++;
1290     $self->{nc}
1291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1292     } else {
1293     $self->{set_nc}->($self);
1294     }
1295    
1296     redo A;
1297     } elsif ($self->{nc} == 0x003D) { # =
1298    
1299     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1300    
1301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1302     $self->{line_prev} = $self->{line};
1303     $self->{column_prev} = $self->{column};
1304     $self->{column}++;
1305     $self->{nc}
1306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1307     } else {
1308     $self->{set_nc}->($self);
1309     }
1310    
1311     redo A;
1312     } elsif ($self->{nc} == 0x003E) { # >
1313     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1314    
1315     $self->{last_stag_name} = $self->{ct}->{tag_name};
1316     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1317     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1318     if ($self->{ct}->{attributes}) {
1319    
1320     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1321     } else {
1322     ## NOTE: This state should never be reached.
1323    
1324     }
1325     } else {
1326     die "$0: $self->{ct}->{type}: Unknown token type";
1327     }
1328     $self->{state} = DATA_STATE;
1329 wakaba 1.5 $self->{s_kwd} = '';
1330 wakaba 1.1
1331     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1332     $self->{line_prev} = $self->{line};
1333     $self->{column_prev} = $self->{column};
1334     $self->{column}++;
1335     $self->{nc}
1336     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1337     } else {
1338     $self->{set_nc}->($self);
1339     }
1340    
1341    
1342     return ($self->{ct}); # start tag or end tag
1343    
1344     redo A;
1345     } elsif (0x0041 <= $self->{nc} and
1346     $self->{nc} <= 0x005A) { # A..Z
1347    
1348     $self->{ca}
1349 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1350 wakaba 1.1 value => '',
1351     line => $self->{line}, column => $self->{column}};
1352     $self->{state} = ATTRIBUTE_NAME_STATE;
1353    
1354     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1355     $self->{line_prev} = $self->{line};
1356     $self->{column_prev} = $self->{column};
1357     $self->{column}++;
1358     $self->{nc}
1359     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1360     } else {
1361     $self->{set_nc}->($self);
1362     }
1363    
1364     redo A;
1365     } elsif ($self->{nc} == 0x002F) { # /
1366    
1367     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1368    
1369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1370     $self->{line_prev} = $self->{line};
1371     $self->{column_prev} = $self->{column};
1372     $self->{column}++;
1373     $self->{nc}
1374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1375     } else {
1376     $self->{set_nc}->($self);
1377     }
1378    
1379     redo A;
1380     } elsif ($self->{nc} == -1) {
1381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1382     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1383    
1384     $self->{last_stag_name} = $self->{ct}->{tag_name};
1385     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1386     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1387     if ($self->{ct}->{attributes}) {
1388    
1389     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1390     } else {
1391     ## NOTE: This state should never be reached.
1392    
1393     }
1394     } else {
1395     die "$0: $self->{ct}->{type}: Unknown token type";
1396     }
1397 wakaba 1.5 $self->{s_kwd} = '';
1398 wakaba 1.1 $self->{state} = DATA_STATE;
1399     # reconsume
1400    
1401     return ($self->{ct}); # start tag or end tag
1402    
1403     redo A;
1404     } else {
1405     if ($self->{nc} == 0x0022 or # "
1406     $self->{nc} == 0x0027) { # '
1407    
1408     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1409     } else {
1410    
1411     }
1412     $self->{ca}
1413     = {name => chr ($self->{nc}),
1414     value => '',
1415     line => $self->{line}, column => $self->{column}};
1416     $self->{state} = ATTRIBUTE_NAME_STATE;
1417    
1418     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1419     $self->{line_prev} = $self->{line};
1420     $self->{column_prev} = $self->{column};
1421     $self->{column}++;
1422     $self->{nc}
1423     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1424     } else {
1425     $self->{set_nc}->($self);
1426     }
1427    
1428     redo A;
1429     }
1430     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1431     if ($is_space->{$self->{nc}}) {
1432    
1433     ## Stay in the state
1434    
1435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1436     $self->{line_prev} = $self->{line};
1437     $self->{column_prev} = $self->{column};
1438     $self->{column}++;
1439     $self->{nc}
1440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1441     } else {
1442     $self->{set_nc}->($self);
1443     }
1444    
1445     redo A;
1446     } elsif ($self->{nc} == 0x0022) { # "
1447    
1448     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1449    
1450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1451     $self->{line_prev} = $self->{line};
1452     $self->{column_prev} = $self->{column};
1453     $self->{column}++;
1454     $self->{nc}
1455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1456     } else {
1457     $self->{set_nc}->($self);
1458     }
1459    
1460     redo A;
1461     } elsif ($self->{nc} == 0x0026) { # &
1462    
1463     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1464     ## reconsume
1465     redo A;
1466     } elsif ($self->{nc} == 0x0027) { # '
1467    
1468     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1469    
1470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1471     $self->{line_prev} = $self->{line};
1472     $self->{column_prev} = $self->{column};
1473     $self->{column}++;
1474     $self->{nc}
1475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1476     } else {
1477     $self->{set_nc}->($self);
1478     }
1479    
1480     redo A;
1481     } elsif ($self->{nc} == 0x003E) { # >
1482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1483     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1484    
1485     $self->{last_stag_name} = $self->{ct}->{tag_name};
1486     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1487     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1488     if ($self->{ct}->{attributes}) {
1489    
1490     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1491     } else {
1492     ## NOTE: This state should never be reached.
1493    
1494     }
1495     } else {
1496     die "$0: $self->{ct}->{type}: Unknown token type";
1497     }
1498     $self->{state} = DATA_STATE;
1499 wakaba 1.5 $self->{s_kwd} = '';
1500 wakaba 1.1
1501     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1502     $self->{line_prev} = $self->{line};
1503     $self->{column_prev} = $self->{column};
1504     $self->{column}++;
1505     $self->{nc}
1506     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1507     } else {
1508     $self->{set_nc}->($self);
1509     }
1510    
1511    
1512     return ($self->{ct}); # start tag or end tag
1513    
1514     redo A;
1515     } elsif ($self->{nc} == -1) {
1516     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1517     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1518    
1519     $self->{last_stag_name} = $self->{ct}->{tag_name};
1520     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1521     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1522     if ($self->{ct}->{attributes}) {
1523    
1524     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1525     } else {
1526     ## NOTE: This state should never be reached.
1527    
1528     }
1529     } else {
1530     die "$0: $self->{ct}->{type}: Unknown token type";
1531     }
1532     $self->{state} = DATA_STATE;
1533 wakaba 1.5 $self->{s_kwd} = '';
1534 wakaba 1.1 ## reconsume
1535    
1536     return ($self->{ct}); # start tag or end tag
1537    
1538     redo A;
1539     } else {
1540     if ($self->{nc} == 0x003D) { # =
1541    
1542     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1543     } else {
1544    
1545     }
1546     $self->{ca}->{value} .= chr ($self->{nc});
1547     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1548    
1549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1550     $self->{line_prev} = $self->{line};
1551     $self->{column_prev} = $self->{column};
1552     $self->{column}++;
1553     $self->{nc}
1554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1555     } else {
1556     $self->{set_nc}->($self);
1557     }
1558    
1559     redo A;
1560     }
1561     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1562     if ($self->{nc} == 0x0022) { # "
1563    
1564     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1565    
1566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1567     $self->{line_prev} = $self->{line};
1568     $self->{column_prev} = $self->{column};
1569     $self->{column}++;
1570     $self->{nc}
1571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1572     } else {
1573     $self->{set_nc}->($self);
1574     }
1575    
1576     redo A;
1577     } elsif ($self->{nc} == 0x0026) { # &
1578    
1579     ## NOTE: In the spec, the tokenizer is switched to the
1580     ## "entity in attribute value state". In this implementation, the
1581     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1582     ## implementation of the "consume a character reference" algorithm.
1583     $self->{prev_state} = $self->{state};
1584     $self->{entity_add} = 0x0022; # "
1585     $self->{state} = ENTITY_STATE;
1586    
1587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1588     $self->{line_prev} = $self->{line};
1589     $self->{column_prev} = $self->{column};
1590     $self->{column}++;
1591     $self->{nc}
1592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1593     } else {
1594     $self->{set_nc}->($self);
1595     }
1596    
1597     redo A;
1598     } elsif ($self->{nc} == -1) {
1599     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1600     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1601    
1602     $self->{last_stag_name} = $self->{ct}->{tag_name};
1603     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1604     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1605     if ($self->{ct}->{attributes}) {
1606    
1607     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1608     } else {
1609     ## NOTE: This state should never be reached.
1610    
1611     }
1612     } else {
1613     die "$0: $self->{ct}->{type}: Unknown token type";
1614     }
1615     $self->{state} = DATA_STATE;
1616 wakaba 1.5 $self->{s_kwd} = '';
1617 wakaba 1.1 ## reconsume
1618    
1619     return ($self->{ct}); # start tag or end tag
1620    
1621     redo A;
1622     } else {
1623    
1624     $self->{ca}->{value} .= chr ($self->{nc});
1625     $self->{read_until}->($self->{ca}->{value},
1626     q["&],
1627     length $self->{ca}->{value});
1628    
1629     ## Stay in the state
1630    
1631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1632     $self->{line_prev} = $self->{line};
1633     $self->{column_prev} = $self->{column};
1634     $self->{column}++;
1635     $self->{nc}
1636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1637     } else {
1638     $self->{set_nc}->($self);
1639     }
1640    
1641     redo A;
1642     }
1643     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1644     if ($self->{nc} == 0x0027) { # '
1645    
1646     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1647    
1648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1649     $self->{line_prev} = $self->{line};
1650     $self->{column_prev} = $self->{column};
1651     $self->{column}++;
1652     $self->{nc}
1653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1654     } else {
1655     $self->{set_nc}->($self);
1656     }
1657    
1658     redo A;
1659     } elsif ($self->{nc} == 0x0026) { # &
1660    
1661     ## NOTE: In the spec, the tokenizer is switched to the
1662     ## "entity in attribute value state". In this implementation, the
1663     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1664     ## implementation of the "consume a character reference" algorithm.
1665     $self->{entity_add} = 0x0027; # '
1666     $self->{prev_state} = $self->{state};
1667     $self->{state} = ENTITY_STATE;
1668    
1669     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1670     $self->{line_prev} = $self->{line};
1671     $self->{column_prev} = $self->{column};
1672     $self->{column}++;
1673     $self->{nc}
1674     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1675     } else {
1676     $self->{set_nc}->($self);
1677     }
1678    
1679     redo A;
1680     } elsif ($self->{nc} == -1) {
1681     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1682     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1683    
1684     $self->{last_stag_name} = $self->{ct}->{tag_name};
1685     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1686     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1687     if ($self->{ct}->{attributes}) {
1688    
1689     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1690     } else {
1691     ## NOTE: This state should never be reached.
1692    
1693     }
1694     } else {
1695     die "$0: $self->{ct}->{type}: Unknown token type";
1696     }
1697     $self->{state} = DATA_STATE;
1698 wakaba 1.5 $self->{s_kwd} = '';
1699 wakaba 1.1 ## reconsume
1700    
1701     return ($self->{ct}); # start tag or end tag
1702    
1703     redo A;
1704     } else {
1705    
1706     $self->{ca}->{value} .= chr ($self->{nc});
1707     $self->{read_until}->($self->{ca}->{value},
1708     q['&],
1709     length $self->{ca}->{value});
1710    
1711     ## Stay in the state
1712    
1713     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1714     $self->{line_prev} = $self->{line};
1715     $self->{column_prev} = $self->{column};
1716     $self->{column}++;
1717     $self->{nc}
1718     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1719     } else {
1720     $self->{set_nc}->($self);
1721     }
1722    
1723     redo A;
1724     }
1725     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1726     if ($is_space->{$self->{nc}}) {
1727    
1728     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1729    
1730     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1731     $self->{line_prev} = $self->{line};
1732     $self->{column_prev} = $self->{column};
1733     $self->{column}++;
1734     $self->{nc}
1735     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1736     } else {
1737     $self->{set_nc}->($self);
1738     }
1739    
1740     redo A;
1741     } elsif ($self->{nc} == 0x0026) { # &
1742    
1743     ## NOTE: In the spec, the tokenizer is switched to the
1744     ## "entity in attribute value state". In this implementation, the
1745     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1746     ## implementation of the "consume a character reference" algorithm.
1747     $self->{entity_add} = -1;
1748     $self->{prev_state} = $self->{state};
1749     $self->{state} = ENTITY_STATE;
1750    
1751     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1752     $self->{line_prev} = $self->{line};
1753     $self->{column_prev} = $self->{column};
1754     $self->{column}++;
1755     $self->{nc}
1756     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1757     } else {
1758     $self->{set_nc}->($self);
1759     }
1760    
1761     redo A;
1762     } elsif ($self->{nc} == 0x003E) { # >
1763     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1764    
1765     $self->{last_stag_name} = $self->{ct}->{tag_name};
1766     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1767     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1768     if ($self->{ct}->{attributes}) {
1769    
1770     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1771     } else {
1772     ## NOTE: This state should never be reached.
1773    
1774     }
1775     } else {
1776     die "$0: $self->{ct}->{type}: Unknown token type";
1777     }
1778     $self->{state} = DATA_STATE;
1779 wakaba 1.5 $self->{s_kwd} = '';
1780 wakaba 1.1
1781     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1782     $self->{line_prev} = $self->{line};
1783     $self->{column_prev} = $self->{column};
1784     $self->{column}++;
1785     $self->{nc}
1786     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1787     } else {
1788     $self->{set_nc}->($self);
1789     }
1790    
1791    
1792     return ($self->{ct}); # start tag or end tag
1793    
1794     redo A;
1795     } elsif ($self->{nc} == -1) {
1796     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1797     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1798    
1799     $self->{last_stag_name} = $self->{ct}->{tag_name};
1800     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1801     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1802     if ($self->{ct}->{attributes}) {
1803    
1804     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1805     } else {
1806     ## NOTE: This state should never be reached.
1807    
1808     }
1809     } else {
1810     die "$0: $self->{ct}->{type}: Unknown token type";
1811     }
1812     $self->{state} = DATA_STATE;
1813 wakaba 1.5 $self->{s_kwd} = '';
1814 wakaba 1.1 ## reconsume
1815    
1816     return ($self->{ct}); # start tag or end tag
1817    
1818     redo A;
1819     } else {
1820     if ({
1821     0x0022 => 1, # "
1822     0x0027 => 1, # '
1823     0x003D => 1, # =
1824     }->{$self->{nc}}) {
1825    
1826     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1827     } else {
1828    
1829     }
1830     $self->{ca}->{value} .= chr ($self->{nc});
1831     $self->{read_until}->($self->{ca}->{value},
1832     q["'=& >],
1833     length $self->{ca}->{value});
1834    
1835     ## Stay in the state
1836    
1837     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1838     $self->{line_prev} = $self->{line};
1839     $self->{column_prev} = $self->{column};
1840     $self->{column}++;
1841     $self->{nc}
1842     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1843     } else {
1844     $self->{set_nc}->($self);
1845     }
1846    
1847     redo A;
1848     }
1849     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1850     if ($is_space->{$self->{nc}}) {
1851    
1852     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1853    
1854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1855     $self->{line_prev} = $self->{line};
1856     $self->{column_prev} = $self->{column};
1857     $self->{column}++;
1858     $self->{nc}
1859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1860     } else {
1861     $self->{set_nc}->($self);
1862     }
1863    
1864     redo A;
1865     } elsif ($self->{nc} == 0x003E) { # >
1866     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1867    
1868     $self->{last_stag_name} = $self->{ct}->{tag_name};
1869     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1870     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1871     if ($self->{ct}->{attributes}) {
1872    
1873     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1874     } else {
1875     ## NOTE: This state should never be reached.
1876    
1877     }
1878     } else {
1879     die "$0: $self->{ct}->{type}: Unknown token type";
1880     }
1881     $self->{state} = DATA_STATE;
1882 wakaba 1.5 $self->{s_kwd} = '';
1883 wakaba 1.1
1884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1885     $self->{line_prev} = $self->{line};
1886     $self->{column_prev} = $self->{column};
1887     $self->{column}++;
1888     $self->{nc}
1889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1890     } else {
1891     $self->{set_nc}->($self);
1892     }
1893    
1894    
1895     return ($self->{ct}); # start tag or end tag
1896    
1897     redo A;
1898     } elsif ($self->{nc} == 0x002F) { # /
1899    
1900     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1901    
1902     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1903     $self->{line_prev} = $self->{line};
1904     $self->{column_prev} = $self->{column};
1905     $self->{column}++;
1906     $self->{nc}
1907     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1908     } else {
1909     $self->{set_nc}->($self);
1910     }
1911    
1912     redo A;
1913     } elsif ($self->{nc} == -1) {
1914     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1915     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1916    
1917     $self->{last_stag_name} = $self->{ct}->{tag_name};
1918     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1919     if ($self->{ct}->{attributes}) {
1920    
1921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1922     } else {
1923     ## NOTE: This state should never be reached.
1924    
1925     }
1926     } else {
1927     die "$0: $self->{ct}->{type}: Unknown token type";
1928     }
1929     $self->{state} = DATA_STATE;
1930 wakaba 1.5 $self->{s_kwd} = '';
1931 wakaba 1.1 ## Reconsume.
1932     return ($self->{ct}); # start tag or end tag
1933     redo A;
1934     } else {
1935    
1936     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1937     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1938     ## reconsume
1939     redo A;
1940     }
1941     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1942     if ($self->{nc} == 0x003E) { # >
1943     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1944    
1945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1946     ## TODO: Different type than slash in start tag
1947     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1948     if ($self->{ct}->{attributes}) {
1949    
1950     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1951     } else {
1952    
1953     }
1954     ## TODO: Test |<title></title/>|
1955     } else {
1956    
1957     $self->{self_closing} = 1;
1958     }
1959    
1960     $self->{state} = DATA_STATE;
1961 wakaba 1.5 $self->{s_kwd} = '';
1962 wakaba 1.1
1963     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1964     $self->{line_prev} = $self->{line};
1965     $self->{column_prev} = $self->{column};
1966     $self->{column}++;
1967     $self->{nc}
1968     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1969     } else {
1970     $self->{set_nc}->($self);
1971     }
1972    
1973    
1974     return ($self->{ct}); # start tag or end tag
1975    
1976     redo A;
1977     } elsif ($self->{nc} == -1) {
1978     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1979     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1980    
1981     $self->{last_stag_name} = $self->{ct}->{tag_name};
1982     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1983     if ($self->{ct}->{attributes}) {
1984    
1985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1986     } else {
1987     ## NOTE: This state should never be reached.
1988    
1989     }
1990     } else {
1991     die "$0: $self->{ct}->{type}: Unknown token type";
1992     }
1993     $self->{state} = DATA_STATE;
1994 wakaba 1.5 $self->{s_kwd} = '';
1995 wakaba 1.1 ## Reconsume.
1996     return ($self->{ct}); # start tag or end tag
1997     redo A;
1998     } else {
1999    
2000     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2001     ## TODO: This error type is wrong.
2002     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2003     ## Reconsume.
2004     redo A;
2005     }
2006     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2007     ## (only happen if PCDATA state)
2008    
2009     ## NOTE: Unlike spec's "bogus comment state", this implementation
2010     ## consumes characters one-by-one basis.
2011    
2012     if ($self->{nc} == 0x003E) { # >
2013    
2014     $self->{state} = DATA_STATE;
2015 wakaba 1.5 $self->{s_kwd} = '';
2016 wakaba 1.1
2017     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2018     $self->{line_prev} = $self->{line};
2019     $self->{column_prev} = $self->{column};
2020     $self->{column}++;
2021     $self->{nc}
2022     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2023     } else {
2024     $self->{set_nc}->($self);
2025     }
2026    
2027    
2028     return ($self->{ct}); # comment
2029     redo A;
2030     } elsif ($self->{nc} == -1) {
2031    
2032     $self->{state} = DATA_STATE;
2033 wakaba 1.5 $self->{s_kwd} = '';
2034 wakaba 1.1 ## reconsume
2035    
2036     return ($self->{ct}); # comment
2037     redo A;
2038     } else {
2039    
2040     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2041     $self->{read_until}->($self->{ct}->{data},
2042     q[>],
2043     length $self->{ct}->{data});
2044    
2045     ## Stay in the state.
2046    
2047     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2048     $self->{line_prev} = $self->{line};
2049     $self->{column_prev} = $self->{column};
2050     $self->{column}++;
2051     $self->{nc}
2052     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2053     } else {
2054     $self->{set_nc}->($self);
2055     }
2056    
2057     redo A;
2058     }
2059     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2060     ## (only happen if PCDATA state)
2061    
2062     if ($self->{nc} == 0x002D) { # -
2063    
2064     $self->{state} = MD_HYPHEN_STATE;
2065    
2066     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2067     $self->{line_prev} = $self->{line};
2068     $self->{column_prev} = $self->{column};
2069     $self->{column}++;
2070     $self->{nc}
2071     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2072     } else {
2073     $self->{set_nc}->($self);
2074     }
2075    
2076     redo A;
2077     } elsif ($self->{nc} == 0x0044 or # D
2078     $self->{nc} == 0x0064) { # d
2079     ## ASCII case-insensitive.
2080    
2081     $self->{state} = MD_DOCTYPE_STATE;
2082     $self->{s_kwd} = chr $self->{nc};
2083    
2084     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2085     $self->{line_prev} = $self->{line};
2086     $self->{column_prev} = $self->{column};
2087     $self->{column}++;
2088     $self->{nc}
2089     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2090     } else {
2091     $self->{set_nc}->($self);
2092     }
2093    
2094     redo A;
2095 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2096     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2097     $self->{is_xml}) and
2098 wakaba 1.1 $self->{nc} == 0x005B) { # [
2099    
2100     $self->{state} = MD_CDATA_STATE;
2101     $self->{s_kwd} = '[';
2102    
2103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2104     $self->{line_prev} = $self->{line};
2105     $self->{column_prev} = $self->{column};
2106     $self->{column}++;
2107     $self->{nc}
2108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2109     } else {
2110     $self->{set_nc}->($self);
2111     }
2112    
2113     redo A;
2114     } else {
2115    
2116     }
2117    
2118     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2119     line => $self->{line_prev},
2120     column => $self->{column_prev} - 1);
2121     ## Reconsume.
2122     $self->{state} = BOGUS_COMMENT_STATE;
2123     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2124     line => $self->{line_prev},
2125     column => $self->{column_prev} - 1,
2126     };
2127     redo A;
2128     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2129     if ($self->{nc} == 0x002D) { # -
2130    
2131     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2132     line => $self->{line_prev},
2133     column => $self->{column_prev} - 2,
2134     };
2135     $self->{state} = COMMENT_START_STATE;
2136    
2137     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2138     $self->{line_prev} = $self->{line};
2139     $self->{column_prev} = $self->{column};
2140     $self->{column}++;
2141     $self->{nc}
2142     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2143     } else {
2144     $self->{set_nc}->($self);
2145     }
2146    
2147     redo A;
2148     } else {
2149    
2150     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2151     line => $self->{line_prev},
2152     column => $self->{column_prev} - 2);
2153     $self->{state} = BOGUS_COMMENT_STATE;
2154     ## Reconsume.
2155     $self->{ct} = {type => COMMENT_TOKEN,
2156     data => '-',
2157     line => $self->{line_prev},
2158     column => $self->{column_prev} - 2,
2159     };
2160     redo A;
2161     }
2162     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2163     ## ASCII case-insensitive.
2164     if ($self->{nc} == [
2165     undef,
2166     0x004F, # O
2167     0x0043, # C
2168     0x0054, # T
2169     0x0059, # Y
2170     0x0050, # P
2171     ]->[length $self->{s_kwd}] or
2172     $self->{nc} == [
2173     undef,
2174     0x006F, # o
2175     0x0063, # c
2176     0x0074, # t
2177     0x0079, # y
2178     0x0070, # p
2179     ]->[length $self->{s_kwd}]) {
2180    
2181     ## Stay in the state.
2182     $self->{s_kwd} .= chr $self->{nc};
2183    
2184     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2185     $self->{line_prev} = $self->{line};
2186     $self->{column_prev} = $self->{column};
2187     $self->{column}++;
2188     $self->{nc}
2189     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2190     } else {
2191     $self->{set_nc}->($self);
2192     }
2193    
2194     redo A;
2195     } elsif ((length $self->{s_kwd}) == 6 and
2196     ($self->{nc} == 0x0045 or # E
2197     $self->{nc} == 0x0065)) { # e
2198    
2199     $self->{state} = DOCTYPE_STATE;
2200     $self->{ct} = {type => DOCTYPE_TOKEN,
2201     quirks => 1,
2202     line => $self->{line_prev},
2203     column => $self->{column_prev} - 7,
2204     };
2205    
2206     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2207     $self->{line_prev} = $self->{line};
2208     $self->{column_prev} = $self->{column};
2209     $self->{column}++;
2210     $self->{nc}
2211     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2212     } else {
2213     $self->{set_nc}->($self);
2214     }
2215    
2216     redo A;
2217     } else {
2218    
2219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2220     line => $self->{line_prev},
2221     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2222     $self->{state} = BOGUS_COMMENT_STATE;
2223     ## Reconsume.
2224     $self->{ct} = {type => COMMENT_TOKEN,
2225     data => $self->{s_kwd},
2226     line => $self->{line_prev},
2227     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2228     };
2229     redo A;
2230     }
2231     } elsif ($self->{state} == MD_CDATA_STATE) {
2232     if ($self->{nc} == {
2233     '[' => 0x0043, # C
2234     '[C' => 0x0044, # D
2235     '[CD' => 0x0041, # A
2236     '[CDA' => 0x0054, # T
2237     '[CDAT' => 0x0041, # A
2238     }->{$self->{s_kwd}}) {
2239    
2240     ## Stay in the state.
2241     $self->{s_kwd} .= chr $self->{nc};
2242    
2243     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2244     $self->{line_prev} = $self->{line};
2245     $self->{column_prev} = $self->{column};
2246     $self->{column}++;
2247     $self->{nc}
2248     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2249     } else {
2250     $self->{set_nc}->($self);
2251     }
2252    
2253     redo A;
2254     } elsif ($self->{s_kwd} eq '[CDATA' and
2255     $self->{nc} == 0x005B) { # [
2256 wakaba 1.6 if ($self->{is_xml} and
2257     not $self->{tainted} and
2258     @{$self->{open_elements} or []} == 0) {
2259 wakaba 1.8
2260 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2261     line => $self->{line_prev},
2262     column => $self->{column_prev} - 7);
2263     $self->{tainted} = 1;
2264 wakaba 1.8 } else {
2265    
2266 wakaba 1.6 }
2267    
2268 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2269     data => '',
2270     line => $self->{line_prev},
2271     column => $self->{column_prev} - 7};
2272     $self->{state} = CDATA_SECTION_STATE;
2273    
2274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275     $self->{line_prev} = $self->{line};
2276     $self->{column_prev} = $self->{column};
2277     $self->{column}++;
2278     $self->{nc}
2279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280     } else {
2281     $self->{set_nc}->($self);
2282     }
2283    
2284     redo A;
2285     } else {
2286    
2287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2288     line => $self->{line_prev},
2289     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2290     $self->{state} = BOGUS_COMMENT_STATE;
2291     ## Reconsume.
2292     $self->{ct} = {type => COMMENT_TOKEN,
2293     data => $self->{s_kwd},
2294     line => $self->{line_prev},
2295     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2296     };
2297     redo A;
2298     }
2299     } elsif ($self->{state} == COMMENT_START_STATE) {
2300     if ($self->{nc} == 0x002D) { # -
2301    
2302     $self->{state} = COMMENT_START_DASH_STATE;
2303    
2304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2305     $self->{line_prev} = $self->{line};
2306     $self->{column_prev} = $self->{column};
2307     $self->{column}++;
2308     $self->{nc}
2309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2310     } else {
2311     $self->{set_nc}->($self);
2312     }
2313    
2314     redo A;
2315     } elsif ($self->{nc} == 0x003E) { # >
2316    
2317     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2318     $self->{state} = DATA_STATE;
2319 wakaba 1.5 $self->{s_kwd} = '';
2320 wakaba 1.1
2321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2322     $self->{line_prev} = $self->{line};
2323     $self->{column_prev} = $self->{column};
2324     $self->{column}++;
2325     $self->{nc}
2326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2327     } else {
2328     $self->{set_nc}->($self);
2329     }
2330    
2331    
2332     return ($self->{ct}); # comment
2333    
2334     redo A;
2335     } elsif ($self->{nc} == -1) {
2336    
2337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2338     $self->{state} = DATA_STATE;
2339 wakaba 1.5 $self->{s_kwd} = '';
2340 wakaba 1.1 ## reconsume
2341    
2342     return ($self->{ct}); # comment
2343    
2344     redo A;
2345     } else {
2346    
2347     $self->{ct}->{data} # comment
2348     .= chr ($self->{nc});
2349     $self->{state} = COMMENT_STATE;
2350    
2351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2352     $self->{line_prev} = $self->{line};
2353     $self->{column_prev} = $self->{column};
2354     $self->{column}++;
2355     $self->{nc}
2356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2357     } else {
2358     $self->{set_nc}->($self);
2359     }
2360    
2361     redo A;
2362     }
2363     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2364     if ($self->{nc} == 0x002D) { # -
2365    
2366     $self->{state} = COMMENT_END_STATE;
2367    
2368     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2369     $self->{line_prev} = $self->{line};
2370     $self->{column_prev} = $self->{column};
2371     $self->{column}++;
2372     $self->{nc}
2373     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2374     } else {
2375     $self->{set_nc}->($self);
2376     }
2377    
2378     redo A;
2379     } elsif ($self->{nc} == 0x003E) { # >
2380    
2381     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2382     $self->{state} = DATA_STATE;
2383 wakaba 1.5 $self->{s_kwd} = '';
2384 wakaba 1.1
2385     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2386     $self->{line_prev} = $self->{line};
2387     $self->{column_prev} = $self->{column};
2388     $self->{column}++;
2389     $self->{nc}
2390     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2391     } else {
2392     $self->{set_nc}->($self);
2393     }
2394    
2395    
2396     return ($self->{ct}); # comment
2397    
2398     redo A;
2399     } elsif ($self->{nc} == -1) {
2400    
2401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2402     $self->{state} = DATA_STATE;
2403 wakaba 1.5 $self->{s_kwd} = '';
2404 wakaba 1.1 ## reconsume
2405    
2406     return ($self->{ct}); # comment
2407    
2408     redo A;
2409     } else {
2410    
2411     $self->{ct}->{data} # comment
2412     .= '-' . chr ($self->{nc});
2413     $self->{state} = COMMENT_STATE;
2414    
2415     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2416     $self->{line_prev} = $self->{line};
2417     $self->{column_prev} = $self->{column};
2418     $self->{column}++;
2419     $self->{nc}
2420     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2421     } else {
2422     $self->{set_nc}->($self);
2423     }
2424    
2425     redo A;
2426     }
2427     } elsif ($self->{state} == COMMENT_STATE) {
2428     if ($self->{nc} == 0x002D) { # -
2429    
2430     $self->{state} = COMMENT_END_DASH_STATE;
2431    
2432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2433     $self->{line_prev} = $self->{line};
2434     $self->{column_prev} = $self->{column};
2435     $self->{column}++;
2436     $self->{nc}
2437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2438     } else {
2439     $self->{set_nc}->($self);
2440     }
2441    
2442     redo A;
2443     } elsif ($self->{nc} == -1) {
2444    
2445     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2446     $self->{state} = DATA_STATE;
2447 wakaba 1.5 $self->{s_kwd} = '';
2448 wakaba 1.1 ## reconsume
2449    
2450     return ($self->{ct}); # comment
2451    
2452     redo A;
2453     } else {
2454    
2455     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2456     $self->{read_until}->($self->{ct}->{data},
2457     q[-],
2458     length $self->{ct}->{data});
2459    
2460     ## Stay in the state
2461    
2462     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2463     $self->{line_prev} = $self->{line};
2464     $self->{column_prev} = $self->{column};
2465     $self->{column}++;
2466     $self->{nc}
2467     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2468     } else {
2469     $self->{set_nc}->($self);
2470     }
2471    
2472     redo A;
2473     }
2474     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2475     if ($self->{nc} == 0x002D) { # -
2476    
2477     $self->{state} = COMMENT_END_STATE;
2478    
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } elsif ($self->{nc} == -1) {
2491    
2492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2493 wakaba 1.5 $self->{s_kwd} = '';
2494 wakaba 1.1 $self->{state} = DATA_STATE;
2495 wakaba 1.5 $self->{s_kwd} = '';
2496 wakaba 1.1 ## reconsume
2497    
2498     return ($self->{ct}); # comment
2499    
2500     redo A;
2501     } else {
2502    
2503     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2504     $self->{state} = COMMENT_STATE;
2505    
2506     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2507     $self->{line_prev} = $self->{line};
2508     $self->{column_prev} = $self->{column};
2509     $self->{column}++;
2510     $self->{nc}
2511     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2512     } else {
2513     $self->{set_nc}->($self);
2514     }
2515    
2516     redo A;
2517     }
2518     } elsif ($self->{state} == COMMENT_END_STATE) {
2519     if ($self->{nc} == 0x003E) { # >
2520    
2521     $self->{state} = DATA_STATE;
2522 wakaba 1.5 $self->{s_kwd} = '';
2523 wakaba 1.1
2524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525     $self->{line_prev} = $self->{line};
2526     $self->{column_prev} = $self->{column};
2527     $self->{column}++;
2528     $self->{nc}
2529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530     } else {
2531     $self->{set_nc}->($self);
2532     }
2533    
2534    
2535     return ($self->{ct}); # comment
2536    
2537     redo A;
2538     } elsif ($self->{nc} == 0x002D) { # -
2539    
2540     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2541     line => $self->{line_prev},
2542     column => $self->{column_prev});
2543     $self->{ct}->{data} .= '-'; # comment
2544     ## Stay in the state
2545    
2546     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2547     $self->{line_prev} = $self->{line};
2548     $self->{column_prev} = $self->{column};
2549     $self->{column}++;
2550     $self->{nc}
2551     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2552     } else {
2553     $self->{set_nc}->($self);
2554     }
2555    
2556     redo A;
2557     } elsif ($self->{nc} == -1) {
2558    
2559     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2560     $self->{state} = DATA_STATE;
2561 wakaba 1.5 $self->{s_kwd} = '';
2562 wakaba 1.1 ## reconsume
2563    
2564     return ($self->{ct}); # comment
2565    
2566     redo A;
2567     } else {
2568    
2569     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2570     line => $self->{line_prev},
2571     column => $self->{column_prev});
2572     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2573     $self->{state} = COMMENT_STATE;
2574    
2575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2576     $self->{line_prev} = $self->{line};
2577     $self->{column_prev} = $self->{column};
2578     $self->{column}++;
2579     $self->{nc}
2580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2581     } else {
2582     $self->{set_nc}->($self);
2583     }
2584    
2585     redo A;
2586     }
2587     } elsif ($self->{state} == DOCTYPE_STATE) {
2588     if ($is_space->{$self->{nc}}) {
2589    
2590     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2591    
2592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2593     $self->{line_prev} = $self->{line};
2594     $self->{column_prev} = $self->{column};
2595     $self->{column}++;
2596     $self->{nc}
2597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2598     } else {
2599     $self->{set_nc}->($self);
2600     }
2601    
2602     redo A;
2603     } else {
2604    
2605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2606     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2607     ## reconsume
2608     redo A;
2609     }
2610     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2611     if ($is_space->{$self->{nc}}) {
2612    
2613     ## Stay in the state
2614    
2615     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2616     $self->{line_prev} = $self->{line};
2617     $self->{column_prev} = $self->{column};
2618     $self->{column}++;
2619     $self->{nc}
2620     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2621     } else {
2622     $self->{set_nc}->($self);
2623     }
2624    
2625     redo A;
2626     } elsif ($self->{nc} == 0x003E) { # >
2627    
2628     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2629     $self->{state} = DATA_STATE;
2630 wakaba 1.5 $self->{s_kwd} = '';
2631 wakaba 1.1
2632     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2633     $self->{line_prev} = $self->{line};
2634     $self->{column_prev} = $self->{column};
2635     $self->{column}++;
2636     $self->{nc}
2637     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2638     } else {
2639     $self->{set_nc}->($self);
2640     }
2641    
2642    
2643     return ($self->{ct}); # DOCTYPE (quirks)
2644    
2645     redo A;
2646     } elsif ($self->{nc} == -1) {
2647    
2648     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2649     $self->{state} = DATA_STATE;
2650 wakaba 1.5 $self->{s_kwd} = '';
2651 wakaba 1.1 ## reconsume
2652    
2653     return ($self->{ct}); # DOCTYPE (quirks)
2654    
2655     redo A;
2656     } else {
2657    
2658     $self->{ct}->{name} = chr $self->{nc};
2659     delete $self->{ct}->{quirks};
2660     $self->{state} = DOCTYPE_NAME_STATE;
2661    
2662     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2663     $self->{line_prev} = $self->{line};
2664     $self->{column_prev} = $self->{column};
2665     $self->{column}++;
2666     $self->{nc}
2667     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2668     } else {
2669     $self->{set_nc}->($self);
2670     }
2671    
2672     redo A;
2673     }
2674     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2675     ## ISSUE: Redundant "First," in the spec.
2676     if ($is_space->{$self->{nc}}) {
2677    
2678     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2679    
2680     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2681     $self->{line_prev} = $self->{line};
2682     $self->{column_prev} = $self->{column};
2683     $self->{column}++;
2684     $self->{nc}
2685     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2686     } else {
2687     $self->{set_nc}->($self);
2688     }
2689    
2690     redo A;
2691     } elsif ($self->{nc} == 0x003E) { # >
2692    
2693     $self->{state} = DATA_STATE;
2694 wakaba 1.5 $self->{s_kwd} = '';
2695 wakaba 1.1
2696     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2697     $self->{line_prev} = $self->{line};
2698     $self->{column_prev} = $self->{column};
2699     $self->{column}++;
2700     $self->{nc}
2701     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2702     } else {
2703     $self->{set_nc}->($self);
2704     }
2705    
2706    
2707     return ($self->{ct}); # DOCTYPE
2708    
2709     redo A;
2710     } elsif ($self->{nc} == -1) {
2711    
2712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2713     $self->{state} = DATA_STATE;
2714 wakaba 1.5 $self->{s_kwd} = '';
2715 wakaba 1.1 ## reconsume
2716    
2717     $self->{ct}->{quirks} = 1;
2718     return ($self->{ct}); # DOCTYPE
2719    
2720     redo A;
2721     } else {
2722    
2723     $self->{ct}->{name}
2724     .= chr ($self->{nc}); # DOCTYPE
2725     ## Stay in the state
2726    
2727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2728     $self->{line_prev} = $self->{line};
2729     $self->{column_prev} = $self->{column};
2730     $self->{column}++;
2731     $self->{nc}
2732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2733     } else {
2734     $self->{set_nc}->($self);
2735     }
2736    
2737     redo A;
2738     }
2739     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2740     if ($is_space->{$self->{nc}}) {
2741    
2742     ## Stay in the state
2743    
2744     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2745     $self->{line_prev} = $self->{line};
2746     $self->{column_prev} = $self->{column};
2747     $self->{column}++;
2748     $self->{nc}
2749     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2750     } else {
2751     $self->{set_nc}->($self);
2752     }
2753    
2754     redo A;
2755     } elsif ($self->{nc} == 0x003E) { # >
2756    
2757     $self->{state} = DATA_STATE;
2758 wakaba 1.5 $self->{s_kwd} = '';
2759 wakaba 1.1
2760     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2761     $self->{line_prev} = $self->{line};
2762     $self->{column_prev} = $self->{column};
2763     $self->{column}++;
2764     $self->{nc}
2765     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2766     } else {
2767     $self->{set_nc}->($self);
2768     }
2769    
2770    
2771     return ($self->{ct}); # DOCTYPE
2772    
2773     redo A;
2774     } elsif ($self->{nc} == -1) {
2775    
2776     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2777     $self->{state} = DATA_STATE;
2778 wakaba 1.5 $self->{s_kwd} = '';
2779 wakaba 1.1 ## reconsume
2780    
2781     $self->{ct}->{quirks} = 1;
2782     return ($self->{ct}); # DOCTYPE
2783    
2784     redo A;
2785     } elsif ($self->{nc} == 0x0050 or # P
2786     $self->{nc} == 0x0070) { # p
2787     $self->{state} = PUBLIC_STATE;
2788     $self->{s_kwd} = chr $self->{nc};
2789    
2790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2791     $self->{line_prev} = $self->{line};
2792     $self->{column_prev} = $self->{column};
2793     $self->{column}++;
2794     $self->{nc}
2795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2796     } else {
2797     $self->{set_nc}->($self);
2798     }
2799    
2800     redo A;
2801     } elsif ($self->{nc} == 0x0053 or # S
2802     $self->{nc} == 0x0073) { # s
2803     $self->{state} = SYSTEM_STATE;
2804     $self->{s_kwd} = chr $self->{nc};
2805    
2806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2807     $self->{line_prev} = $self->{line};
2808     $self->{column_prev} = $self->{column};
2809     $self->{column}++;
2810     $self->{nc}
2811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2812     } else {
2813     $self->{set_nc}->($self);
2814     }
2815    
2816     redo A;
2817     } else {
2818    
2819     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2820     $self->{ct}->{quirks} = 1;
2821    
2822     $self->{state} = BOGUS_DOCTYPE_STATE;
2823    
2824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2825     $self->{line_prev} = $self->{line};
2826     $self->{column_prev} = $self->{column};
2827     $self->{column}++;
2828     $self->{nc}
2829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2830     } else {
2831     $self->{set_nc}->($self);
2832     }
2833    
2834     redo A;
2835     }
2836     } elsif ($self->{state} == PUBLIC_STATE) {
2837     ## ASCII case-insensitive
2838     if ($self->{nc} == [
2839     undef,
2840     0x0055, # U
2841     0x0042, # B
2842     0x004C, # L
2843     0x0049, # I
2844     ]->[length $self->{s_kwd}] or
2845     $self->{nc} == [
2846     undef,
2847     0x0075, # u
2848     0x0062, # b
2849     0x006C, # l
2850     0x0069, # i
2851     ]->[length $self->{s_kwd}]) {
2852    
2853     ## Stay in the state.
2854     $self->{s_kwd} .= chr $self->{nc};
2855    
2856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2857     $self->{line_prev} = $self->{line};
2858     $self->{column_prev} = $self->{column};
2859     $self->{column}++;
2860     $self->{nc}
2861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2862     } else {
2863     $self->{set_nc}->($self);
2864     }
2865    
2866     redo A;
2867     } elsif ((length $self->{s_kwd}) == 5 and
2868     ($self->{nc} == 0x0043 or # C
2869     $self->{nc} == 0x0063)) { # c
2870    
2871     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2872    
2873     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2874     $self->{line_prev} = $self->{line};
2875     $self->{column_prev} = $self->{column};
2876     $self->{column}++;
2877     $self->{nc}
2878     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2879     } else {
2880     $self->{set_nc}->($self);
2881     }
2882    
2883     redo A;
2884     } else {
2885    
2886     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2887     line => $self->{line_prev},
2888     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2889     $self->{ct}->{quirks} = 1;
2890    
2891     $self->{state} = BOGUS_DOCTYPE_STATE;
2892     ## Reconsume.
2893     redo A;
2894     }
2895     } elsif ($self->{state} == SYSTEM_STATE) {
2896     ## ASCII case-insensitive
2897     if ($self->{nc} == [
2898     undef,
2899     0x0059, # Y
2900     0x0053, # S
2901     0x0054, # T
2902     0x0045, # E
2903     ]->[length $self->{s_kwd}] or
2904     $self->{nc} == [
2905     undef,
2906     0x0079, # y
2907     0x0073, # s
2908     0x0074, # t
2909     0x0065, # e
2910     ]->[length $self->{s_kwd}]) {
2911    
2912     ## Stay in the state.
2913     $self->{s_kwd} .= chr $self->{nc};
2914    
2915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2916     $self->{line_prev} = $self->{line};
2917     $self->{column_prev} = $self->{column};
2918     $self->{column}++;
2919     $self->{nc}
2920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2921     } else {
2922     $self->{set_nc}->($self);
2923     }
2924    
2925     redo A;
2926     } elsif ((length $self->{s_kwd}) == 5 and
2927     ($self->{nc} == 0x004D or # M
2928     $self->{nc} == 0x006D)) { # m
2929    
2930     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2931    
2932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2933     $self->{line_prev} = $self->{line};
2934     $self->{column_prev} = $self->{column};
2935     $self->{column}++;
2936     $self->{nc}
2937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2938     } else {
2939     $self->{set_nc}->($self);
2940     }
2941    
2942     redo A;
2943     } else {
2944    
2945     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2946     line => $self->{line_prev},
2947     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2948     $self->{ct}->{quirks} = 1;
2949    
2950     $self->{state} = BOGUS_DOCTYPE_STATE;
2951     ## Reconsume.
2952     redo A;
2953     }
2954     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2955     if ($is_space->{$self->{nc}}) {
2956    
2957     ## Stay in the state
2958    
2959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2960     $self->{line_prev} = $self->{line};
2961     $self->{column_prev} = $self->{column};
2962     $self->{column}++;
2963     $self->{nc}
2964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2965     } else {
2966     $self->{set_nc}->($self);
2967     }
2968    
2969     redo A;
2970     } elsif ($self->{nc} eq 0x0022) { # "
2971    
2972     $self->{ct}->{pubid} = ''; # DOCTYPE
2973     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2974    
2975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2976     $self->{line_prev} = $self->{line};
2977     $self->{column_prev} = $self->{column};
2978     $self->{column}++;
2979     $self->{nc}
2980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2981     } else {
2982     $self->{set_nc}->($self);
2983     }
2984    
2985     redo A;
2986     } elsif ($self->{nc} eq 0x0027) { # '
2987    
2988     $self->{ct}->{pubid} = ''; # DOCTYPE
2989     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2990    
2991     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2992     $self->{line_prev} = $self->{line};
2993     $self->{column_prev} = $self->{column};
2994     $self->{column}++;
2995     $self->{nc}
2996     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2997     } else {
2998     $self->{set_nc}->($self);
2999     }
3000    
3001     redo A;
3002     } elsif ($self->{nc} eq 0x003E) { # >
3003    
3004     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3005    
3006     $self->{state} = DATA_STATE;
3007 wakaba 1.5 $self->{s_kwd} = '';
3008 wakaba 1.1
3009     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3010     $self->{line_prev} = $self->{line};
3011     $self->{column_prev} = $self->{column};
3012     $self->{column}++;
3013     $self->{nc}
3014     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3015     } else {
3016     $self->{set_nc}->($self);
3017     }
3018    
3019    
3020     $self->{ct}->{quirks} = 1;
3021     return ($self->{ct}); # DOCTYPE
3022    
3023     redo A;
3024     } elsif ($self->{nc} == -1) {
3025    
3026     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3027    
3028     $self->{state} = DATA_STATE;
3029 wakaba 1.5 $self->{s_kwd} = '';
3030 wakaba 1.1 ## reconsume
3031    
3032     $self->{ct}->{quirks} = 1;
3033     return ($self->{ct}); # DOCTYPE
3034    
3035     redo A;
3036     } else {
3037    
3038     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3039     $self->{ct}->{quirks} = 1;
3040    
3041     $self->{state} = BOGUS_DOCTYPE_STATE;
3042    
3043     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3044     $self->{line_prev} = $self->{line};
3045     $self->{column_prev} = $self->{column};
3046     $self->{column}++;
3047     $self->{nc}
3048     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3049     } else {
3050     $self->{set_nc}->($self);
3051     }
3052    
3053     redo A;
3054     }
3055     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3056     if ($self->{nc} == 0x0022) { # "
3057    
3058     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3059    
3060     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3061     $self->{line_prev} = $self->{line};
3062     $self->{column_prev} = $self->{column};
3063     $self->{column}++;
3064     $self->{nc}
3065     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3066     } else {
3067     $self->{set_nc}->($self);
3068     }
3069    
3070     redo A;
3071     } elsif ($self->{nc} == 0x003E) { # >
3072    
3073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3074    
3075     $self->{state} = DATA_STATE;
3076 wakaba 1.5 $self->{s_kwd} = '';
3077 wakaba 1.1
3078     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3079     $self->{line_prev} = $self->{line};
3080     $self->{column_prev} = $self->{column};
3081     $self->{column}++;
3082     $self->{nc}
3083     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3084     } else {
3085     $self->{set_nc}->($self);
3086     }
3087    
3088    
3089     $self->{ct}->{quirks} = 1;
3090     return ($self->{ct}); # DOCTYPE
3091    
3092     redo A;
3093     } elsif ($self->{nc} == -1) {
3094    
3095     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3096    
3097     $self->{state} = DATA_STATE;
3098 wakaba 1.5 $self->{s_kwd} = '';
3099 wakaba 1.1 ## reconsume
3100    
3101     $self->{ct}->{quirks} = 1;
3102     return ($self->{ct}); # DOCTYPE
3103    
3104     redo A;
3105     } else {
3106    
3107     $self->{ct}->{pubid} # DOCTYPE
3108     .= chr $self->{nc};
3109     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3110     length $self->{ct}->{pubid});
3111    
3112     ## Stay in the state
3113    
3114     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3115     $self->{line_prev} = $self->{line};
3116     $self->{column_prev} = $self->{column};
3117     $self->{column}++;
3118     $self->{nc}
3119     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3120     } else {
3121     $self->{set_nc}->($self);
3122     }
3123    
3124     redo A;
3125     }
3126     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3127     if ($self->{nc} == 0x0027) { # '
3128    
3129     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3130    
3131     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3132     $self->{line_prev} = $self->{line};
3133     $self->{column_prev} = $self->{column};
3134     $self->{column}++;
3135     $self->{nc}
3136     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3137     } else {
3138     $self->{set_nc}->($self);
3139     }
3140    
3141     redo A;
3142     } elsif ($self->{nc} == 0x003E) { # >
3143    
3144     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3145    
3146     $self->{state} = DATA_STATE;
3147 wakaba 1.5 $self->{s_kwd} = '';
3148 wakaba 1.1
3149     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3150     $self->{line_prev} = $self->{line};
3151     $self->{column_prev} = $self->{column};
3152     $self->{column}++;
3153     $self->{nc}
3154     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3155     } else {
3156     $self->{set_nc}->($self);
3157     }
3158    
3159    
3160     $self->{ct}->{quirks} = 1;
3161     return ($self->{ct}); # DOCTYPE
3162    
3163     redo A;
3164     } elsif ($self->{nc} == -1) {
3165    
3166     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3167    
3168     $self->{state} = DATA_STATE;
3169 wakaba 1.5 $self->{s_kwd} = '';
3170 wakaba 1.1 ## reconsume
3171    
3172     $self->{ct}->{quirks} = 1;
3173     return ($self->{ct}); # DOCTYPE
3174    
3175     redo A;
3176     } else {
3177    
3178     $self->{ct}->{pubid} # DOCTYPE
3179     .= chr $self->{nc};
3180     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3181     length $self->{ct}->{pubid});
3182    
3183     ## Stay in the state
3184    
3185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3186     $self->{line_prev} = $self->{line};
3187     $self->{column_prev} = $self->{column};
3188     $self->{column}++;
3189     $self->{nc}
3190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3191     } else {
3192     $self->{set_nc}->($self);
3193     }
3194    
3195     redo A;
3196     }
3197     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3198     if ($is_space->{$self->{nc}}) {
3199    
3200     ## Stay in the state
3201    
3202     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3203     $self->{line_prev} = $self->{line};
3204     $self->{column_prev} = $self->{column};
3205     $self->{column}++;
3206     $self->{nc}
3207     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3208     } else {
3209     $self->{set_nc}->($self);
3210     }
3211    
3212     redo A;
3213     } elsif ($self->{nc} == 0x0022) { # "
3214    
3215     $self->{ct}->{sysid} = ''; # DOCTYPE
3216     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3217    
3218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3219     $self->{line_prev} = $self->{line};
3220     $self->{column_prev} = $self->{column};
3221     $self->{column}++;
3222     $self->{nc}
3223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3224     } else {
3225     $self->{set_nc}->($self);
3226     }
3227    
3228     redo A;
3229     } elsif ($self->{nc} == 0x0027) { # '
3230    
3231     $self->{ct}->{sysid} = ''; # DOCTYPE
3232     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3233    
3234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3235     $self->{line_prev} = $self->{line};
3236     $self->{column_prev} = $self->{column};
3237     $self->{column}++;
3238     $self->{nc}
3239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3240     } else {
3241     $self->{set_nc}->($self);
3242     }
3243    
3244     redo A;
3245     } elsif ($self->{nc} == 0x003E) { # >
3246    
3247     $self->{state} = DATA_STATE;
3248 wakaba 1.5 $self->{s_kwd} = '';
3249 wakaba 1.1
3250     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3251     $self->{line_prev} = $self->{line};
3252     $self->{column_prev} = $self->{column};
3253     $self->{column}++;
3254     $self->{nc}
3255     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3256     } else {
3257     $self->{set_nc}->($self);
3258     }
3259    
3260    
3261     return ($self->{ct}); # DOCTYPE
3262    
3263     redo A;
3264     } elsif ($self->{nc} == -1) {
3265    
3266     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3267    
3268     $self->{state} = DATA_STATE;
3269 wakaba 1.5 $self->{s_kwd} = '';
3270 wakaba 1.1 ## reconsume
3271    
3272     $self->{ct}->{quirks} = 1;
3273     return ($self->{ct}); # DOCTYPE
3274    
3275     redo A;
3276     } else {
3277    
3278     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3279     $self->{ct}->{quirks} = 1;
3280    
3281     $self->{state} = BOGUS_DOCTYPE_STATE;
3282    
3283     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3284     $self->{line_prev} = $self->{line};
3285     $self->{column_prev} = $self->{column};
3286     $self->{column}++;
3287     $self->{nc}
3288     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3289     } else {
3290     $self->{set_nc}->($self);
3291     }
3292    
3293     redo A;
3294     }
3295     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3296     if ($is_space->{$self->{nc}}) {
3297    
3298     ## Stay in the state
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311     } elsif ($self->{nc} == 0x0022) { # "
3312    
3313     $self->{ct}->{sysid} = ''; # DOCTYPE
3314     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3315    
3316     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3317     $self->{line_prev} = $self->{line};
3318     $self->{column_prev} = $self->{column};
3319     $self->{column}++;
3320     $self->{nc}
3321     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3322     } else {
3323     $self->{set_nc}->($self);
3324     }
3325    
3326     redo A;
3327     } elsif ($self->{nc} == 0x0027) { # '
3328    
3329     $self->{ct}->{sysid} = ''; # DOCTYPE
3330     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3331    
3332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3333     $self->{line_prev} = $self->{line};
3334     $self->{column_prev} = $self->{column};
3335     $self->{column}++;
3336     $self->{nc}
3337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3338     } else {
3339     $self->{set_nc}->($self);
3340     }
3341    
3342     redo A;
3343     } elsif ($self->{nc} == 0x003E) { # >
3344    
3345     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3346     $self->{state} = DATA_STATE;
3347 wakaba 1.5 $self->{s_kwd} = '';
3348 wakaba 1.1
3349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3350     $self->{line_prev} = $self->{line};
3351     $self->{column_prev} = $self->{column};
3352     $self->{column}++;
3353     $self->{nc}
3354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3355     } else {
3356     $self->{set_nc}->($self);
3357     }
3358    
3359    
3360     $self->{ct}->{quirks} = 1;
3361     return ($self->{ct}); # DOCTYPE
3362    
3363     redo A;
3364     } elsif ($self->{nc} == -1) {
3365    
3366     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3367    
3368     $self->{state} = DATA_STATE;
3369 wakaba 1.5 $self->{s_kwd} = '';
3370 wakaba 1.1 ## reconsume
3371    
3372     $self->{ct}->{quirks} = 1;
3373     return ($self->{ct}); # DOCTYPE
3374    
3375     redo A;
3376     } else {
3377    
3378     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3379     $self->{ct}->{quirks} = 1;
3380    
3381     $self->{state} = BOGUS_DOCTYPE_STATE;
3382    
3383     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3384     $self->{line_prev} = $self->{line};
3385     $self->{column_prev} = $self->{column};
3386     $self->{column}++;
3387     $self->{nc}
3388     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3389     } else {
3390     $self->{set_nc}->($self);
3391     }
3392    
3393     redo A;
3394     }
3395     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3396     if ($self->{nc} == 0x0022) { # "
3397    
3398     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3399    
3400     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3401     $self->{line_prev} = $self->{line};
3402     $self->{column_prev} = $self->{column};
3403     $self->{column}++;
3404     $self->{nc}
3405     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3406     } else {
3407     $self->{set_nc}->($self);
3408     }
3409    
3410     redo A;
3411     } elsif ($self->{nc} == 0x003E) { # >
3412    
3413     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3414    
3415     $self->{state} = DATA_STATE;
3416 wakaba 1.5 $self->{s_kwd} = '';
3417 wakaba 1.1
3418     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3419     $self->{line_prev} = $self->{line};
3420     $self->{column_prev} = $self->{column};
3421     $self->{column}++;
3422     $self->{nc}
3423     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3424     } else {
3425     $self->{set_nc}->($self);
3426     }
3427    
3428    
3429     $self->{ct}->{quirks} = 1;
3430     return ($self->{ct}); # DOCTYPE
3431    
3432     redo A;
3433     } elsif ($self->{nc} == -1) {
3434    
3435     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3436    
3437     $self->{state} = DATA_STATE;
3438 wakaba 1.5 $self->{s_kwd} = '';
3439 wakaba 1.1 ## reconsume
3440    
3441     $self->{ct}->{quirks} = 1;
3442     return ($self->{ct}); # DOCTYPE
3443    
3444     redo A;
3445     } else {
3446    
3447     $self->{ct}->{sysid} # DOCTYPE
3448     .= chr $self->{nc};
3449     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3450     length $self->{ct}->{sysid});
3451    
3452     ## Stay in the state
3453    
3454     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3455     $self->{line_prev} = $self->{line};
3456     $self->{column_prev} = $self->{column};
3457     $self->{column}++;
3458     $self->{nc}
3459     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3460     } else {
3461     $self->{set_nc}->($self);
3462     }
3463    
3464     redo A;
3465     }
3466     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3467     if ($self->{nc} == 0x0027) { # '
3468    
3469     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3470    
3471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3472     $self->{line_prev} = $self->{line};
3473     $self->{column_prev} = $self->{column};
3474     $self->{column}++;
3475     $self->{nc}
3476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3477     } else {
3478     $self->{set_nc}->($self);
3479     }
3480    
3481     redo A;
3482     } elsif ($self->{nc} == 0x003E) { # >
3483    
3484     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3485    
3486     $self->{state} = DATA_STATE;
3487 wakaba 1.5 $self->{s_kwd} = '';
3488 wakaba 1.1
3489     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3490     $self->{line_prev} = $self->{line};
3491     $self->{column_prev} = $self->{column};
3492     $self->{column}++;
3493     $self->{nc}
3494     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3495     } else {
3496     $self->{set_nc}->($self);
3497     }
3498    
3499    
3500     $self->{ct}->{quirks} = 1;
3501     return ($self->{ct}); # DOCTYPE
3502    
3503     redo A;
3504     } elsif ($self->{nc} == -1) {
3505    
3506     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3507    
3508     $self->{state} = DATA_STATE;
3509 wakaba 1.5 $self->{s_kwd} = '';
3510 wakaba 1.1 ## reconsume
3511    
3512     $self->{ct}->{quirks} = 1;
3513     return ($self->{ct}); # DOCTYPE
3514    
3515     redo A;
3516     } else {
3517    
3518     $self->{ct}->{sysid} # DOCTYPE
3519     .= chr $self->{nc};
3520     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3521     length $self->{ct}->{sysid});
3522    
3523     ## Stay in the state
3524    
3525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3526     $self->{line_prev} = $self->{line};
3527     $self->{column_prev} = $self->{column};
3528     $self->{column}++;
3529     $self->{nc}
3530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3531     } else {
3532     $self->{set_nc}->($self);
3533     }
3534    
3535     redo A;
3536     }
3537     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3538     if ($is_space->{$self->{nc}}) {
3539    
3540     ## Stay in the state
3541    
3542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3543     $self->{line_prev} = $self->{line};
3544     $self->{column_prev} = $self->{column};
3545     $self->{column}++;
3546     $self->{nc}
3547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3548     } else {
3549     $self->{set_nc}->($self);
3550     }
3551    
3552     redo A;
3553     } elsif ($self->{nc} == 0x003E) { # >
3554    
3555     $self->{state} = DATA_STATE;
3556 wakaba 1.5 $self->{s_kwd} = '';
3557 wakaba 1.1
3558     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3559     $self->{line_prev} = $self->{line};
3560     $self->{column_prev} = $self->{column};
3561     $self->{column}++;
3562     $self->{nc}
3563     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3564     } else {
3565     $self->{set_nc}->($self);
3566     }
3567    
3568    
3569     return ($self->{ct}); # DOCTYPE
3570    
3571     redo A;
3572     } elsif ($self->{nc} == -1) {
3573    
3574     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3575     $self->{state} = DATA_STATE;
3576 wakaba 1.5 $self->{s_kwd} = '';
3577 wakaba 1.1 ## reconsume
3578    
3579     $self->{ct}->{quirks} = 1;
3580     return ($self->{ct}); # DOCTYPE
3581    
3582     redo A;
3583     } else {
3584    
3585     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3586     #$self->{ct}->{quirks} = 1;
3587    
3588     $self->{state} = BOGUS_DOCTYPE_STATE;
3589    
3590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3591     $self->{line_prev} = $self->{line};
3592     $self->{column_prev} = $self->{column};
3593     $self->{column}++;
3594     $self->{nc}
3595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3596     } else {
3597     $self->{set_nc}->($self);
3598     }
3599    
3600     redo A;
3601     }
3602     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3603     if ($self->{nc} == 0x003E) { # >
3604    
3605     $self->{state} = DATA_STATE;
3606 wakaba 1.5 $self->{s_kwd} = '';
3607 wakaba 1.1
3608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3609     $self->{line_prev} = $self->{line};
3610     $self->{column_prev} = $self->{column};
3611     $self->{column}++;
3612     $self->{nc}
3613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3614     } else {
3615     $self->{set_nc}->($self);
3616     }
3617    
3618    
3619     return ($self->{ct}); # DOCTYPE
3620    
3621     redo A;
3622     } elsif ($self->{nc} == -1) {
3623    
3624     $self->{state} = DATA_STATE;
3625 wakaba 1.5 $self->{s_kwd} = '';
3626 wakaba 1.1 ## reconsume
3627    
3628     return ($self->{ct}); # DOCTYPE
3629    
3630     redo A;
3631     } else {
3632    
3633     my $s = '';
3634     $self->{read_until}->($s, q[>], 0);
3635    
3636     ## Stay in the state
3637    
3638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639     $self->{line_prev} = $self->{line};
3640     $self->{column_prev} = $self->{column};
3641     $self->{column}++;
3642     $self->{nc}
3643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644     } else {
3645     $self->{set_nc}->($self);
3646     }
3647    
3648     redo A;
3649     }
3650     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3651     ## NOTE: "CDATA section state" in the state is jointly implemented
3652     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3653     ## and |CDATA_SECTION_MSE2_STATE|.
3654    
3655     if ($self->{nc} == 0x005D) { # ]
3656    
3657     $self->{state} = CDATA_SECTION_MSE1_STATE;
3658    
3659     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3660     $self->{line_prev} = $self->{line};
3661     $self->{column_prev} = $self->{column};
3662     $self->{column}++;
3663     $self->{nc}
3664     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3665     } else {
3666     $self->{set_nc}->($self);
3667     }
3668    
3669     redo A;
3670     } elsif ($self->{nc} == -1) {
3671 wakaba 1.6 if ($self->{is_xml}) {
3672 wakaba 1.8
3673 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3674 wakaba 1.8 } else {
3675    
3676 wakaba 1.6 }
3677    
3678 wakaba 1.1 $self->{state} = DATA_STATE;
3679 wakaba 1.5 $self->{s_kwd} = '';
3680 wakaba 1.1
3681     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3682     $self->{line_prev} = $self->{line};
3683     $self->{column_prev} = $self->{column};
3684     $self->{column}++;
3685     $self->{nc}
3686     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3687     } else {
3688     $self->{set_nc}->($self);
3689     }
3690    
3691     if (length $self->{ct}->{data}) { # character
3692    
3693     return ($self->{ct}); # character
3694     } else {
3695    
3696     ## No token to emit. $self->{ct} is discarded.
3697     }
3698     redo A;
3699     } else {
3700    
3701     $self->{ct}->{data} .= chr $self->{nc};
3702     $self->{read_until}->($self->{ct}->{data},
3703     q<]>,
3704     length $self->{ct}->{data});
3705    
3706     ## Stay in the state.
3707    
3708     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3709     $self->{line_prev} = $self->{line};
3710     $self->{column_prev} = $self->{column};
3711     $self->{column}++;
3712     $self->{nc}
3713     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3714     } else {
3715     $self->{set_nc}->($self);
3716     }
3717    
3718     redo A;
3719     }
3720    
3721     ## ISSUE: "text tokens" in spec.
3722     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3723     if ($self->{nc} == 0x005D) { # ]
3724    
3725     $self->{state} = CDATA_SECTION_MSE2_STATE;
3726    
3727     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3728     $self->{line_prev} = $self->{line};
3729     $self->{column_prev} = $self->{column};
3730     $self->{column}++;
3731     $self->{nc}
3732     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3733     } else {
3734     $self->{set_nc}->($self);
3735     }
3736    
3737     redo A;
3738     } else {
3739    
3740     $self->{ct}->{data} .= ']';
3741     $self->{state} = CDATA_SECTION_STATE;
3742     ## Reconsume.
3743     redo A;
3744     }
3745     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3746     if ($self->{nc} == 0x003E) { # >
3747     $self->{state} = DATA_STATE;
3748 wakaba 1.5 $self->{s_kwd} = '';
3749 wakaba 1.1
3750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3751     $self->{line_prev} = $self->{line};
3752     $self->{column_prev} = $self->{column};
3753     $self->{column}++;
3754     $self->{nc}
3755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3756     } else {
3757     $self->{set_nc}->($self);
3758     }
3759    
3760     if (length $self->{ct}->{data}) { # character
3761    
3762     return ($self->{ct}); # character
3763     } else {
3764    
3765     ## No token to emit. $self->{ct} is discarded.
3766     }
3767     redo A;
3768     } elsif ($self->{nc} == 0x005D) { # ]
3769     # character
3770     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3771     ## Stay in the state.
3772    
3773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3774     $self->{line_prev} = $self->{line};
3775     $self->{column_prev} = $self->{column};
3776     $self->{column}++;
3777     $self->{nc}
3778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3779     } else {
3780     $self->{set_nc}->($self);
3781     }
3782    
3783     redo A;
3784     } else {
3785    
3786     $self->{ct}->{data} .= ']]'; # character
3787     $self->{state} = CDATA_SECTION_STATE;
3788     ## Reconsume.
3789     redo A;
3790     }
3791     } elsif ($self->{state} == ENTITY_STATE) {
3792     if ($is_space->{$self->{nc}} or
3793     {
3794     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3795     $self->{entity_add} => 1,
3796     }->{$self->{nc}}) {
3797    
3798     ## Don't consume
3799     ## No error
3800     ## Return nothing.
3801     #
3802     } elsif ($self->{nc} == 0x0023) { # #
3803    
3804     $self->{state} = ENTITY_HASH_STATE;
3805     $self->{s_kwd} = '#';
3806    
3807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3808     $self->{line_prev} = $self->{line};
3809     $self->{column_prev} = $self->{column};
3810     $self->{column}++;
3811     $self->{nc}
3812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3813     } else {
3814     $self->{set_nc}->($self);
3815     }
3816    
3817     redo A;
3818     } elsif ((0x0041 <= $self->{nc} and
3819     $self->{nc} <= 0x005A) or # A..Z
3820     (0x0061 <= $self->{nc} and
3821     $self->{nc} <= 0x007A)) { # a..z
3822    
3823     require Whatpm::_NamedEntityList;
3824     $self->{state} = ENTITY_NAME_STATE;
3825     $self->{s_kwd} = chr $self->{nc};
3826     $self->{entity__value} = $self->{s_kwd};
3827     $self->{entity__match} = 0;
3828    
3829     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3830     $self->{line_prev} = $self->{line};
3831     $self->{column_prev} = $self->{column};
3832     $self->{column}++;
3833     $self->{nc}
3834     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3835     } else {
3836     $self->{set_nc}->($self);
3837     }
3838    
3839     redo A;
3840     } else {
3841    
3842     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3843     ## Return nothing.
3844     #
3845     }
3846    
3847     ## NOTE: No character is consumed by the "consume a character
3848     ## reference" algorithm. In other word, there is an "&" character
3849     ## that does not introduce a character reference, which would be
3850     ## appended to the parent element or the attribute value in later
3851     ## process of the tokenizer.
3852    
3853     if ($self->{prev_state} == DATA_STATE) {
3854    
3855     $self->{state} = $self->{prev_state};
3856 wakaba 1.5 $self->{s_kwd} = '';
3857 wakaba 1.1 ## Reconsume.
3858     return ({type => CHARACTER_TOKEN, data => '&',
3859     line => $self->{line_prev},
3860     column => $self->{column_prev},
3861     });
3862     redo A;
3863     } else {
3864    
3865     $self->{ca}->{value} .= '&';
3866     $self->{state} = $self->{prev_state};
3867 wakaba 1.5 $self->{s_kwd} = '';
3868 wakaba 1.1 ## Reconsume.
3869     redo A;
3870     }
3871     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3872     if ($self->{nc} == 0x0078 or # x
3873     $self->{nc} == 0x0058) { # X
3874    
3875     $self->{state} = HEXREF_X_STATE;
3876     $self->{s_kwd} .= chr $self->{nc};
3877    
3878     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3879     $self->{line_prev} = $self->{line};
3880     $self->{column_prev} = $self->{column};
3881     $self->{column}++;
3882     $self->{nc}
3883     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3884     } else {
3885     $self->{set_nc}->($self);
3886     }
3887    
3888     redo A;
3889     } elsif (0x0030 <= $self->{nc} and
3890     $self->{nc} <= 0x0039) { # 0..9
3891    
3892     $self->{state} = NCR_NUM_STATE;
3893     $self->{s_kwd} = $self->{nc} - 0x0030;
3894    
3895     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3896     $self->{line_prev} = $self->{line};
3897     $self->{column_prev} = $self->{column};
3898     $self->{column}++;
3899     $self->{nc}
3900     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3901     } else {
3902     $self->{set_nc}->($self);
3903     }
3904    
3905     redo A;
3906     } else {
3907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3908     line => $self->{line_prev},
3909     column => $self->{column_prev} - 1);
3910    
3911     ## NOTE: According to the spec algorithm, nothing is returned,
3912     ## and then "&#" is appended to the parent element or the attribute
3913     ## value in the later processing.
3914    
3915     if ($self->{prev_state} == DATA_STATE) {
3916    
3917     $self->{state} = $self->{prev_state};
3918 wakaba 1.5 $self->{s_kwd} = '';
3919 wakaba 1.1 ## Reconsume.
3920     return ({type => CHARACTER_TOKEN,
3921     data => '&#',
3922     line => $self->{line_prev},
3923     column => $self->{column_prev} - 1,
3924     });
3925     redo A;
3926     } else {
3927    
3928     $self->{ca}->{value} .= '&#';
3929     $self->{state} = $self->{prev_state};
3930 wakaba 1.5 $self->{s_kwd} = '';
3931 wakaba 1.1 ## Reconsume.
3932     redo A;
3933     }
3934     }
3935     } elsif ($self->{state} == NCR_NUM_STATE) {
3936     if (0x0030 <= $self->{nc} and
3937     $self->{nc} <= 0x0039) { # 0..9
3938    
3939     $self->{s_kwd} *= 10;
3940     $self->{s_kwd} += $self->{nc} - 0x0030;
3941    
3942     ## Stay in the state.
3943    
3944     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3945     $self->{line_prev} = $self->{line};
3946     $self->{column_prev} = $self->{column};
3947     $self->{column}++;
3948     $self->{nc}
3949     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3950     } else {
3951     $self->{set_nc}->($self);
3952     }
3953    
3954     redo A;
3955     } elsif ($self->{nc} == 0x003B) { # ;
3956    
3957    
3958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3959     $self->{line_prev} = $self->{line};
3960     $self->{column_prev} = $self->{column};
3961     $self->{column}++;
3962     $self->{nc}
3963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3964     } else {
3965     $self->{set_nc}->($self);
3966     }
3967    
3968     #
3969     } else {
3970    
3971     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3972     ## Reconsume.
3973     #
3974     }
3975    
3976     my $code = $self->{s_kwd};
3977     my $l = $self->{line_prev};
3978     my $c = $self->{column_prev};
3979     if ($charref_map->{$code}) {
3980    
3981     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3982     text => (sprintf 'U+%04X', $code),
3983     line => $l, column => $c);
3984     $code = $charref_map->{$code};
3985     } elsif ($code > 0x10FFFF) {
3986    
3987     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3988     text => (sprintf 'U-%08X', $code),
3989     line => $l, column => $c);
3990     $code = 0xFFFD;
3991     }
3992    
3993     if ($self->{prev_state} == DATA_STATE) {
3994    
3995     $self->{state} = $self->{prev_state};
3996 wakaba 1.5 $self->{s_kwd} = '';
3997 wakaba 1.1 ## Reconsume.
3998     return ({type => CHARACTER_TOKEN, data => chr $code,
3999 wakaba 1.7 has_reference => 1,
4000 wakaba 1.1 line => $l, column => $c,
4001     });
4002     redo A;
4003     } else {
4004    
4005     $self->{ca}->{value} .= chr $code;
4006     $self->{ca}->{has_reference} = 1;
4007     $self->{state} = $self->{prev_state};
4008 wakaba 1.5 $self->{s_kwd} = '';
4009 wakaba 1.1 ## Reconsume.
4010     redo A;
4011     }
4012     } elsif ($self->{state} == HEXREF_X_STATE) {
4013     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4014     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4015     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4016     # 0..9, A..F, a..f
4017    
4018     $self->{state} = HEXREF_HEX_STATE;
4019     $self->{s_kwd} = 0;
4020     ## Reconsume.
4021     redo A;
4022     } else {
4023     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4024     line => $self->{line_prev},
4025     column => $self->{column_prev} - 2);
4026    
4027     ## NOTE: According to the spec algorithm, nothing is returned,
4028     ## and then "&#" followed by "X" or "x" is appended to the parent
4029     ## element or the attribute value in the later processing.
4030    
4031     if ($self->{prev_state} == DATA_STATE) {
4032    
4033     $self->{state} = $self->{prev_state};
4034 wakaba 1.5 $self->{s_kwd} = '';
4035 wakaba 1.1 ## Reconsume.
4036     return ({type => CHARACTER_TOKEN,
4037     data => '&' . $self->{s_kwd},
4038     line => $self->{line_prev},
4039     column => $self->{column_prev} - length $self->{s_kwd},
4040     });
4041     redo A;
4042     } else {
4043    
4044     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4045     $self->{state} = $self->{prev_state};
4046 wakaba 1.5 $self->{s_kwd} = '';
4047 wakaba 1.1 ## Reconsume.
4048     redo A;
4049     }
4050     }
4051     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4052     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4053     # 0..9
4054    
4055     $self->{s_kwd} *= 0x10;
4056     $self->{s_kwd} += $self->{nc} - 0x0030;
4057     ## Stay in the state.
4058    
4059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4060     $self->{line_prev} = $self->{line};
4061     $self->{column_prev} = $self->{column};
4062     $self->{column}++;
4063     $self->{nc}
4064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4065     } else {
4066     $self->{set_nc}->($self);
4067     }
4068    
4069     redo A;
4070     } elsif (0x0061 <= $self->{nc} and
4071     $self->{nc} <= 0x0066) { # a..f
4072    
4073     $self->{s_kwd} *= 0x10;
4074     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4075     ## Stay in the state.
4076    
4077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4078     $self->{line_prev} = $self->{line};
4079     $self->{column_prev} = $self->{column};
4080     $self->{column}++;
4081     $self->{nc}
4082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4083     } else {
4084     $self->{set_nc}->($self);
4085     }
4086    
4087     redo A;
4088     } elsif (0x0041 <= $self->{nc} and
4089     $self->{nc} <= 0x0046) { # A..F
4090    
4091     $self->{s_kwd} *= 0x10;
4092     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4093     ## Stay in the state.
4094    
4095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4096     $self->{line_prev} = $self->{line};
4097     $self->{column_prev} = $self->{column};
4098     $self->{column}++;
4099     $self->{nc}
4100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4101     } else {
4102     $self->{set_nc}->($self);
4103     }
4104    
4105     redo A;
4106     } elsif ($self->{nc} == 0x003B) { # ;
4107    
4108    
4109     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4110     $self->{line_prev} = $self->{line};
4111     $self->{column_prev} = $self->{column};
4112     $self->{column}++;
4113     $self->{nc}
4114     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4115     } else {
4116     $self->{set_nc}->($self);
4117     }
4118    
4119     #
4120     } else {
4121    
4122     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4123     line => $self->{line},
4124     column => $self->{column});
4125     ## Reconsume.
4126     #
4127     }
4128    
4129     my $code = $self->{s_kwd};
4130     my $l = $self->{line_prev};
4131     my $c = $self->{column_prev};
4132     if ($charref_map->{$code}) {
4133    
4134     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4135     text => (sprintf 'U+%04X', $code),
4136     line => $l, column => $c);
4137     $code = $charref_map->{$code};
4138     } elsif ($code > 0x10FFFF) {
4139    
4140     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4141     text => (sprintf 'U-%08X', $code),
4142     line => $l, column => $c);
4143     $code = 0xFFFD;
4144     }
4145    
4146     if ($self->{prev_state} == DATA_STATE) {
4147    
4148     $self->{state} = $self->{prev_state};
4149 wakaba 1.5 $self->{s_kwd} = '';
4150 wakaba 1.1 ## Reconsume.
4151     return ({type => CHARACTER_TOKEN, data => chr $code,
4152 wakaba 1.7 has_reference => 1,
4153 wakaba 1.1 line => $l, column => $c,
4154     });
4155     redo A;
4156     } else {
4157    
4158     $self->{ca}->{value} .= chr $code;
4159     $self->{ca}->{has_reference} = 1;
4160     $self->{state} = $self->{prev_state};
4161 wakaba 1.5 $self->{s_kwd} = '';
4162 wakaba 1.1 ## Reconsume.
4163     redo A;
4164     }
4165     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4166     if (length $self->{s_kwd} < 30 and
4167     ## NOTE: Some number greater than the maximum length of entity name
4168     ((0x0041 <= $self->{nc} and # a
4169     $self->{nc} <= 0x005A) or # x
4170     (0x0061 <= $self->{nc} and # a
4171     $self->{nc} <= 0x007A) or # z
4172     (0x0030 <= $self->{nc} and # 0
4173     $self->{nc} <= 0x0039) or # 9
4174     $self->{nc} == 0x003B)) { # ;
4175     our $EntityChar;
4176     $self->{s_kwd} .= chr $self->{nc};
4177     if (defined $EntityChar->{$self->{s_kwd}}) {
4178     if ($self->{nc} == 0x003B) { # ;
4179    
4180     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4181     $self->{entity__match} = 1;
4182    
4183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4184     $self->{line_prev} = $self->{line};
4185     $self->{column_prev} = $self->{column};
4186     $self->{column}++;
4187     $self->{nc}
4188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4189     } else {
4190     $self->{set_nc}->($self);
4191     }
4192    
4193     #
4194     } else {
4195    
4196     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4197     $self->{entity__match} = -1;
4198     ## Stay in the state.
4199    
4200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4201     $self->{line_prev} = $self->{line};
4202     $self->{column_prev} = $self->{column};
4203     $self->{column}++;
4204     $self->{nc}
4205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4206     } else {
4207     $self->{set_nc}->($self);
4208     }
4209    
4210     redo A;
4211     }
4212     } else {
4213    
4214     $self->{entity__value} .= chr $self->{nc};
4215     $self->{entity__match} *= 2;
4216     ## Stay in the state.
4217    
4218     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4219     $self->{line_prev} = $self->{line};
4220     $self->{column_prev} = $self->{column};
4221     $self->{column}++;
4222     $self->{nc}
4223     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4224     } else {
4225     $self->{set_nc}->($self);
4226     }
4227    
4228     redo A;
4229     }
4230     }
4231    
4232     my $data;
4233     my $has_ref;
4234     if ($self->{entity__match} > 0) {
4235    
4236     $data = $self->{entity__value};
4237     $has_ref = 1;
4238     #
4239     } elsif ($self->{entity__match} < 0) {
4240     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4241     if ($self->{prev_state} != DATA_STATE and # in attribute
4242     $self->{entity__match} < -1) {
4243    
4244     $data = '&' . $self->{s_kwd};
4245     #
4246     } else {
4247    
4248     $data = $self->{entity__value};
4249     $has_ref = 1;
4250     #
4251     }
4252     } else {
4253    
4254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4255     line => $self->{line_prev},
4256     column => $self->{column_prev} - length $self->{s_kwd});
4257     $data = '&' . $self->{s_kwd};
4258     #
4259     }
4260    
4261     ## NOTE: In these cases, when a character reference is found,
4262     ## it is consumed and a character token is returned, or, otherwise,
4263     ## nothing is consumed and returned, according to the spec algorithm.
4264     ## In this implementation, anything that has been examined by the
4265     ## tokenizer is appended to the parent element or the attribute value
4266     ## as string, either literal string when no character reference or
4267     ## entity-replaced string otherwise, in this stage, since any characters
4268     ## that would not be consumed are appended in the data state or in an
4269     ## appropriate attribute value state anyway.
4270    
4271     if ($self->{prev_state} == DATA_STATE) {
4272    
4273     $self->{state} = $self->{prev_state};
4274 wakaba 1.5 $self->{s_kwd} = '';
4275 wakaba 1.1 ## Reconsume.
4276     return ({type => CHARACTER_TOKEN,
4277     data => $data,
4278 wakaba 1.7 has_reference => $has_ref,
4279 wakaba 1.1 line => $self->{line_prev},
4280     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4281     });
4282     redo A;
4283     } else {
4284    
4285     $self->{ca}->{value} .= $data;
4286     $self->{ca}->{has_reference} = 1 if $has_ref;
4287     $self->{state} = $self->{prev_state};
4288 wakaba 1.5 $self->{s_kwd} = '';
4289 wakaba 1.1 ## Reconsume.
4290     redo A;
4291     }
4292 wakaba 1.8
4293     ## XML-only states
4294    
4295     } elsif ($self->{state} == PI_STATE) {
4296     if ($is_space->{$self->{nc}} or
4297     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4298     $self->{nc} == -1) {
4299     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4300     line => $self->{line_prev},
4301     column => $self->{column_prev}
4302     - 1 * ($self->{nc} != -1));
4303     $self->{state} = BOGUS_COMMENT_STATE;
4304     ## Reconsume.
4305     $self->{ct} = {type => COMMENT_TOKEN,
4306     data => '?',
4307     line => $self->{line_prev},
4308     column => $self->{column_prev}
4309     - 1 * ($self->{nc} != -1),
4310     };
4311     redo A;
4312     } else {
4313     $self->{ct} = {type => PI_TOKEN,
4314     target => chr $self->{nc},
4315     data => '',
4316     line => $self->{line_prev},
4317     column => $self->{column_prev} - 1,
4318     };
4319     $self->{state} = PI_TARGET_STATE;
4320    
4321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4322     $self->{line_prev} = $self->{line};
4323     $self->{column_prev} = $self->{column};
4324     $self->{column}++;
4325     $self->{nc}
4326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4327     } else {
4328     $self->{set_nc}->($self);
4329     }
4330    
4331     redo A;
4332     }
4333     } elsif ($self->{state} == PI_TARGET_STATE) {
4334     if ($is_space->{$self->{nc}}) {
4335     $self->{state} = PI_TARGET_AFTER_STATE;
4336    
4337     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4338     $self->{line_prev} = $self->{line};
4339     $self->{column_prev} = $self->{column};
4340     $self->{column}++;
4341     $self->{nc}
4342     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4343     } else {
4344     $self->{set_nc}->($self);
4345     }
4346    
4347     redo A;
4348     } elsif ($self->{nc} == -1) {
4349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4350     $self->{state} = DATA_STATE;
4351     $self->{s_kwd} = '';
4352     ## Reconsume.
4353     return ($self->{ct}); # pi
4354     redo A;
4355     } elsif ($self->{nc} == 0x003F) { # ?
4356     $self->{state} = PI_AFTER_STATE;
4357    
4358     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4359     $self->{line_prev} = $self->{line};
4360     $self->{column_prev} = $self->{column};
4361     $self->{column}++;
4362     $self->{nc}
4363     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4364     } else {
4365     $self->{set_nc}->($self);
4366     }
4367    
4368     redo A;
4369     } else {
4370     ## XML5: typo ("tag name" -> "target")
4371     $self->{ct}->{target} .= chr $self->{nc}; # pi
4372    
4373     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4374     $self->{line_prev} = $self->{line};
4375     $self->{column_prev} = $self->{column};
4376     $self->{column}++;
4377     $self->{nc}
4378     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4379     } else {
4380     $self->{set_nc}->($self);
4381     }
4382    
4383     redo A;
4384     }
4385     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4386     if ($is_space->{$self->{nc}}) {
4387     ## Stay in the state.
4388    
4389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4390     $self->{line_prev} = $self->{line};
4391     $self->{column_prev} = $self->{column};
4392     $self->{column}++;
4393     $self->{nc}
4394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4395     } else {
4396     $self->{set_nc}->($self);
4397     }
4398    
4399     redo A;
4400     } else {
4401     $self->{state} = PI_DATA_STATE;
4402     ## Reprocess.
4403     redo A;
4404     }
4405     } elsif ($self->{state} == PI_DATA_STATE) {
4406     if ($self->{nc} == 0x003F) { # ?
4407     $self->{state} = PI_DATA_AFTER_STATE;
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     } elsif ($self->{nc} == -1) {
4421     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4422     $self->{state} = DATA_STATE;
4423     $self->{s_kwd} = '';
4424     ## Reprocess.
4425     return ($self->{ct}); # pi
4426     redo A;
4427     } else {
4428     $self->{ct}->{data} .= chr $self->{nc}; # pi
4429     $self->{read_until}->($self->{ct}->{data}, q[?],
4430     length $self->{ct}->{data});
4431     ## Stay in the state.
4432    
4433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4434     $self->{line_prev} = $self->{line};
4435     $self->{column_prev} = $self->{column};
4436     $self->{column}++;
4437     $self->{nc}
4438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4439     } else {
4440     $self->{set_nc}->($self);
4441     }
4442    
4443     ## Reprocess.
4444     redo A;
4445     }
4446     } elsif ($self->{state} == PI_AFTER_STATE) {
4447     if ($self->{nc} == 0x003E) { # >
4448     $self->{state} = DATA_STATE;
4449     $self->{s_kwd} = '';
4450    
4451     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4452     $self->{line_prev} = $self->{line};
4453     $self->{column_prev} = $self->{column};
4454     $self->{column}++;
4455     $self->{nc}
4456     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4457     } else {
4458     $self->{set_nc}->($self);
4459     }
4460    
4461     return ($self->{ct}); # pi
4462     redo A;
4463     } elsif ($self->{nc} == 0x003F) { # ?
4464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4465     line => $self->{line_prev},
4466     column => $self->{column_prev}); ## XML5: no error
4467     $self->{ct}->{data} .= '?';
4468     $self->{state} = PI_DATA_AFTER_STATE;
4469    
4470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4471     $self->{line_prev} = $self->{line};
4472     $self->{column_prev} = $self->{column};
4473     $self->{column}++;
4474     $self->{nc}
4475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4476     } else {
4477     $self->{set_nc}->($self);
4478     }
4479    
4480     redo A;
4481     } else {
4482     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4483     line => $self->{line_prev},
4484     column => $self->{column_prev}
4485     + 1 * ($self->{nc} == -1)); ## XML5: no error
4486     $self->{ct}->{data} .= '?'; ## XML5: not appended
4487     $self->{state} = PI_DATA_STATE;
4488     ## Reprocess.
4489     redo A;
4490     }
4491     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4492     ## XML5: Same as "pi after state" in XML5
4493     if ($self->{nc} == 0x003E) { # >
4494     $self->{state} = DATA_STATE;
4495     $self->{s_kwd} = '';
4496    
4497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4498     $self->{line_prev} = $self->{line};
4499     $self->{column_prev} = $self->{column};
4500     $self->{column}++;
4501     $self->{nc}
4502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4503     } else {
4504     $self->{set_nc}->($self);
4505     }
4506    
4507     return ($self->{ct}); # pi
4508     redo A;
4509     } elsif ($self->{nc} == 0x003F) { # ?
4510     $self->{ct}->{data} .= '?';
4511     ## Stay in the state.
4512    
4513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4514     $self->{line_prev} = $self->{line};
4515     $self->{column_prev} = $self->{column};
4516     $self->{column}++;
4517     $self->{nc}
4518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4519     } else {
4520     $self->{set_nc}->($self);
4521     }
4522    
4523     redo A;
4524     } else {
4525     $self->{ct}->{data} .= '?'; ## XML5: not appended
4526     $self->{state} = PI_DATA_STATE;
4527     ## Reprocess.
4528     redo A;
4529     }
4530    
4531 wakaba 1.1 } else {
4532     die "$0: $self->{state}: Unknown state";
4533     }
4534     } # A
4535    
4536     die "$0: _get_next_token: unexpected case";
4537     } # _get_next_token
4538    
4539     1;
4540 wakaba 1.8 ## $Date: 2008/10/14 15:25:50 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24