/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (hide annotations) (download)
Wed Oct 15 08:51:02 2008 UTC (16 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.9: +84 -21 lines
++ whatpm/t/xml/ChangeLog	15 Oct 2008 08:50:58 -0000
	* doctypes-1.dat: Lowercase <!doctype> test added.

	* elements-1.dat: End tag tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 08:50:06 -0000
	* Tokenizer.pm.src: XML tag name start character support for end
	tags.  Support for the short end tag syntax of XML5.  Raise a
	parse erorr for a lowercase <!doctype> in XML.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: XML tag name start character support for start

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.10 our $VERSION=do{my @r=(q$Revision: 1.9 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
132    
133     my $charref_map = {
134     0x0D => 0x000A,
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if state machine in the spec
178    
179     sub _initialize_tokenizer ($) {
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
202    
203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204     $self->{line_prev} = $self->{line};
205     $self->{column_prev} = $self->{column};
206     $self->{column}++;
207     $self->{nc}
208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209     } else {
210     $self->{set_nc}->($self);
211     }
212    
213     $self->{token} = [];
214     # $self->{escape}
215     } # _initialize_tokenizer
216    
217     ## A token has:
218     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
220     ## ->{name} (DOCTYPE_TOKEN)
221     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222     ## ->{pubid} (DOCTYPE_TOKEN)
223     ## ->{sysid} (DOCTYPE_TOKEN)
224     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
225     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
226     ## ->{name}
227     ## ->{value}
228     ## ->{has_reference} == 1 or 0
229     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
232     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
233     ## while the token is pushed back to the stack.
234    
235     ## Emitted token MUST immediately be handled by the tree construction state.
236    
237     ## Before each step, UA MAY check to see if either one of the scripts in
238     ## "list of scripts that will execute as soon as possible" or the first
239     ## script in the "list of scripts that will execute asynchronously",
240     ## has completed loading. If one has, then it MUST be executed
241     ## and removed from the list.
242    
243     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
244     ## (This requirement was dropped from HTML5 spec, unfortunately.)
245    
246     my $is_space = {
247     0x0009 => 1, # CHARACTER TABULATION (HT)
248     0x000A => 1, # LINE FEED (LF)
249     #0x000B => 0, # LINE TABULATION (VT)
250     0x000C => 1, # FORM FEED (FF)
251     #0x000D => 1, # CARRIAGE RETURN (CR)
252     0x0020 => 1, # SPACE (SP)
253     };
254    
255     sub _get_next_token ($) {
256     my $self = shift;
257    
258     if ($self->{self_closing}) {
259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
260     ## NOTE: The |self_closing| flag is only set by start tag token.
261     ## In addition, when a start tag token is emitted, it is always set to
262     ## |ct|.
263     delete $self->{self_closing};
264     }
265    
266     if (@{$self->{token}}) {
267     $self->{self_closing} = $self->{token}->[0]->{self_closing};
268     return shift @{$self->{token}};
269     }
270    
271     A: {
272     if ($self->{state} == PCDATA_STATE) {
273     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
274    
275     if ($self->{nc} == 0x0026) { # &
276    
277     ## NOTE: In the spec, the tokenizer is switched to the
278     ## "entity data state". In this implementation, the tokenizer
279     ## is switched to the |ENTITY_STATE|, which is an implementation
280     ## of the "consume a character reference" algorithm.
281     $self->{entity_add} = -1;
282     $self->{prev_state} = DATA_STATE;
283     $self->{state} = ENTITY_STATE;
284    
285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
286     $self->{line_prev} = $self->{line};
287     $self->{column_prev} = $self->{column};
288     $self->{column}++;
289     $self->{nc}
290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
291     } else {
292     $self->{set_nc}->($self);
293     }
294    
295     redo A;
296     } elsif ($self->{nc} == 0x003C) { # <
297    
298     $self->{state} = TAG_OPEN_STATE;
299    
300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
301     $self->{line_prev} = $self->{line};
302     $self->{column_prev} = $self->{column};
303     $self->{column}++;
304     $self->{nc}
305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
306     } else {
307     $self->{set_nc}->($self);
308     }
309    
310     redo A;
311     } elsif ($self->{nc} == -1) {
312    
313     return ({type => END_OF_FILE_TOKEN,
314     line => $self->{line}, column => $self->{column}});
315     last A; ## TODO: ok?
316     } else {
317    
318     #
319     }
320    
321     # Anything else
322     my $token = {type => CHARACTER_TOKEN,
323     data => chr $self->{nc},
324     line => $self->{line}, column => $self->{column},
325     };
326     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
327    
328     ## Stay in the state.
329    
330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
331     $self->{line_prev} = $self->{line};
332     $self->{column_prev} = $self->{column};
333     $self->{column}++;
334     $self->{nc}
335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
336     } else {
337     $self->{set_nc}->($self);
338     }
339    
340     return ($token);
341     redo A;
342     } elsif ($self->{state} == DATA_STATE) {
343     $self->{s_kwd} = '' unless defined $self->{s_kwd};
344     if ($self->{nc} == 0x0026) { # &
345     $self->{s_kwd} = '';
346     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
347     not $self->{escape}) {
348    
349     ## NOTE: In the spec, the tokenizer is switched to the
350     ## "entity data state". In this implementation, the tokenizer
351     ## is switched to the |ENTITY_STATE|, which is an implementation
352     ## of the "consume a character reference" algorithm.
353     $self->{entity_add} = -1;
354     $self->{prev_state} = DATA_STATE;
355     $self->{state} = ENTITY_STATE;
356    
357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
358     $self->{line_prev} = $self->{line};
359     $self->{column_prev} = $self->{column};
360     $self->{column}++;
361     $self->{nc}
362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
363     } else {
364     $self->{set_nc}->($self);
365     }
366    
367     redo A;
368     } else {
369    
370     #
371     }
372     } elsif ($self->{nc} == 0x002D) { # -
373     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
375 wakaba 1.1
376     $self->{escape} = 1; # unless $self->{escape};
377     $self->{s_kwd} = '--';
378     #
379 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
380 wakaba 1.1
381     $self->{s_kwd} = '--';
382     #
383 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384    
385     $self->{s_kwd} .= '-';
386     #
387 wakaba 1.1 } else {
388    
389 wakaba 1.5 $self->{s_kwd} = '-';
390 wakaba 1.1 #
391     }
392     }
393    
394     #
395     } elsif ($self->{nc} == 0x0021) { # !
396     if (length $self->{s_kwd}) {
397    
398     $self->{s_kwd} .= '!';
399     #
400     } else {
401    
402     #$self->{s_kwd} = '';
403     #
404     }
405     #
406     } elsif ($self->{nc} == 0x003C) { # <
407     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
408     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
409     not $self->{escape})) {
410    
411     $self->{state} = TAG_OPEN_STATE;
412    
413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
414     $self->{line_prev} = $self->{line};
415     $self->{column_prev} = $self->{column};
416     $self->{column}++;
417     $self->{nc}
418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
419     } else {
420     $self->{set_nc}->($self);
421     }
422    
423     redo A;
424     } else {
425    
426     $self->{s_kwd} = '';
427     #
428     }
429     } elsif ($self->{nc} == 0x003E) { # >
430     if ($self->{escape} and
431     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
432     if ($self->{s_kwd} eq '--') {
433    
434     delete $self->{escape};
435 wakaba 1.5 #
436 wakaba 1.1 } else {
437    
438 wakaba 1.5 #
439 wakaba 1.1 }
440 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441    
442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443     line => $self->{line_prev},
444     column => $self->{column_prev} - 1);
445     #
446 wakaba 1.1 } else {
447    
448 wakaba 1.5 #
449 wakaba 1.1 }
450    
451     $self->{s_kwd} = '';
452     #
453 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
454     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455    
456     $self->{s_kwd} .= ']';
457     } elsif ($self->{s_kwd} eq ']]') {
458    
459     #
460     } else {
461    
462     $self->{s_kwd} = '';
463     }
464     #
465 wakaba 1.1 } elsif ($self->{nc} == -1) {
466    
467     $self->{s_kwd} = '';
468     return ({type => END_OF_FILE_TOKEN,
469     line => $self->{line}, column => $self->{column}});
470     last A; ## TODO: ok?
471     } else {
472    
473     $self->{s_kwd} = '';
474     #
475     }
476    
477     # Anything else
478     my $token = {type => CHARACTER_TOKEN,
479     data => chr $self->{nc},
480     line => $self->{line}, column => $self->{column},
481     };
482 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483 wakaba 1.1 length $token->{data})) {
484     $self->{s_kwd} = '';
485     }
486    
487     ## Stay in the data state.
488 wakaba 1.5 if (not $self->{is_xml} and
489     $self->{content_model} == PCDATA_CONTENT_MODEL) {
490 wakaba 1.1
491     $self->{state} = PCDATA_STATE;
492     } else {
493    
494     ## Stay in the state.
495     }
496    
497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
498     $self->{line_prev} = $self->{line};
499     $self->{column_prev} = $self->{column};
500     $self->{column}++;
501     $self->{nc}
502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
503     } else {
504     $self->{set_nc}->($self);
505     }
506    
507     return ($token);
508     redo A;
509     } elsif ($self->{state} == TAG_OPEN_STATE) {
510 wakaba 1.10 ## XML5: "tag state".
511    
512 wakaba 1.1 if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
513     if ($self->{nc} == 0x002F) { # /
514    
515    
516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
517     $self->{line_prev} = $self->{line};
518     $self->{column_prev} = $self->{column};
519     $self->{column}++;
520     $self->{nc}
521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
522     } else {
523     $self->{set_nc}->($self);
524     }
525    
526     $self->{state} = CLOSE_TAG_OPEN_STATE;
527     redo A;
528     } elsif ($self->{nc} == 0x0021) { # !
529    
530     $self->{s_kwd} = '<' unless $self->{escape};
531     #
532     } else {
533    
534     #
535     }
536    
537     ## reconsume
538     $self->{state} = DATA_STATE;
539 wakaba 1.5 $self->{s_kwd} = '';
540 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
541     line => $self->{line_prev},
542     column => $self->{column_prev},
543     });
544     redo A;
545     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
546     if ($self->{nc} == 0x0021) { # !
547    
548     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
549    
550     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
551     $self->{line_prev} = $self->{line};
552     $self->{column_prev} = $self->{column};
553     $self->{column}++;
554     $self->{nc}
555     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
556     } else {
557     $self->{set_nc}->($self);
558     }
559    
560     redo A;
561     } elsif ($self->{nc} == 0x002F) { # /
562    
563     $self->{state} = CLOSE_TAG_OPEN_STATE;
564    
565     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
566     $self->{line_prev} = $self->{line};
567     $self->{column_prev} = $self->{column};
568     $self->{column}++;
569     $self->{nc}
570     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
571     } else {
572     $self->{set_nc}->($self);
573     }
574    
575     redo A;
576     } elsif (0x0041 <= $self->{nc} and
577     $self->{nc} <= 0x005A) { # A..Z
578    
579     $self->{ct}
580     = {type => START_TAG_TOKEN,
581 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
582 wakaba 1.1 line => $self->{line_prev},
583     column => $self->{column_prev}};
584     $self->{state} = TAG_NAME_STATE;
585    
586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
587     $self->{line_prev} = $self->{line};
588     $self->{column_prev} = $self->{column};
589     $self->{column}++;
590     $self->{nc}
591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
592     } else {
593     $self->{set_nc}->($self);
594     }
595    
596     redo A;
597     } elsif (0x0061 <= $self->{nc} and
598     $self->{nc} <= 0x007A) { # a..z
599    
600     $self->{ct} = {type => START_TAG_TOKEN,
601     tag_name => chr ($self->{nc}),
602     line => $self->{line_prev},
603     column => $self->{column_prev}};
604     $self->{state} = TAG_NAME_STATE;
605    
606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
607     $self->{line_prev} = $self->{line};
608     $self->{column_prev} = $self->{column};
609     $self->{column}++;
610     $self->{nc}
611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
612     } else {
613     $self->{set_nc}->($self);
614     }
615    
616     redo A;
617     } elsif ($self->{nc} == 0x003E) { # >
618    
619     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
620     line => $self->{line_prev},
621     column => $self->{column_prev});
622     $self->{state} = DATA_STATE;
623 wakaba 1.5 $self->{s_kwd} = '';
624 wakaba 1.1
625     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
626     $self->{line_prev} = $self->{line};
627     $self->{column_prev} = $self->{column};
628     $self->{column}++;
629     $self->{nc}
630     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
631     } else {
632     $self->{set_nc}->($self);
633     }
634    
635    
636     return ({type => CHARACTER_TOKEN, data => '<>',
637     line => $self->{line_prev},
638     column => $self->{column_prev},
639     });
640    
641     redo A;
642     } elsif ($self->{nc} == 0x003F) { # ?
643 wakaba 1.8 if ($self->{is_xml}) {
644    
645     $self->{state} = PI_STATE;
646    
647     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
648     $self->{line_prev} = $self->{line};
649     $self->{column_prev} = $self->{column};
650     $self->{column}++;
651     $self->{nc}
652     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
653     } else {
654     $self->{set_nc}->($self);
655     }
656    
657     redo A;
658     } else {
659    
660     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
661     line => $self->{line_prev},
662     column => $self->{column_prev});
663     $self->{state} = BOGUS_COMMENT_STATE;
664     $self->{ct} = {type => COMMENT_TOKEN, data => '',
665     line => $self->{line_prev},
666     column => $self->{column_prev},
667     };
668     ## $self->{nc} is intentionally left as is
669     redo A;
670     }
671 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
672 wakaba 1.1
673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
674     line => $self->{line_prev},
675     column => $self->{column_prev});
676     $self->{state} = DATA_STATE;
677 wakaba 1.5 $self->{s_kwd} = '';
678 wakaba 1.1 ## reconsume
679    
680     return ({type => CHARACTER_TOKEN, data => '<',
681     line => $self->{line_prev},
682     column => $self->{column_prev},
683     });
684    
685     redo A;
686 wakaba 1.9 } else {
687     ## XML5: "<:" is a parse error.
688    
689     $self->{ct} = {type => START_TAG_TOKEN,
690     tag_name => chr ($self->{nc}),
691     line => $self->{line_prev},
692     column => $self->{column_prev}};
693     $self->{state} = TAG_NAME_STATE;
694    
695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
696     $self->{line_prev} = $self->{line};
697     $self->{column_prev} = $self->{column};
698     $self->{column}++;
699     $self->{nc}
700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
701     } else {
702     $self->{set_nc}->($self);
703     }
704    
705     redo A;
706 wakaba 1.1 }
707     } else {
708     die "$0: $self->{content_model} in tag open";
709     }
710     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
711     ## NOTE: The "close tag open state" in the spec is implemented as
712     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
713    
714 wakaba 1.10 ## XML5: "end tag state".
715    
716 wakaba 1.1 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
717     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
718     if (defined $self->{last_stag_name}) {
719     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
720     $self->{s_kwd} = '';
721     ## Reconsume.
722     redo A;
723     } else {
724     ## No start tag token has ever been emitted
725     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
726    
727     $self->{state} = DATA_STATE;
728 wakaba 1.5 $self->{s_kwd} = '';
729 wakaba 1.1 ## Reconsume.
730     return ({type => CHARACTER_TOKEN, data => '</',
731     line => $l, column => $c,
732     });
733     redo A;
734     }
735     }
736    
737     if (0x0041 <= $self->{nc} and
738     $self->{nc} <= 0x005A) { # A..Z
739    
740     $self->{ct}
741     = {type => END_TAG_TOKEN,
742 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
743 wakaba 1.1 line => $l, column => $c};
744     $self->{state} = TAG_NAME_STATE;
745    
746     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
747     $self->{line_prev} = $self->{line};
748     $self->{column_prev} = $self->{column};
749     $self->{column}++;
750     $self->{nc}
751     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
752     } else {
753     $self->{set_nc}->($self);
754     }
755    
756     redo A;
757     } elsif (0x0061 <= $self->{nc} and
758     $self->{nc} <= 0x007A) { # a..z
759    
760     $self->{ct} = {type => END_TAG_TOKEN,
761     tag_name => chr ($self->{nc}),
762     line => $l, column => $c};
763     $self->{state} = TAG_NAME_STATE;
764    
765     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
766     $self->{line_prev} = $self->{line};
767     $self->{column_prev} = $self->{column};
768     $self->{column}++;
769     $self->{nc}
770     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
771     } else {
772     $self->{set_nc}->($self);
773     }
774    
775     redo A;
776     } elsif ($self->{nc} == 0x003E) { # >
777     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
778     line => $self->{line_prev}, ## "<" in "</>"
779     column => $self->{column_prev} - 1);
780     $self->{state} = DATA_STATE;
781 wakaba 1.5 $self->{s_kwd} = '';
782 wakaba 1.10 if ($self->{is_xml}) {
783    
784     ## XML5: No parse error.
785    
786     ## NOTE: This parser raises a parse error, since it supports
787     ## XML1, not XML5.
788    
789     ## NOTE: A short end tag token.
790     my $ct = {type => END_TAG_TOKEN,
791     tag_name => '',
792     line => $self->{line_prev},
793     column => $self->{column_prev} - 1,
794     };
795    
796     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
797     $self->{line_prev} = $self->{line};
798     $self->{column_prev} = $self->{column};
799     $self->{column}++;
800     $self->{nc}
801     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
802     } else {
803     $self->{set_nc}->($self);
804     }
805    
806     return ($ct);
807     } else {
808    
809    
810 wakaba 1.1 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
811     $self->{line_prev} = $self->{line};
812     $self->{column_prev} = $self->{column};
813     $self->{column}++;
814     $self->{nc}
815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
816     } else {
817     $self->{set_nc}->($self);
818     }
819    
820 wakaba 1.10 }
821 wakaba 1.1 redo A;
822     } elsif ($self->{nc} == -1) {
823    
824     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
825 wakaba 1.5 $self->{s_kwd} = '';
826 wakaba 1.1 $self->{state} = DATA_STATE;
827     # reconsume
828    
829     return ({type => CHARACTER_TOKEN, data => '</',
830     line => $l, column => $c,
831     });
832    
833     redo A;
834 wakaba 1.10 } elsif (not $self->{is_xml} or
835     $is_space->{$self->{nc}}) {
836 wakaba 1.1
837 wakaba 1.10 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag',
838     line => $self->{line_prev}, # "<" of "</"
839     column => $self->{column_prev} - 1);
840 wakaba 1.1 $self->{state} = BOGUS_COMMENT_STATE;
841     $self->{ct} = {type => COMMENT_TOKEN, data => '',
842     line => $self->{line_prev}, # "<" of "</"
843     column => $self->{column_prev} - 1,
844     };
845     ## NOTE: $self->{nc} is intentionally left as is.
846     ## Although the "anything else" case of the spec not explicitly
847     ## states that the next input character is to be reconsumed,
848     ## it will be included to the |data| of the comment token
849     ## generated from the bogus end tag, as defined in the
850     ## "bogus comment state" entry.
851     redo A;
852 wakaba 1.10 } else {
853     ## XML5: "</:" is a parse error.
854    
855     $self->{ct} = {type => END_TAG_TOKEN,
856     tag_name => chr ($self->{nc}),
857     line => $l, column => $c};
858     $self->{state} = TAG_NAME_STATE; ## XML5: "end tag name state".
859    
860     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
861     $self->{line_prev} = $self->{line};
862     $self->{column_prev} = $self->{column};
863     $self->{column}++;
864     $self->{nc}
865     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
866     } else {
867     $self->{set_nc}->($self);
868     }
869    
870     redo A;
871 wakaba 1.1 }
872     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
873     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
874     if (length $ch) {
875     my $CH = $ch;
876     $ch =~ tr/a-z/A-Z/;
877     my $nch = chr $self->{nc};
878     if ($nch eq $ch or $nch eq $CH) {
879    
880     ## Stay in the state.
881     $self->{s_kwd} .= $nch;
882    
883     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
884     $self->{line_prev} = $self->{line};
885     $self->{column_prev} = $self->{column};
886     $self->{column}++;
887     $self->{nc}
888     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
889     } else {
890     $self->{set_nc}->($self);
891     }
892    
893     redo A;
894     } else {
895    
896     $self->{state} = DATA_STATE;
897 wakaba 1.5 $self->{s_kwd} = '';
898 wakaba 1.1 ## Reconsume.
899     return ({type => CHARACTER_TOKEN,
900     data => '</' . $self->{s_kwd},
901     line => $self->{line_prev},
902     column => $self->{column_prev} - 1 - length $self->{s_kwd},
903     });
904     redo A;
905     }
906     } else { # after "<{tag-name}"
907     unless ($is_space->{$self->{nc}} or
908     {
909     0x003E => 1, # >
910     0x002F => 1, # /
911     -1 => 1, # EOF
912     }->{$self->{nc}}) {
913    
914     ## Reconsume.
915     $self->{state} = DATA_STATE;
916 wakaba 1.5 $self->{s_kwd} = '';
917 wakaba 1.1 return ({type => CHARACTER_TOKEN,
918     data => '</' . $self->{s_kwd},
919     line => $self->{line_prev},
920     column => $self->{column_prev} - 1 - length $self->{s_kwd},
921     });
922     redo A;
923     } else {
924    
925     $self->{ct}
926     = {type => END_TAG_TOKEN,
927     tag_name => $self->{last_stag_name},
928     line => $self->{line_prev},
929     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
930     $self->{state} = TAG_NAME_STATE;
931     ## Reconsume.
932     redo A;
933     }
934     }
935     } elsif ($self->{state} == TAG_NAME_STATE) {
936     if ($is_space->{$self->{nc}}) {
937    
938     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
939    
940     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
941     $self->{line_prev} = $self->{line};
942     $self->{column_prev} = $self->{column};
943     $self->{column}++;
944     $self->{nc}
945     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
946     } else {
947     $self->{set_nc}->($self);
948     }
949    
950     redo A;
951     } elsif ($self->{nc} == 0x003E) { # >
952     if ($self->{ct}->{type} == START_TAG_TOKEN) {
953    
954     $self->{last_stag_name} = $self->{ct}->{tag_name};
955     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
956     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
957     #if ($self->{ct}->{attributes}) {
958     # ## NOTE: This should never be reached.
959     # !!! cp (36);
960     # !!! parse-error (type => 'end tag attribute');
961     #} else {
962    
963     #}
964     } else {
965     die "$0: $self->{ct}->{type}: Unknown token type";
966     }
967     $self->{state} = DATA_STATE;
968 wakaba 1.5 $self->{s_kwd} = '';
969 wakaba 1.1
970     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
971     $self->{line_prev} = $self->{line};
972     $self->{column_prev} = $self->{column};
973     $self->{column}++;
974     $self->{nc}
975     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
976     } else {
977     $self->{set_nc}->($self);
978     }
979    
980    
981     return ($self->{ct}); # start tag or end tag
982    
983     redo A;
984     } elsif (0x0041 <= $self->{nc} and
985     $self->{nc} <= 0x005A) { # A..Z
986    
987 wakaba 1.4 $self->{ct}->{tag_name}
988     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
989 wakaba 1.1 # start tag or end tag
990     ## Stay in this state
991    
992     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
993     $self->{line_prev} = $self->{line};
994     $self->{column_prev} = $self->{column};
995     $self->{column}++;
996     $self->{nc}
997     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
998     } else {
999     $self->{set_nc}->($self);
1000     }
1001    
1002     redo A;
1003     } elsif ($self->{nc} == -1) {
1004     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1005     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1006    
1007     $self->{last_stag_name} = $self->{ct}->{tag_name};
1008     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1009     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1010     #if ($self->{ct}->{attributes}) {
1011     # ## NOTE: This state should never be reached.
1012     # !!! cp (40);
1013     # !!! parse-error (type => 'end tag attribute');
1014     #} else {
1015    
1016     #}
1017     } else {
1018     die "$0: $self->{ct}->{type}: Unknown token type";
1019     }
1020     $self->{state} = DATA_STATE;
1021 wakaba 1.5 $self->{s_kwd} = '';
1022 wakaba 1.1 # reconsume
1023    
1024     return ($self->{ct}); # start tag or end tag
1025    
1026     redo A;
1027     } elsif ($self->{nc} == 0x002F) { # /
1028    
1029     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1030    
1031     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1032     $self->{line_prev} = $self->{line};
1033     $self->{column_prev} = $self->{column};
1034     $self->{column}++;
1035     $self->{nc}
1036     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1037     } else {
1038     $self->{set_nc}->($self);
1039     }
1040    
1041     redo A;
1042     } else {
1043    
1044     $self->{ct}->{tag_name} .= chr $self->{nc};
1045     # start tag or end tag
1046     ## Stay in the state
1047    
1048     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1049     $self->{line_prev} = $self->{line};
1050     $self->{column_prev} = $self->{column};
1051     $self->{column}++;
1052     $self->{nc}
1053     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1054     } else {
1055     $self->{set_nc}->($self);
1056     }
1057    
1058     redo A;
1059     }
1060     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1061     if ($is_space->{$self->{nc}}) {
1062    
1063     ## Stay in the state
1064    
1065     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1066     $self->{line_prev} = $self->{line};
1067     $self->{column_prev} = $self->{column};
1068     $self->{column}++;
1069     $self->{nc}
1070     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1071     } else {
1072     $self->{set_nc}->($self);
1073     }
1074    
1075     redo A;
1076     } elsif ($self->{nc} == 0x003E) { # >
1077     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1078    
1079     $self->{last_stag_name} = $self->{ct}->{tag_name};
1080     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1081     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1082     if ($self->{ct}->{attributes}) {
1083    
1084     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1085     } else {
1086    
1087     }
1088     } else {
1089     die "$0: $self->{ct}->{type}: Unknown token type";
1090     }
1091     $self->{state} = DATA_STATE;
1092 wakaba 1.5 $self->{s_kwd} = '';
1093 wakaba 1.1
1094     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1095     $self->{line_prev} = $self->{line};
1096     $self->{column_prev} = $self->{column};
1097     $self->{column}++;
1098     $self->{nc}
1099     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1100     } else {
1101     $self->{set_nc}->($self);
1102     }
1103    
1104    
1105     return ($self->{ct}); # start tag or end tag
1106    
1107     redo A;
1108     } elsif (0x0041 <= $self->{nc} and
1109     $self->{nc} <= 0x005A) { # A..Z
1110    
1111     $self->{ca}
1112 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1113 wakaba 1.1 value => '',
1114     line => $self->{line}, column => $self->{column}};
1115     $self->{state} = ATTRIBUTE_NAME_STATE;
1116    
1117     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1118     $self->{line_prev} = $self->{line};
1119     $self->{column_prev} = $self->{column};
1120     $self->{column}++;
1121     $self->{nc}
1122     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1123     } else {
1124     $self->{set_nc}->($self);
1125     }
1126    
1127     redo A;
1128     } elsif ($self->{nc} == 0x002F) { # /
1129    
1130     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1131    
1132     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1133     $self->{line_prev} = $self->{line};
1134     $self->{column_prev} = $self->{column};
1135     $self->{column}++;
1136     $self->{nc}
1137     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1138     } else {
1139     $self->{set_nc}->($self);
1140     }
1141    
1142     redo A;
1143     } elsif ($self->{nc} == -1) {
1144     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1145     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1146    
1147     $self->{last_stag_name} = $self->{ct}->{tag_name};
1148     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1149     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1150     if ($self->{ct}->{attributes}) {
1151    
1152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1153     } else {
1154    
1155     }
1156     } else {
1157     die "$0: $self->{ct}->{type}: Unknown token type";
1158     }
1159     $self->{state} = DATA_STATE;
1160 wakaba 1.5 $self->{s_kwd} = '';
1161 wakaba 1.1 # reconsume
1162    
1163     return ($self->{ct}); # start tag or end tag
1164    
1165     redo A;
1166     } else {
1167     if ({
1168     0x0022 => 1, # "
1169     0x0027 => 1, # '
1170     0x003D => 1, # =
1171     }->{$self->{nc}}) {
1172    
1173     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1174     } else {
1175    
1176     }
1177     $self->{ca}
1178     = {name => chr ($self->{nc}),
1179     value => '',
1180     line => $self->{line}, column => $self->{column}};
1181     $self->{state} = ATTRIBUTE_NAME_STATE;
1182    
1183     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1184     $self->{line_prev} = $self->{line};
1185     $self->{column_prev} = $self->{column};
1186     $self->{column}++;
1187     $self->{nc}
1188     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1189     } else {
1190     $self->{set_nc}->($self);
1191     }
1192    
1193     redo A;
1194     }
1195     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1196     my $before_leave = sub {
1197     if (exists $self->{ct}->{attributes} # start tag or end tag
1198     ->{$self->{ca}->{name}}) { # MUST
1199    
1200     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1201     ## Discard $self->{ca} # MUST
1202     } else {
1203    
1204     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1205     = $self->{ca};
1206     }
1207     }; # $before_leave
1208    
1209     if ($is_space->{$self->{nc}}) {
1210    
1211     $before_leave->();
1212     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1213    
1214     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1215     $self->{line_prev} = $self->{line};
1216     $self->{column_prev} = $self->{column};
1217     $self->{column}++;
1218     $self->{nc}
1219     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1220     } else {
1221     $self->{set_nc}->($self);
1222     }
1223    
1224     redo A;
1225     } elsif ($self->{nc} == 0x003D) { # =
1226    
1227     $before_leave->();
1228     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1229    
1230     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1231     $self->{line_prev} = $self->{line};
1232     $self->{column_prev} = $self->{column};
1233     $self->{column}++;
1234     $self->{nc}
1235     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1236     } else {
1237     $self->{set_nc}->($self);
1238     }
1239    
1240     redo A;
1241     } elsif ($self->{nc} == 0x003E) { # >
1242     $before_leave->();
1243     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1244    
1245     $self->{last_stag_name} = $self->{ct}->{tag_name};
1246     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1247    
1248     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1249     if ($self->{ct}->{attributes}) {
1250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1251     }
1252     } else {
1253     die "$0: $self->{ct}->{type}: Unknown token type";
1254     }
1255     $self->{state} = DATA_STATE;
1256 wakaba 1.5 $self->{s_kwd} = '';
1257 wakaba 1.1
1258     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1259     $self->{line_prev} = $self->{line};
1260     $self->{column_prev} = $self->{column};
1261     $self->{column}++;
1262     $self->{nc}
1263     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1264     } else {
1265     $self->{set_nc}->($self);
1266     }
1267    
1268    
1269     return ($self->{ct}); # start tag or end tag
1270    
1271     redo A;
1272     } elsif (0x0041 <= $self->{nc} and
1273     $self->{nc} <= 0x005A) { # A..Z
1274    
1275 wakaba 1.4 $self->{ca}->{name}
1276     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1277 wakaba 1.1 ## Stay in the state
1278    
1279     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1280     $self->{line_prev} = $self->{line};
1281     $self->{column_prev} = $self->{column};
1282     $self->{column}++;
1283     $self->{nc}
1284     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1285     } else {
1286     $self->{set_nc}->($self);
1287     }
1288    
1289     redo A;
1290     } elsif ($self->{nc} == 0x002F) { # /
1291    
1292     $before_leave->();
1293     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1294    
1295     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1296     $self->{line_prev} = $self->{line};
1297     $self->{column_prev} = $self->{column};
1298     $self->{column}++;
1299     $self->{nc}
1300     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1301     } else {
1302     $self->{set_nc}->($self);
1303     }
1304    
1305     redo A;
1306     } elsif ($self->{nc} == -1) {
1307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1308     $before_leave->();
1309     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1310    
1311     $self->{last_stag_name} = $self->{ct}->{tag_name};
1312     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1313     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1314     if ($self->{ct}->{attributes}) {
1315    
1316     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1317     } else {
1318     ## NOTE: This state should never be reached.
1319    
1320     }
1321     } else {
1322     die "$0: $self->{ct}->{type}: Unknown token type";
1323     }
1324     $self->{state} = DATA_STATE;
1325 wakaba 1.5 $self->{s_kwd} = '';
1326 wakaba 1.1 # reconsume
1327    
1328     return ($self->{ct}); # start tag or end tag
1329    
1330     redo A;
1331     } else {
1332     if ($self->{nc} == 0x0022 or # "
1333     $self->{nc} == 0x0027) { # '
1334    
1335     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1336     } else {
1337    
1338     }
1339     $self->{ca}->{name} .= chr ($self->{nc});
1340     ## Stay in the state
1341    
1342     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1343     $self->{line_prev} = $self->{line};
1344     $self->{column_prev} = $self->{column};
1345     $self->{column}++;
1346     $self->{nc}
1347     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1348     } else {
1349     $self->{set_nc}->($self);
1350     }
1351    
1352     redo A;
1353     }
1354     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1355     if ($is_space->{$self->{nc}}) {
1356    
1357     ## Stay in the state
1358    
1359     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1360     $self->{line_prev} = $self->{line};
1361     $self->{column_prev} = $self->{column};
1362     $self->{column}++;
1363     $self->{nc}
1364     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1365     } else {
1366     $self->{set_nc}->($self);
1367     }
1368    
1369     redo A;
1370     } elsif ($self->{nc} == 0x003D) { # =
1371    
1372     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1373    
1374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1375     $self->{line_prev} = $self->{line};
1376     $self->{column_prev} = $self->{column};
1377     $self->{column}++;
1378     $self->{nc}
1379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1380     } else {
1381     $self->{set_nc}->($self);
1382     }
1383    
1384     redo A;
1385     } elsif ($self->{nc} == 0x003E) { # >
1386     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1387    
1388     $self->{last_stag_name} = $self->{ct}->{tag_name};
1389     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1390     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1391     if ($self->{ct}->{attributes}) {
1392    
1393     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1394     } else {
1395     ## NOTE: This state should never be reached.
1396    
1397     }
1398     } else {
1399     die "$0: $self->{ct}->{type}: Unknown token type";
1400     }
1401     $self->{state} = DATA_STATE;
1402 wakaba 1.5 $self->{s_kwd} = '';
1403 wakaba 1.1
1404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1405     $self->{line_prev} = $self->{line};
1406     $self->{column_prev} = $self->{column};
1407     $self->{column}++;
1408     $self->{nc}
1409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1410     } else {
1411     $self->{set_nc}->($self);
1412     }
1413    
1414    
1415     return ($self->{ct}); # start tag or end tag
1416    
1417     redo A;
1418     } elsif (0x0041 <= $self->{nc} and
1419     $self->{nc} <= 0x005A) { # A..Z
1420    
1421     $self->{ca}
1422 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1423 wakaba 1.1 value => '',
1424     line => $self->{line}, column => $self->{column}};
1425     $self->{state} = ATTRIBUTE_NAME_STATE;
1426    
1427     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1428     $self->{line_prev} = $self->{line};
1429     $self->{column_prev} = $self->{column};
1430     $self->{column}++;
1431     $self->{nc}
1432     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1433     } else {
1434     $self->{set_nc}->($self);
1435     }
1436    
1437     redo A;
1438     } elsif ($self->{nc} == 0x002F) { # /
1439    
1440     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1441    
1442     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1443     $self->{line_prev} = $self->{line};
1444     $self->{column_prev} = $self->{column};
1445     $self->{column}++;
1446     $self->{nc}
1447     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1448     } else {
1449     $self->{set_nc}->($self);
1450     }
1451    
1452     redo A;
1453     } elsif ($self->{nc} == -1) {
1454     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1455     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1456    
1457     $self->{last_stag_name} = $self->{ct}->{tag_name};
1458     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1459     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1460     if ($self->{ct}->{attributes}) {
1461    
1462     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1463     } else {
1464     ## NOTE: This state should never be reached.
1465    
1466     }
1467     } else {
1468     die "$0: $self->{ct}->{type}: Unknown token type";
1469     }
1470 wakaba 1.5 $self->{s_kwd} = '';
1471 wakaba 1.1 $self->{state} = DATA_STATE;
1472     # reconsume
1473    
1474     return ($self->{ct}); # start tag or end tag
1475    
1476     redo A;
1477     } else {
1478     if ($self->{nc} == 0x0022 or # "
1479     $self->{nc} == 0x0027) { # '
1480    
1481     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1482     } else {
1483    
1484     }
1485     $self->{ca}
1486     = {name => chr ($self->{nc}),
1487     value => '',
1488     line => $self->{line}, column => $self->{column}};
1489     $self->{state} = ATTRIBUTE_NAME_STATE;
1490    
1491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1492     $self->{line_prev} = $self->{line};
1493     $self->{column_prev} = $self->{column};
1494     $self->{column}++;
1495     $self->{nc}
1496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1497     } else {
1498     $self->{set_nc}->($self);
1499     }
1500    
1501     redo A;
1502     }
1503     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1504     if ($is_space->{$self->{nc}}) {
1505    
1506     ## Stay in the state
1507    
1508     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1509     $self->{line_prev} = $self->{line};
1510     $self->{column_prev} = $self->{column};
1511     $self->{column}++;
1512     $self->{nc}
1513     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1514     } else {
1515     $self->{set_nc}->($self);
1516     }
1517    
1518     redo A;
1519     } elsif ($self->{nc} == 0x0022) { # "
1520    
1521     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1522    
1523     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1524     $self->{line_prev} = $self->{line};
1525     $self->{column_prev} = $self->{column};
1526     $self->{column}++;
1527     $self->{nc}
1528     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1529     } else {
1530     $self->{set_nc}->($self);
1531     }
1532    
1533     redo A;
1534     } elsif ($self->{nc} == 0x0026) { # &
1535    
1536     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1537     ## reconsume
1538     redo A;
1539     } elsif ($self->{nc} == 0x0027) { # '
1540    
1541     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1542    
1543     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1544     $self->{line_prev} = $self->{line};
1545     $self->{column_prev} = $self->{column};
1546     $self->{column}++;
1547     $self->{nc}
1548     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1549     } else {
1550     $self->{set_nc}->($self);
1551     }
1552    
1553     redo A;
1554     } elsif ($self->{nc} == 0x003E) { # >
1555     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1556     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1557    
1558     $self->{last_stag_name} = $self->{ct}->{tag_name};
1559     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1560     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1561     if ($self->{ct}->{attributes}) {
1562    
1563     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1564     } else {
1565     ## NOTE: This state should never be reached.
1566    
1567     }
1568     } else {
1569     die "$0: $self->{ct}->{type}: Unknown token type";
1570     }
1571     $self->{state} = DATA_STATE;
1572 wakaba 1.5 $self->{s_kwd} = '';
1573 wakaba 1.1
1574     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1575     $self->{line_prev} = $self->{line};
1576     $self->{column_prev} = $self->{column};
1577     $self->{column}++;
1578     $self->{nc}
1579     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1580     } else {
1581     $self->{set_nc}->($self);
1582     }
1583    
1584    
1585     return ($self->{ct}); # start tag or end tag
1586    
1587     redo A;
1588     } elsif ($self->{nc} == -1) {
1589     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1590     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1591    
1592     $self->{last_stag_name} = $self->{ct}->{tag_name};
1593     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1594     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1595     if ($self->{ct}->{attributes}) {
1596    
1597     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1598     } else {
1599     ## NOTE: This state should never be reached.
1600    
1601     }
1602     } else {
1603     die "$0: $self->{ct}->{type}: Unknown token type";
1604     }
1605     $self->{state} = DATA_STATE;
1606 wakaba 1.5 $self->{s_kwd} = '';
1607 wakaba 1.1 ## reconsume
1608    
1609     return ($self->{ct}); # start tag or end tag
1610    
1611     redo A;
1612     } else {
1613     if ($self->{nc} == 0x003D) { # =
1614    
1615     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1616     } else {
1617    
1618     }
1619     $self->{ca}->{value} .= chr ($self->{nc});
1620     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1621    
1622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1623     $self->{line_prev} = $self->{line};
1624     $self->{column_prev} = $self->{column};
1625     $self->{column}++;
1626     $self->{nc}
1627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1628     } else {
1629     $self->{set_nc}->($self);
1630     }
1631    
1632     redo A;
1633     }
1634     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1635     if ($self->{nc} == 0x0022) { # "
1636    
1637     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1638    
1639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1640     $self->{line_prev} = $self->{line};
1641     $self->{column_prev} = $self->{column};
1642     $self->{column}++;
1643     $self->{nc}
1644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1645     } else {
1646     $self->{set_nc}->($self);
1647     }
1648    
1649     redo A;
1650     } elsif ($self->{nc} == 0x0026) { # &
1651    
1652     ## NOTE: In the spec, the tokenizer is switched to the
1653     ## "entity in attribute value state". In this implementation, the
1654     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1655     ## implementation of the "consume a character reference" algorithm.
1656     $self->{prev_state} = $self->{state};
1657     $self->{entity_add} = 0x0022; # "
1658     $self->{state} = ENTITY_STATE;
1659    
1660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1661     $self->{line_prev} = $self->{line};
1662     $self->{column_prev} = $self->{column};
1663     $self->{column}++;
1664     $self->{nc}
1665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1666     } else {
1667     $self->{set_nc}->($self);
1668     }
1669    
1670     redo A;
1671     } elsif ($self->{nc} == -1) {
1672     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1673     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1674    
1675     $self->{last_stag_name} = $self->{ct}->{tag_name};
1676     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1677     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1678     if ($self->{ct}->{attributes}) {
1679    
1680     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1681     } else {
1682     ## NOTE: This state should never be reached.
1683    
1684     }
1685     } else {
1686     die "$0: $self->{ct}->{type}: Unknown token type";
1687     }
1688     $self->{state} = DATA_STATE;
1689 wakaba 1.5 $self->{s_kwd} = '';
1690 wakaba 1.1 ## reconsume
1691    
1692     return ($self->{ct}); # start tag or end tag
1693    
1694     redo A;
1695     } else {
1696    
1697     $self->{ca}->{value} .= chr ($self->{nc});
1698     $self->{read_until}->($self->{ca}->{value},
1699     q["&],
1700     length $self->{ca}->{value});
1701    
1702     ## Stay in the state
1703    
1704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1705     $self->{line_prev} = $self->{line};
1706     $self->{column_prev} = $self->{column};
1707     $self->{column}++;
1708     $self->{nc}
1709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1710     } else {
1711     $self->{set_nc}->($self);
1712     }
1713    
1714     redo A;
1715     }
1716     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1717     if ($self->{nc} == 0x0027) { # '
1718    
1719     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1720    
1721     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1722     $self->{line_prev} = $self->{line};
1723     $self->{column_prev} = $self->{column};
1724     $self->{column}++;
1725     $self->{nc}
1726     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1727     } else {
1728     $self->{set_nc}->($self);
1729     }
1730    
1731     redo A;
1732     } elsif ($self->{nc} == 0x0026) { # &
1733    
1734     ## NOTE: In the spec, the tokenizer is switched to the
1735     ## "entity in attribute value state". In this implementation, the
1736     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1737     ## implementation of the "consume a character reference" algorithm.
1738     $self->{entity_add} = 0x0027; # '
1739     $self->{prev_state} = $self->{state};
1740     $self->{state} = ENTITY_STATE;
1741    
1742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1743     $self->{line_prev} = $self->{line};
1744     $self->{column_prev} = $self->{column};
1745     $self->{column}++;
1746     $self->{nc}
1747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1748     } else {
1749     $self->{set_nc}->($self);
1750     }
1751    
1752     redo A;
1753     } elsif ($self->{nc} == -1) {
1754     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1755     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1756    
1757     $self->{last_stag_name} = $self->{ct}->{tag_name};
1758     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1759     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1760     if ($self->{ct}->{attributes}) {
1761    
1762     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1763     } else {
1764     ## NOTE: This state should never be reached.
1765    
1766     }
1767     } else {
1768     die "$0: $self->{ct}->{type}: Unknown token type";
1769     }
1770     $self->{state} = DATA_STATE;
1771 wakaba 1.5 $self->{s_kwd} = '';
1772 wakaba 1.1 ## reconsume
1773    
1774     return ($self->{ct}); # start tag or end tag
1775    
1776     redo A;
1777     } else {
1778    
1779     $self->{ca}->{value} .= chr ($self->{nc});
1780     $self->{read_until}->($self->{ca}->{value},
1781     q['&],
1782     length $self->{ca}->{value});
1783    
1784     ## Stay in the state
1785    
1786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1787     $self->{line_prev} = $self->{line};
1788     $self->{column_prev} = $self->{column};
1789     $self->{column}++;
1790     $self->{nc}
1791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1792     } else {
1793     $self->{set_nc}->($self);
1794     }
1795    
1796     redo A;
1797     }
1798     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1799     if ($is_space->{$self->{nc}}) {
1800    
1801     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1802    
1803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1804     $self->{line_prev} = $self->{line};
1805     $self->{column_prev} = $self->{column};
1806     $self->{column}++;
1807     $self->{nc}
1808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1809     } else {
1810     $self->{set_nc}->($self);
1811     }
1812    
1813     redo A;
1814     } elsif ($self->{nc} == 0x0026) { # &
1815    
1816     ## NOTE: In the spec, the tokenizer is switched to the
1817     ## "entity in attribute value state". In this implementation, the
1818     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1819     ## implementation of the "consume a character reference" algorithm.
1820     $self->{entity_add} = -1;
1821     $self->{prev_state} = $self->{state};
1822     $self->{state} = ENTITY_STATE;
1823    
1824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1825     $self->{line_prev} = $self->{line};
1826     $self->{column_prev} = $self->{column};
1827     $self->{column}++;
1828     $self->{nc}
1829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1830     } else {
1831     $self->{set_nc}->($self);
1832     }
1833    
1834     redo A;
1835     } elsif ($self->{nc} == 0x003E) { # >
1836     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1837    
1838     $self->{last_stag_name} = $self->{ct}->{tag_name};
1839     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1840     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1841     if ($self->{ct}->{attributes}) {
1842    
1843     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1844     } else {
1845     ## NOTE: This state should never be reached.
1846    
1847     }
1848     } else {
1849     die "$0: $self->{ct}->{type}: Unknown token type";
1850     }
1851     $self->{state} = DATA_STATE;
1852 wakaba 1.5 $self->{s_kwd} = '';
1853 wakaba 1.1
1854     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1855     $self->{line_prev} = $self->{line};
1856     $self->{column_prev} = $self->{column};
1857     $self->{column}++;
1858     $self->{nc}
1859     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1860     } else {
1861     $self->{set_nc}->($self);
1862     }
1863    
1864    
1865     return ($self->{ct}); # start tag or end tag
1866    
1867     redo A;
1868     } elsif ($self->{nc} == -1) {
1869     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1870     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1871    
1872     $self->{last_stag_name} = $self->{ct}->{tag_name};
1873     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1874     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1875     if ($self->{ct}->{attributes}) {
1876    
1877     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1878     } else {
1879     ## NOTE: This state should never be reached.
1880    
1881     }
1882     } else {
1883     die "$0: $self->{ct}->{type}: Unknown token type";
1884     }
1885     $self->{state} = DATA_STATE;
1886 wakaba 1.5 $self->{s_kwd} = '';
1887 wakaba 1.1 ## reconsume
1888    
1889     return ($self->{ct}); # start tag or end tag
1890    
1891     redo A;
1892     } else {
1893     if ({
1894     0x0022 => 1, # "
1895     0x0027 => 1, # '
1896     0x003D => 1, # =
1897     }->{$self->{nc}}) {
1898    
1899     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1900     } else {
1901    
1902     }
1903     $self->{ca}->{value} .= chr ($self->{nc});
1904     $self->{read_until}->($self->{ca}->{value},
1905     q["'=& >],
1906     length $self->{ca}->{value});
1907    
1908     ## Stay in the state
1909    
1910     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1911     $self->{line_prev} = $self->{line};
1912     $self->{column_prev} = $self->{column};
1913     $self->{column}++;
1914     $self->{nc}
1915     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1916     } else {
1917     $self->{set_nc}->($self);
1918     }
1919    
1920     redo A;
1921     }
1922     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1923     if ($is_space->{$self->{nc}}) {
1924    
1925     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1926    
1927     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1928     $self->{line_prev} = $self->{line};
1929     $self->{column_prev} = $self->{column};
1930     $self->{column}++;
1931     $self->{nc}
1932     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1933     } else {
1934     $self->{set_nc}->($self);
1935     }
1936    
1937     redo A;
1938     } elsif ($self->{nc} == 0x003E) { # >
1939     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1940    
1941     $self->{last_stag_name} = $self->{ct}->{tag_name};
1942     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1943     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1944     if ($self->{ct}->{attributes}) {
1945    
1946     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1947     } else {
1948     ## NOTE: This state should never be reached.
1949    
1950     }
1951     } else {
1952     die "$0: $self->{ct}->{type}: Unknown token type";
1953     }
1954     $self->{state} = DATA_STATE;
1955 wakaba 1.5 $self->{s_kwd} = '';
1956 wakaba 1.1
1957     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1958     $self->{line_prev} = $self->{line};
1959     $self->{column_prev} = $self->{column};
1960     $self->{column}++;
1961     $self->{nc}
1962     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1963     } else {
1964     $self->{set_nc}->($self);
1965     }
1966    
1967    
1968     return ($self->{ct}); # start tag or end tag
1969    
1970     redo A;
1971     } elsif ($self->{nc} == 0x002F) { # /
1972    
1973     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1974    
1975     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1976     $self->{line_prev} = $self->{line};
1977     $self->{column_prev} = $self->{column};
1978     $self->{column}++;
1979     $self->{nc}
1980     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1981     } else {
1982     $self->{set_nc}->($self);
1983     }
1984    
1985     redo A;
1986     } elsif ($self->{nc} == -1) {
1987     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1988     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1989    
1990     $self->{last_stag_name} = $self->{ct}->{tag_name};
1991     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1992     if ($self->{ct}->{attributes}) {
1993    
1994     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1995     } else {
1996     ## NOTE: This state should never be reached.
1997    
1998     }
1999     } else {
2000     die "$0: $self->{ct}->{type}: Unknown token type";
2001     }
2002     $self->{state} = DATA_STATE;
2003 wakaba 1.5 $self->{s_kwd} = '';
2004 wakaba 1.1 ## Reconsume.
2005     return ($self->{ct}); # start tag or end tag
2006     redo A;
2007     } else {
2008    
2009     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
2010     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2011     ## reconsume
2012     redo A;
2013     }
2014     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
2015     if ($self->{nc} == 0x003E) { # >
2016     if ($self->{ct}->{type} == END_TAG_TOKEN) {
2017    
2018     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
2019     ## TODO: Different type than slash in start tag
2020     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
2021     if ($self->{ct}->{attributes}) {
2022    
2023     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2024     } else {
2025    
2026     }
2027     ## TODO: Test |<title></title/>|
2028     } else {
2029    
2030     $self->{self_closing} = 1;
2031     }
2032    
2033     $self->{state} = DATA_STATE;
2034 wakaba 1.5 $self->{s_kwd} = '';
2035 wakaba 1.1
2036     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2037     $self->{line_prev} = $self->{line};
2038     $self->{column_prev} = $self->{column};
2039     $self->{column}++;
2040     $self->{nc}
2041     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2042     } else {
2043     $self->{set_nc}->($self);
2044     }
2045    
2046    
2047     return ($self->{ct}); # start tag or end tag
2048    
2049     redo A;
2050     } elsif ($self->{nc} == -1) {
2051     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
2052     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2053    
2054     $self->{last_stag_name} = $self->{ct}->{tag_name};
2055     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2056     if ($self->{ct}->{attributes}) {
2057    
2058     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2059     } else {
2060     ## NOTE: This state should never be reached.
2061    
2062     }
2063     } else {
2064     die "$0: $self->{ct}->{type}: Unknown token type";
2065     }
2066     $self->{state} = DATA_STATE;
2067 wakaba 1.5 $self->{s_kwd} = '';
2068 wakaba 1.1 ## Reconsume.
2069     return ($self->{ct}); # start tag or end tag
2070     redo A;
2071     } else {
2072    
2073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2074     ## TODO: This error type is wrong.
2075     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2076     ## Reconsume.
2077     redo A;
2078     }
2079     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2080     ## (only happen if PCDATA state)
2081    
2082     ## NOTE: Unlike spec's "bogus comment state", this implementation
2083     ## consumes characters one-by-one basis.
2084    
2085     if ($self->{nc} == 0x003E) { # >
2086    
2087     $self->{state} = DATA_STATE;
2088 wakaba 1.5 $self->{s_kwd} = '';
2089 wakaba 1.1
2090     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2091     $self->{line_prev} = $self->{line};
2092     $self->{column_prev} = $self->{column};
2093     $self->{column}++;
2094     $self->{nc}
2095     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2096     } else {
2097     $self->{set_nc}->($self);
2098     }
2099    
2100    
2101     return ($self->{ct}); # comment
2102     redo A;
2103     } elsif ($self->{nc} == -1) {
2104    
2105     $self->{state} = DATA_STATE;
2106 wakaba 1.5 $self->{s_kwd} = '';
2107 wakaba 1.1 ## reconsume
2108    
2109     return ($self->{ct}); # comment
2110     redo A;
2111     } else {
2112    
2113     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2114     $self->{read_until}->($self->{ct}->{data},
2115     q[>],
2116     length $self->{ct}->{data});
2117    
2118     ## Stay in the state.
2119    
2120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2121     $self->{line_prev} = $self->{line};
2122     $self->{column_prev} = $self->{column};
2123     $self->{column}++;
2124     $self->{nc}
2125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2126     } else {
2127     $self->{set_nc}->($self);
2128     }
2129    
2130     redo A;
2131     }
2132     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2133     ## (only happen if PCDATA state)
2134    
2135     if ($self->{nc} == 0x002D) { # -
2136    
2137     $self->{state} = MD_HYPHEN_STATE;
2138    
2139     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2140     $self->{line_prev} = $self->{line};
2141     $self->{column_prev} = $self->{column};
2142     $self->{column}++;
2143     $self->{nc}
2144     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2145     } else {
2146     $self->{set_nc}->($self);
2147     }
2148    
2149     redo A;
2150     } elsif ($self->{nc} == 0x0044 or # D
2151     $self->{nc} == 0x0064) { # d
2152     ## ASCII case-insensitive.
2153    
2154     $self->{state} = MD_DOCTYPE_STATE;
2155     $self->{s_kwd} = chr $self->{nc};
2156    
2157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2158     $self->{line_prev} = $self->{line};
2159     $self->{column_prev} = $self->{column};
2160     $self->{column}++;
2161     $self->{nc}
2162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2163     } else {
2164     $self->{set_nc}->($self);
2165     }
2166    
2167     redo A;
2168 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2169     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2170     $self->{is_xml}) and
2171 wakaba 1.1 $self->{nc} == 0x005B) { # [
2172    
2173     $self->{state} = MD_CDATA_STATE;
2174     $self->{s_kwd} = '[';
2175    
2176     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2177     $self->{line_prev} = $self->{line};
2178     $self->{column_prev} = $self->{column};
2179     $self->{column}++;
2180     $self->{nc}
2181     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2182     } else {
2183     $self->{set_nc}->($self);
2184     }
2185    
2186     redo A;
2187     } else {
2188    
2189     }
2190    
2191     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2192     line => $self->{line_prev},
2193     column => $self->{column_prev} - 1);
2194     ## Reconsume.
2195     $self->{state} = BOGUS_COMMENT_STATE;
2196     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2197     line => $self->{line_prev},
2198     column => $self->{column_prev} - 1,
2199     };
2200     redo A;
2201     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2202     if ($self->{nc} == 0x002D) { # -
2203    
2204     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2205     line => $self->{line_prev},
2206     column => $self->{column_prev} - 2,
2207     };
2208 wakaba 1.10 $self->{state} = COMMENT_START_STATE; ## XML5: "comment state".
2209 wakaba 1.1
2210     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2211     $self->{line_prev} = $self->{line};
2212     $self->{column_prev} = $self->{column};
2213     $self->{column}++;
2214     $self->{nc}
2215     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2216     } else {
2217     $self->{set_nc}->($self);
2218     }
2219    
2220     redo A;
2221     } else {
2222    
2223     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2224     line => $self->{line_prev},
2225     column => $self->{column_prev} - 2);
2226     $self->{state} = BOGUS_COMMENT_STATE;
2227     ## Reconsume.
2228     $self->{ct} = {type => COMMENT_TOKEN,
2229     data => '-',
2230     line => $self->{line_prev},
2231     column => $self->{column_prev} - 2,
2232     };
2233     redo A;
2234     }
2235     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2236     ## ASCII case-insensitive.
2237     if ($self->{nc} == [
2238     undef,
2239     0x004F, # O
2240     0x0043, # C
2241     0x0054, # T
2242     0x0059, # Y
2243     0x0050, # P
2244     ]->[length $self->{s_kwd}] or
2245     $self->{nc} == [
2246     undef,
2247     0x006F, # o
2248     0x0063, # c
2249     0x0074, # t
2250     0x0079, # y
2251     0x0070, # p
2252     ]->[length $self->{s_kwd}]) {
2253    
2254     ## Stay in the state.
2255     $self->{s_kwd} .= chr $self->{nc};
2256    
2257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258     $self->{line_prev} = $self->{line};
2259     $self->{column_prev} = $self->{column};
2260     $self->{column}++;
2261     $self->{nc}
2262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263     } else {
2264     $self->{set_nc}->($self);
2265     }
2266    
2267     redo A;
2268     } elsif ((length $self->{s_kwd}) == 6 and
2269     ($self->{nc} == 0x0045 or # E
2270     $self->{nc} == 0x0065)) { # e
2271 wakaba 1.10 if ($self->{s_kwd} ne 'DOCTYP') {
2272    
2273     ## XML5: case-sensitive.
2274     $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO
2275     text => 'DOCTYPE',
2276     line => $self->{line_prev},
2277     column => $self->{column_prev} - 5);
2278     } else {
2279    
2280     }
2281 wakaba 1.1 $self->{state} = DOCTYPE_STATE;
2282     $self->{ct} = {type => DOCTYPE_TOKEN,
2283     quirks => 1,
2284     line => $self->{line_prev},
2285     column => $self->{column_prev} - 7,
2286     };
2287    
2288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2289     $self->{line_prev} = $self->{line};
2290     $self->{column_prev} = $self->{column};
2291     $self->{column}++;
2292     $self->{nc}
2293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2294     } else {
2295     $self->{set_nc}->($self);
2296     }
2297    
2298     redo A;
2299     } else {
2300    
2301     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2302     line => $self->{line_prev},
2303     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2304     $self->{state} = BOGUS_COMMENT_STATE;
2305     ## Reconsume.
2306     $self->{ct} = {type => COMMENT_TOKEN,
2307     data => $self->{s_kwd},
2308     line => $self->{line_prev},
2309     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2310     };
2311     redo A;
2312     }
2313     } elsif ($self->{state} == MD_CDATA_STATE) {
2314     if ($self->{nc} == {
2315     '[' => 0x0043, # C
2316     '[C' => 0x0044, # D
2317     '[CD' => 0x0041, # A
2318     '[CDA' => 0x0054, # T
2319     '[CDAT' => 0x0041, # A
2320     }->{$self->{s_kwd}}) {
2321    
2322     ## Stay in the state.
2323     $self->{s_kwd} .= chr $self->{nc};
2324    
2325     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2326     $self->{line_prev} = $self->{line};
2327     $self->{column_prev} = $self->{column};
2328     $self->{column}++;
2329     $self->{nc}
2330     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2331     } else {
2332     $self->{set_nc}->($self);
2333     }
2334    
2335     redo A;
2336     } elsif ($self->{s_kwd} eq '[CDATA' and
2337     $self->{nc} == 0x005B) { # [
2338 wakaba 1.6 if ($self->{is_xml} and
2339     not $self->{tainted} and
2340     @{$self->{open_elements} or []} == 0) {
2341 wakaba 1.8
2342 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2343     line => $self->{line_prev},
2344     column => $self->{column_prev} - 7);
2345     $self->{tainted} = 1;
2346 wakaba 1.8 } else {
2347    
2348 wakaba 1.6 }
2349    
2350 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2351     data => '',
2352     line => $self->{line_prev},
2353     column => $self->{column_prev} - 7};
2354     $self->{state} = CDATA_SECTION_STATE;
2355    
2356     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2357     $self->{line_prev} = $self->{line};
2358     $self->{column_prev} = $self->{column};
2359     $self->{column}++;
2360     $self->{nc}
2361     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2362     } else {
2363     $self->{set_nc}->($self);
2364     }
2365    
2366     redo A;
2367     } else {
2368    
2369     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2370     line => $self->{line_prev},
2371     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2372     $self->{state} = BOGUS_COMMENT_STATE;
2373     ## Reconsume.
2374     $self->{ct} = {type => COMMENT_TOKEN,
2375     data => $self->{s_kwd},
2376     line => $self->{line_prev},
2377     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2378     };
2379     redo A;
2380     }
2381     } elsif ($self->{state} == COMMENT_START_STATE) {
2382     if ($self->{nc} == 0x002D) { # -
2383    
2384     $self->{state} = COMMENT_START_DASH_STATE;
2385    
2386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2387     $self->{line_prev} = $self->{line};
2388     $self->{column_prev} = $self->{column};
2389     $self->{column}++;
2390     $self->{nc}
2391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2392     } else {
2393     $self->{set_nc}->($self);
2394     }
2395    
2396     redo A;
2397     } elsif ($self->{nc} == 0x003E) { # >
2398    
2399     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2400     $self->{state} = DATA_STATE;
2401 wakaba 1.5 $self->{s_kwd} = '';
2402 wakaba 1.1
2403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2404     $self->{line_prev} = $self->{line};
2405     $self->{column_prev} = $self->{column};
2406     $self->{column}++;
2407     $self->{nc}
2408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2409     } else {
2410     $self->{set_nc}->($self);
2411     }
2412    
2413    
2414     return ($self->{ct}); # comment
2415    
2416     redo A;
2417     } elsif ($self->{nc} == -1) {
2418    
2419     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2420     $self->{state} = DATA_STATE;
2421 wakaba 1.5 $self->{s_kwd} = '';
2422 wakaba 1.1 ## reconsume
2423    
2424     return ($self->{ct}); # comment
2425    
2426     redo A;
2427     } else {
2428    
2429     $self->{ct}->{data} # comment
2430     .= chr ($self->{nc});
2431     $self->{state} = COMMENT_STATE;
2432    
2433     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2434     $self->{line_prev} = $self->{line};
2435     $self->{column_prev} = $self->{column};
2436     $self->{column}++;
2437     $self->{nc}
2438     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2439     } else {
2440     $self->{set_nc}->($self);
2441     }
2442    
2443     redo A;
2444     }
2445     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2446     if ($self->{nc} == 0x002D) { # -
2447    
2448     $self->{state} = COMMENT_END_STATE;
2449    
2450     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2451     $self->{line_prev} = $self->{line};
2452     $self->{column_prev} = $self->{column};
2453     $self->{column}++;
2454     $self->{nc}
2455     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2456     } else {
2457     $self->{set_nc}->($self);
2458     }
2459    
2460     redo A;
2461     } elsif ($self->{nc} == 0x003E) { # >
2462    
2463     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2464     $self->{state} = DATA_STATE;
2465 wakaba 1.5 $self->{s_kwd} = '';
2466 wakaba 1.1
2467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2468     $self->{line_prev} = $self->{line};
2469     $self->{column_prev} = $self->{column};
2470     $self->{column}++;
2471     $self->{nc}
2472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2473     } else {
2474     $self->{set_nc}->($self);
2475     }
2476    
2477    
2478     return ($self->{ct}); # comment
2479    
2480     redo A;
2481     } elsif ($self->{nc} == -1) {
2482    
2483     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2484     $self->{state} = DATA_STATE;
2485 wakaba 1.5 $self->{s_kwd} = '';
2486 wakaba 1.1 ## reconsume
2487    
2488     return ($self->{ct}); # comment
2489    
2490     redo A;
2491     } else {
2492    
2493     $self->{ct}->{data} # comment
2494     .= '-' . chr ($self->{nc});
2495     $self->{state} = COMMENT_STATE;
2496    
2497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2498     $self->{line_prev} = $self->{line};
2499     $self->{column_prev} = $self->{column};
2500     $self->{column}++;
2501     $self->{nc}
2502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2503     } else {
2504     $self->{set_nc}->($self);
2505     }
2506    
2507     redo A;
2508     }
2509     } elsif ($self->{state} == COMMENT_STATE) {
2510     if ($self->{nc} == 0x002D) { # -
2511    
2512     $self->{state} = COMMENT_END_DASH_STATE;
2513    
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } elsif ($self->{nc} == -1) {
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2528     $self->{state} = DATA_STATE;
2529 wakaba 1.5 $self->{s_kwd} = '';
2530 wakaba 1.1 ## reconsume
2531    
2532     return ($self->{ct}); # comment
2533    
2534     redo A;
2535     } else {
2536    
2537     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2538     $self->{read_until}->($self->{ct}->{data},
2539     q[-],
2540     length $self->{ct}->{data});
2541    
2542     ## Stay in the state
2543    
2544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2545     $self->{line_prev} = $self->{line};
2546     $self->{column_prev} = $self->{column};
2547     $self->{column}++;
2548     $self->{nc}
2549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2550     } else {
2551     $self->{set_nc}->($self);
2552     }
2553    
2554     redo A;
2555     }
2556     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2557 wakaba 1.10 ## XML5: "comment dash state".
2558    
2559 wakaba 1.1 if ($self->{nc} == 0x002D) { # -
2560    
2561     $self->{state} = COMMENT_END_STATE;
2562    
2563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2564     $self->{line_prev} = $self->{line};
2565     $self->{column_prev} = $self->{column};
2566     $self->{column}++;
2567     $self->{nc}
2568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2569     } else {
2570     $self->{set_nc}->($self);
2571     }
2572    
2573     redo A;
2574     } elsif ($self->{nc} == -1) {
2575    
2576     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2577 wakaba 1.5 $self->{s_kwd} = '';
2578 wakaba 1.1 $self->{state} = DATA_STATE;
2579 wakaba 1.5 $self->{s_kwd} = '';
2580 wakaba 1.1 ## reconsume
2581    
2582     return ($self->{ct}); # comment
2583    
2584     redo A;
2585     } else {
2586    
2587     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2588     $self->{state} = COMMENT_STATE;
2589    
2590     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2591     $self->{line_prev} = $self->{line};
2592     $self->{column_prev} = $self->{column};
2593     $self->{column}++;
2594     $self->{nc}
2595     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2596     } else {
2597     $self->{set_nc}->($self);
2598     }
2599    
2600     redo A;
2601     }
2602     } elsif ($self->{state} == COMMENT_END_STATE) {
2603     if ($self->{nc} == 0x003E) { # >
2604    
2605     $self->{state} = DATA_STATE;
2606 wakaba 1.5 $self->{s_kwd} = '';
2607 wakaba 1.1
2608     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2609     $self->{line_prev} = $self->{line};
2610     $self->{column_prev} = $self->{column};
2611     $self->{column}++;
2612     $self->{nc}
2613     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2614     } else {
2615     $self->{set_nc}->($self);
2616     }
2617    
2618    
2619     return ($self->{ct}); # comment
2620    
2621     redo A;
2622     } elsif ($self->{nc} == 0x002D) { # -
2623    
2624 wakaba 1.10 ## XML5: Not a parse error.
2625 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2626     line => $self->{line_prev},
2627     column => $self->{column_prev});
2628     $self->{ct}->{data} .= '-'; # comment
2629     ## Stay in the state
2630    
2631     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2632     $self->{line_prev} = $self->{line};
2633     $self->{column_prev} = $self->{column};
2634     $self->{column}++;
2635     $self->{nc}
2636     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2637     } else {
2638     $self->{set_nc}->($self);
2639     }
2640    
2641     redo A;
2642     } elsif ($self->{nc} == -1) {
2643    
2644     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2645     $self->{state} = DATA_STATE;
2646 wakaba 1.5 $self->{s_kwd} = '';
2647 wakaba 1.1 ## reconsume
2648    
2649     return ($self->{ct}); # comment
2650    
2651     redo A;
2652     } else {
2653    
2654 wakaba 1.10 ## XML5: Not a parse error.
2655 wakaba 1.1 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2656     line => $self->{line_prev},
2657     column => $self->{column_prev});
2658     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2659     $self->{state} = COMMENT_STATE;
2660    
2661     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2662     $self->{line_prev} = $self->{line};
2663     $self->{column_prev} = $self->{column};
2664     $self->{column}++;
2665     $self->{nc}
2666     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2667     } else {
2668     $self->{set_nc}->($self);
2669     }
2670    
2671     redo A;
2672     }
2673     } elsif ($self->{state} == DOCTYPE_STATE) {
2674     if ($is_space->{$self->{nc}}) {
2675    
2676     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2677    
2678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2679     $self->{line_prev} = $self->{line};
2680     $self->{column_prev} = $self->{column};
2681     $self->{column}++;
2682     $self->{nc}
2683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2684     } else {
2685     $self->{set_nc}->($self);
2686     }
2687    
2688     redo A;
2689     } else {
2690    
2691     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2692     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2693     ## reconsume
2694     redo A;
2695     }
2696     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2697     if ($is_space->{$self->{nc}}) {
2698    
2699     ## Stay in the state
2700    
2701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2702     $self->{line_prev} = $self->{line};
2703     $self->{column_prev} = $self->{column};
2704     $self->{column}++;
2705     $self->{nc}
2706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2707     } else {
2708     $self->{set_nc}->($self);
2709     }
2710    
2711     redo A;
2712     } elsif ($self->{nc} == 0x003E) { # >
2713    
2714     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2715     $self->{state} = DATA_STATE;
2716 wakaba 1.5 $self->{s_kwd} = '';
2717 wakaba 1.1
2718     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2719     $self->{line_prev} = $self->{line};
2720     $self->{column_prev} = $self->{column};
2721     $self->{column}++;
2722     $self->{nc}
2723     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2724     } else {
2725     $self->{set_nc}->($self);
2726     }
2727    
2728    
2729     return ($self->{ct}); # DOCTYPE (quirks)
2730    
2731     redo A;
2732     } elsif ($self->{nc} == -1) {
2733    
2734     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2735     $self->{state} = DATA_STATE;
2736 wakaba 1.5 $self->{s_kwd} = '';
2737 wakaba 1.1 ## reconsume
2738    
2739     return ($self->{ct}); # DOCTYPE (quirks)
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{ct}->{name} = chr $self->{nc};
2745     delete $self->{ct}->{quirks};
2746     $self->{state} = DOCTYPE_NAME_STATE;
2747    
2748     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2749     $self->{line_prev} = $self->{line};
2750     $self->{column_prev} = $self->{column};
2751     $self->{column}++;
2752     $self->{nc}
2753     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2754     } else {
2755     $self->{set_nc}->($self);
2756     }
2757    
2758     redo A;
2759     }
2760     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2761     ## ISSUE: Redundant "First," in the spec.
2762     if ($is_space->{$self->{nc}}) {
2763    
2764     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2765    
2766     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2767     $self->{line_prev} = $self->{line};
2768     $self->{column_prev} = $self->{column};
2769     $self->{column}++;
2770     $self->{nc}
2771     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2772     } else {
2773     $self->{set_nc}->($self);
2774     }
2775    
2776     redo A;
2777     } elsif ($self->{nc} == 0x003E) { # >
2778    
2779     $self->{state} = DATA_STATE;
2780 wakaba 1.5 $self->{s_kwd} = '';
2781 wakaba 1.1
2782     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2783     $self->{line_prev} = $self->{line};
2784     $self->{column_prev} = $self->{column};
2785     $self->{column}++;
2786     $self->{nc}
2787     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2788     } else {
2789     $self->{set_nc}->($self);
2790     }
2791    
2792    
2793     return ($self->{ct}); # DOCTYPE
2794    
2795     redo A;
2796     } elsif ($self->{nc} == -1) {
2797    
2798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2799     $self->{state} = DATA_STATE;
2800 wakaba 1.5 $self->{s_kwd} = '';
2801 wakaba 1.1 ## reconsume
2802    
2803     $self->{ct}->{quirks} = 1;
2804     return ($self->{ct}); # DOCTYPE
2805    
2806     redo A;
2807     } else {
2808    
2809     $self->{ct}->{name}
2810     .= chr ($self->{nc}); # DOCTYPE
2811     ## Stay in the state
2812    
2813     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2814     $self->{line_prev} = $self->{line};
2815     $self->{column_prev} = $self->{column};
2816     $self->{column}++;
2817     $self->{nc}
2818     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2819     } else {
2820     $self->{set_nc}->($self);
2821     }
2822    
2823     redo A;
2824     }
2825     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2826     if ($is_space->{$self->{nc}}) {
2827    
2828     ## Stay in the state
2829    
2830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2831     $self->{line_prev} = $self->{line};
2832     $self->{column_prev} = $self->{column};
2833     $self->{column}++;
2834     $self->{nc}
2835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2836     } else {
2837     $self->{set_nc}->($self);
2838     }
2839    
2840     redo A;
2841     } elsif ($self->{nc} == 0x003E) { # >
2842    
2843     $self->{state} = DATA_STATE;
2844 wakaba 1.5 $self->{s_kwd} = '';
2845 wakaba 1.1
2846     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2847     $self->{line_prev} = $self->{line};
2848     $self->{column_prev} = $self->{column};
2849     $self->{column}++;
2850     $self->{nc}
2851     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2852     } else {
2853     $self->{set_nc}->($self);
2854     }
2855    
2856    
2857     return ($self->{ct}); # DOCTYPE
2858    
2859     redo A;
2860     } elsif ($self->{nc} == -1) {
2861    
2862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2863     $self->{state} = DATA_STATE;
2864 wakaba 1.5 $self->{s_kwd} = '';
2865 wakaba 1.1 ## reconsume
2866    
2867     $self->{ct}->{quirks} = 1;
2868     return ($self->{ct}); # DOCTYPE
2869    
2870     redo A;
2871     } elsif ($self->{nc} == 0x0050 or # P
2872     $self->{nc} == 0x0070) { # p
2873     $self->{state} = PUBLIC_STATE;
2874     $self->{s_kwd} = chr $self->{nc};
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     } elsif ($self->{nc} == 0x0053 or # S
2888     $self->{nc} == 0x0073) { # s
2889     $self->{state} = SYSTEM_STATE;
2890     $self->{s_kwd} = chr $self->{nc};
2891    
2892     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2893     $self->{line_prev} = $self->{line};
2894     $self->{column_prev} = $self->{column};
2895     $self->{column}++;
2896     $self->{nc}
2897     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2898     } else {
2899     $self->{set_nc}->($self);
2900     }
2901    
2902     redo A;
2903     } else {
2904    
2905     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2906     $self->{ct}->{quirks} = 1;
2907    
2908     $self->{state} = BOGUS_DOCTYPE_STATE;
2909    
2910     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2911     $self->{line_prev} = $self->{line};
2912     $self->{column_prev} = $self->{column};
2913     $self->{column}++;
2914     $self->{nc}
2915     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2916     } else {
2917     $self->{set_nc}->($self);
2918     }
2919    
2920     redo A;
2921     }
2922     } elsif ($self->{state} == PUBLIC_STATE) {
2923     ## ASCII case-insensitive
2924     if ($self->{nc} == [
2925     undef,
2926     0x0055, # U
2927     0x0042, # B
2928     0x004C, # L
2929     0x0049, # I
2930     ]->[length $self->{s_kwd}] or
2931     $self->{nc} == [
2932     undef,
2933     0x0075, # u
2934     0x0062, # b
2935     0x006C, # l
2936     0x0069, # i
2937     ]->[length $self->{s_kwd}]) {
2938    
2939     ## Stay in the state.
2940     $self->{s_kwd} .= chr $self->{nc};
2941    
2942     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2943     $self->{line_prev} = $self->{line};
2944     $self->{column_prev} = $self->{column};
2945     $self->{column}++;
2946     $self->{nc}
2947     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2948     } else {
2949     $self->{set_nc}->($self);
2950     }
2951    
2952     redo A;
2953     } elsif ((length $self->{s_kwd}) == 5 and
2954     ($self->{nc} == 0x0043 or # C
2955     $self->{nc} == 0x0063)) { # c
2956    
2957     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2958    
2959     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2960     $self->{line_prev} = $self->{line};
2961     $self->{column_prev} = $self->{column};
2962     $self->{column}++;
2963     $self->{nc}
2964     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2965     } else {
2966     $self->{set_nc}->($self);
2967     }
2968    
2969     redo A;
2970     } else {
2971    
2972     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2973     line => $self->{line_prev},
2974     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2975     $self->{ct}->{quirks} = 1;
2976    
2977     $self->{state} = BOGUS_DOCTYPE_STATE;
2978     ## Reconsume.
2979     redo A;
2980     }
2981     } elsif ($self->{state} == SYSTEM_STATE) {
2982     ## ASCII case-insensitive
2983     if ($self->{nc} == [
2984     undef,
2985     0x0059, # Y
2986     0x0053, # S
2987     0x0054, # T
2988     0x0045, # E
2989     ]->[length $self->{s_kwd}] or
2990     $self->{nc} == [
2991     undef,
2992     0x0079, # y
2993     0x0073, # s
2994     0x0074, # t
2995     0x0065, # e
2996     ]->[length $self->{s_kwd}]) {
2997    
2998     ## Stay in the state.
2999     $self->{s_kwd} .= chr $self->{nc};
3000    
3001     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3002     $self->{line_prev} = $self->{line};
3003     $self->{column_prev} = $self->{column};
3004     $self->{column}++;
3005     $self->{nc}
3006     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3007     } else {
3008     $self->{set_nc}->($self);
3009     }
3010    
3011     redo A;
3012     } elsif ((length $self->{s_kwd}) == 5 and
3013     ($self->{nc} == 0x004D or # M
3014     $self->{nc} == 0x006D)) { # m
3015    
3016     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3017    
3018     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3019     $self->{line_prev} = $self->{line};
3020     $self->{column_prev} = $self->{column};
3021     $self->{column}++;
3022     $self->{nc}
3023     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3024     } else {
3025     $self->{set_nc}->($self);
3026     }
3027    
3028     redo A;
3029     } else {
3030    
3031     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
3032     line => $self->{line_prev},
3033     column => $self->{column_prev} + 1 - length $self->{s_kwd});
3034     $self->{ct}->{quirks} = 1;
3035    
3036     $self->{state} = BOGUS_DOCTYPE_STATE;
3037     ## Reconsume.
3038     redo A;
3039     }
3040     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3041     if ($is_space->{$self->{nc}}) {
3042    
3043     ## Stay in the state
3044    
3045     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3046     $self->{line_prev} = $self->{line};
3047     $self->{column_prev} = $self->{column};
3048     $self->{column}++;
3049     $self->{nc}
3050     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3051     } else {
3052     $self->{set_nc}->($self);
3053     }
3054    
3055     redo A;
3056     } elsif ($self->{nc} eq 0x0022) { # "
3057    
3058     $self->{ct}->{pubid} = ''; # DOCTYPE
3059     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
3060    
3061     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3062     $self->{line_prev} = $self->{line};
3063     $self->{column_prev} = $self->{column};
3064     $self->{column}++;
3065     $self->{nc}
3066     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3067     } else {
3068     $self->{set_nc}->($self);
3069     }
3070    
3071     redo A;
3072     } elsif ($self->{nc} eq 0x0027) { # '
3073    
3074     $self->{ct}->{pubid} = ''; # DOCTYPE
3075     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3076    
3077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3078     $self->{line_prev} = $self->{line};
3079     $self->{column_prev} = $self->{column};
3080     $self->{column}++;
3081     $self->{nc}
3082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3083     } else {
3084     $self->{set_nc}->($self);
3085     }
3086    
3087     redo A;
3088     } elsif ($self->{nc} eq 0x003E) { # >
3089    
3090     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3091    
3092     $self->{state} = DATA_STATE;
3093 wakaba 1.5 $self->{s_kwd} = '';
3094 wakaba 1.1
3095     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3096     $self->{line_prev} = $self->{line};
3097     $self->{column_prev} = $self->{column};
3098     $self->{column}++;
3099     $self->{nc}
3100     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3101     } else {
3102     $self->{set_nc}->($self);
3103     }
3104    
3105    
3106     $self->{ct}->{quirks} = 1;
3107     return ($self->{ct}); # DOCTYPE
3108    
3109     redo A;
3110     } elsif ($self->{nc} == -1) {
3111    
3112     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3113    
3114     $self->{state} = DATA_STATE;
3115 wakaba 1.5 $self->{s_kwd} = '';
3116 wakaba 1.1 ## reconsume
3117    
3118     $self->{ct}->{quirks} = 1;
3119     return ($self->{ct}); # DOCTYPE
3120    
3121     redo A;
3122     } else {
3123    
3124     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3125     $self->{ct}->{quirks} = 1;
3126    
3127     $self->{state} = BOGUS_DOCTYPE_STATE;
3128    
3129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3130     $self->{line_prev} = $self->{line};
3131     $self->{column_prev} = $self->{column};
3132     $self->{column}++;
3133     $self->{nc}
3134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3135     } else {
3136     $self->{set_nc}->($self);
3137     }
3138    
3139     redo A;
3140     }
3141     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3142     if ($self->{nc} == 0x0022) { # "
3143    
3144     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3145    
3146     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3147     $self->{line_prev} = $self->{line};
3148     $self->{column_prev} = $self->{column};
3149     $self->{column}++;
3150     $self->{nc}
3151     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3152     } else {
3153     $self->{set_nc}->($self);
3154     }
3155    
3156     redo A;
3157     } elsif ($self->{nc} == 0x003E) { # >
3158    
3159     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3160    
3161     $self->{state} = DATA_STATE;
3162 wakaba 1.5 $self->{s_kwd} = '';
3163 wakaba 1.1
3164     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3165     $self->{line_prev} = $self->{line};
3166     $self->{column_prev} = $self->{column};
3167     $self->{column}++;
3168     $self->{nc}
3169     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3170     } else {
3171     $self->{set_nc}->($self);
3172     }
3173    
3174    
3175     $self->{ct}->{quirks} = 1;
3176     return ($self->{ct}); # DOCTYPE
3177    
3178     redo A;
3179     } elsif ($self->{nc} == -1) {
3180    
3181     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3182    
3183     $self->{state} = DATA_STATE;
3184 wakaba 1.5 $self->{s_kwd} = '';
3185 wakaba 1.1 ## reconsume
3186    
3187     $self->{ct}->{quirks} = 1;
3188     return ($self->{ct}); # DOCTYPE
3189    
3190     redo A;
3191     } else {
3192    
3193     $self->{ct}->{pubid} # DOCTYPE
3194     .= chr $self->{nc};
3195     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3196     length $self->{ct}->{pubid});
3197    
3198     ## Stay in the state
3199    
3200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3201     $self->{line_prev} = $self->{line};
3202     $self->{column_prev} = $self->{column};
3203     $self->{column}++;
3204     $self->{nc}
3205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3206     } else {
3207     $self->{set_nc}->($self);
3208     }
3209    
3210     redo A;
3211     }
3212     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3213     if ($self->{nc} == 0x0027) { # '
3214    
3215     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3216    
3217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3218     $self->{line_prev} = $self->{line};
3219     $self->{column_prev} = $self->{column};
3220     $self->{column}++;
3221     $self->{nc}
3222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3223     } else {
3224     $self->{set_nc}->($self);
3225     }
3226    
3227     redo A;
3228     } elsif ($self->{nc} == 0x003E) { # >
3229    
3230     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3231    
3232     $self->{state} = DATA_STATE;
3233 wakaba 1.5 $self->{s_kwd} = '';
3234 wakaba 1.1
3235     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3236     $self->{line_prev} = $self->{line};
3237     $self->{column_prev} = $self->{column};
3238     $self->{column}++;
3239     $self->{nc}
3240     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3241     } else {
3242     $self->{set_nc}->($self);
3243     }
3244    
3245    
3246     $self->{ct}->{quirks} = 1;
3247     return ($self->{ct}); # DOCTYPE
3248    
3249     redo A;
3250     } elsif ($self->{nc} == -1) {
3251    
3252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3253    
3254     $self->{state} = DATA_STATE;
3255 wakaba 1.5 $self->{s_kwd} = '';
3256 wakaba 1.1 ## reconsume
3257    
3258     $self->{ct}->{quirks} = 1;
3259     return ($self->{ct}); # DOCTYPE
3260    
3261     redo A;
3262     } else {
3263    
3264     $self->{ct}->{pubid} # DOCTYPE
3265     .= chr $self->{nc};
3266     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3267     length $self->{ct}->{pubid});
3268    
3269     ## Stay in the state
3270    
3271     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3272     $self->{line_prev} = $self->{line};
3273     $self->{column_prev} = $self->{column};
3274     $self->{column}++;
3275     $self->{nc}
3276     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3277     } else {
3278     $self->{set_nc}->($self);
3279     }
3280    
3281     redo A;
3282     }
3283     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3284     if ($is_space->{$self->{nc}}) {
3285    
3286     ## Stay in the state
3287    
3288     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3289     $self->{line_prev} = $self->{line};
3290     $self->{column_prev} = $self->{column};
3291     $self->{column}++;
3292     $self->{nc}
3293     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3294     } else {
3295     $self->{set_nc}->($self);
3296     }
3297    
3298     redo A;
3299     } elsif ($self->{nc} == 0x0022) { # "
3300    
3301     $self->{ct}->{sysid} = ''; # DOCTYPE
3302     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3303    
3304     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3305     $self->{line_prev} = $self->{line};
3306     $self->{column_prev} = $self->{column};
3307     $self->{column}++;
3308     $self->{nc}
3309     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3310     } else {
3311     $self->{set_nc}->($self);
3312     }
3313    
3314     redo A;
3315     } elsif ($self->{nc} == 0x0027) { # '
3316    
3317     $self->{ct}->{sysid} = ''; # DOCTYPE
3318     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3319    
3320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3321     $self->{line_prev} = $self->{line};
3322     $self->{column_prev} = $self->{column};
3323     $self->{column}++;
3324     $self->{nc}
3325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3326     } else {
3327     $self->{set_nc}->($self);
3328     }
3329    
3330     redo A;
3331     } elsif ($self->{nc} == 0x003E) { # >
3332    
3333     $self->{state} = DATA_STATE;
3334 wakaba 1.5 $self->{s_kwd} = '';
3335 wakaba 1.1
3336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3337     $self->{line_prev} = $self->{line};
3338     $self->{column_prev} = $self->{column};
3339     $self->{column}++;
3340     $self->{nc}
3341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3342     } else {
3343     $self->{set_nc}->($self);
3344     }
3345    
3346    
3347     return ($self->{ct}); # DOCTYPE
3348    
3349     redo A;
3350     } elsif ($self->{nc} == -1) {
3351    
3352     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3353    
3354     $self->{state} = DATA_STATE;
3355 wakaba 1.5 $self->{s_kwd} = '';
3356 wakaba 1.1 ## reconsume
3357    
3358     $self->{ct}->{quirks} = 1;
3359     return ($self->{ct}); # DOCTYPE
3360    
3361     redo A;
3362     } else {
3363    
3364     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3365     $self->{ct}->{quirks} = 1;
3366    
3367     $self->{state} = BOGUS_DOCTYPE_STATE;
3368    
3369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370     $self->{line_prev} = $self->{line};
3371     $self->{column_prev} = $self->{column};
3372     $self->{column}++;
3373     $self->{nc}
3374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375     } else {
3376     $self->{set_nc}->($self);
3377     }
3378    
3379     redo A;
3380     }
3381     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3382     if ($is_space->{$self->{nc}}) {
3383    
3384     ## Stay in the state
3385    
3386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3387     $self->{line_prev} = $self->{line};
3388     $self->{column_prev} = $self->{column};
3389     $self->{column}++;
3390     $self->{nc}
3391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3392     } else {
3393     $self->{set_nc}->($self);
3394     }
3395    
3396     redo A;
3397     } elsif ($self->{nc} == 0x0022) { # "
3398    
3399     $self->{ct}->{sysid} = ''; # DOCTYPE
3400     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3401    
3402     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3403     $self->{line_prev} = $self->{line};
3404     $self->{column_prev} = $self->{column};
3405     $self->{column}++;
3406     $self->{nc}
3407     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3408     } else {
3409     $self->{set_nc}->($self);
3410     }
3411    
3412     redo A;
3413     } elsif ($self->{nc} == 0x0027) { # '
3414    
3415     $self->{ct}->{sysid} = ''; # DOCTYPE
3416     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3417    
3418     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3419     $self->{line_prev} = $self->{line};
3420     $self->{column_prev} = $self->{column};
3421     $self->{column}++;
3422     $self->{nc}
3423     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3424     } else {
3425     $self->{set_nc}->($self);
3426     }
3427    
3428     redo A;
3429     } elsif ($self->{nc} == 0x003E) { # >
3430    
3431     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3432     $self->{state} = DATA_STATE;
3433 wakaba 1.5 $self->{s_kwd} = '';
3434 wakaba 1.1
3435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3436     $self->{line_prev} = $self->{line};
3437     $self->{column_prev} = $self->{column};
3438     $self->{column}++;
3439     $self->{nc}
3440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3441     } else {
3442     $self->{set_nc}->($self);
3443     }
3444    
3445    
3446     $self->{ct}->{quirks} = 1;
3447     return ($self->{ct}); # DOCTYPE
3448    
3449     redo A;
3450     } elsif ($self->{nc} == -1) {
3451    
3452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3453    
3454     $self->{state} = DATA_STATE;
3455 wakaba 1.5 $self->{s_kwd} = '';
3456 wakaba 1.1 ## reconsume
3457    
3458     $self->{ct}->{quirks} = 1;
3459     return ($self->{ct}); # DOCTYPE
3460    
3461     redo A;
3462     } else {
3463    
3464     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3465     $self->{ct}->{quirks} = 1;
3466    
3467     $self->{state} = BOGUS_DOCTYPE_STATE;
3468    
3469     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3470     $self->{line_prev} = $self->{line};
3471     $self->{column_prev} = $self->{column};
3472     $self->{column}++;
3473     $self->{nc}
3474     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3475     } else {
3476     $self->{set_nc}->($self);
3477     }
3478    
3479     redo A;
3480     }
3481     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3482     if ($self->{nc} == 0x0022) { # "
3483    
3484     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3485    
3486     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3487     $self->{line_prev} = $self->{line};
3488     $self->{column_prev} = $self->{column};
3489     $self->{column}++;
3490     $self->{nc}
3491     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3492     } else {
3493     $self->{set_nc}->($self);
3494     }
3495    
3496     redo A;
3497     } elsif ($self->{nc} == 0x003E) { # >
3498    
3499     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3500    
3501     $self->{state} = DATA_STATE;
3502 wakaba 1.5 $self->{s_kwd} = '';
3503 wakaba 1.1
3504     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3505     $self->{line_prev} = $self->{line};
3506     $self->{column_prev} = $self->{column};
3507     $self->{column}++;
3508     $self->{nc}
3509     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3510     } else {
3511     $self->{set_nc}->($self);
3512     }
3513    
3514    
3515     $self->{ct}->{quirks} = 1;
3516     return ($self->{ct}); # DOCTYPE
3517    
3518     redo A;
3519     } elsif ($self->{nc} == -1) {
3520    
3521     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3522    
3523     $self->{state} = DATA_STATE;
3524 wakaba 1.5 $self->{s_kwd} = '';
3525 wakaba 1.1 ## reconsume
3526    
3527     $self->{ct}->{quirks} = 1;
3528     return ($self->{ct}); # DOCTYPE
3529    
3530     redo A;
3531     } else {
3532    
3533     $self->{ct}->{sysid} # DOCTYPE
3534     .= chr $self->{nc};
3535     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3536     length $self->{ct}->{sysid});
3537    
3538     ## Stay in the state
3539    
3540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3541     $self->{line_prev} = $self->{line};
3542     $self->{column_prev} = $self->{column};
3543     $self->{column}++;
3544     $self->{nc}
3545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3546     } else {
3547     $self->{set_nc}->($self);
3548     }
3549    
3550     redo A;
3551     }
3552     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3553     if ($self->{nc} == 0x0027) { # '
3554    
3555     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3556    
3557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3558     $self->{line_prev} = $self->{line};
3559     $self->{column_prev} = $self->{column};
3560     $self->{column}++;
3561     $self->{nc}
3562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3563     } else {
3564     $self->{set_nc}->($self);
3565     }
3566    
3567     redo A;
3568     } elsif ($self->{nc} == 0x003E) { # >
3569    
3570     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3571    
3572     $self->{state} = DATA_STATE;
3573 wakaba 1.5 $self->{s_kwd} = '';
3574 wakaba 1.1
3575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3576     $self->{line_prev} = $self->{line};
3577     $self->{column_prev} = $self->{column};
3578     $self->{column}++;
3579     $self->{nc}
3580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3581     } else {
3582     $self->{set_nc}->($self);
3583     }
3584    
3585    
3586     $self->{ct}->{quirks} = 1;
3587     return ($self->{ct}); # DOCTYPE
3588    
3589     redo A;
3590     } elsif ($self->{nc} == -1) {
3591    
3592     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3593    
3594     $self->{state} = DATA_STATE;
3595 wakaba 1.5 $self->{s_kwd} = '';
3596 wakaba 1.1 ## reconsume
3597    
3598     $self->{ct}->{quirks} = 1;
3599     return ($self->{ct}); # DOCTYPE
3600    
3601     redo A;
3602     } else {
3603    
3604     $self->{ct}->{sysid} # DOCTYPE
3605     .= chr $self->{nc};
3606     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3607     length $self->{ct}->{sysid});
3608    
3609     ## Stay in the state
3610    
3611     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3612     $self->{line_prev} = $self->{line};
3613     $self->{column_prev} = $self->{column};
3614     $self->{column}++;
3615     $self->{nc}
3616     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3617     } else {
3618     $self->{set_nc}->($self);
3619     }
3620    
3621     redo A;
3622     }
3623     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3624     if ($is_space->{$self->{nc}}) {
3625    
3626     ## Stay in the state
3627    
3628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3629     $self->{line_prev} = $self->{line};
3630     $self->{column_prev} = $self->{column};
3631     $self->{column}++;
3632     $self->{nc}
3633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3634     } else {
3635     $self->{set_nc}->($self);
3636     }
3637    
3638     redo A;
3639     } elsif ($self->{nc} == 0x003E) { # >
3640    
3641     $self->{state} = DATA_STATE;
3642 wakaba 1.5 $self->{s_kwd} = '';
3643 wakaba 1.1
3644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3645     $self->{line_prev} = $self->{line};
3646     $self->{column_prev} = $self->{column};
3647     $self->{column}++;
3648     $self->{nc}
3649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3650     } else {
3651     $self->{set_nc}->($self);
3652     }
3653    
3654    
3655     return ($self->{ct}); # DOCTYPE
3656    
3657     redo A;
3658     } elsif ($self->{nc} == -1) {
3659    
3660     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3661     $self->{state} = DATA_STATE;
3662 wakaba 1.5 $self->{s_kwd} = '';
3663 wakaba 1.1 ## reconsume
3664    
3665     $self->{ct}->{quirks} = 1;
3666     return ($self->{ct}); # DOCTYPE
3667    
3668     redo A;
3669     } else {
3670    
3671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3672     #$self->{ct}->{quirks} = 1;
3673    
3674     $self->{state} = BOGUS_DOCTYPE_STATE;
3675    
3676     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3677     $self->{line_prev} = $self->{line};
3678     $self->{column_prev} = $self->{column};
3679     $self->{column}++;
3680     $self->{nc}
3681     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3682     } else {
3683     $self->{set_nc}->($self);
3684     }
3685    
3686     redo A;
3687     }
3688     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3689     if ($self->{nc} == 0x003E) { # >
3690    
3691     $self->{state} = DATA_STATE;
3692 wakaba 1.5 $self->{s_kwd} = '';
3693 wakaba 1.1
3694     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3695     $self->{line_prev} = $self->{line};
3696     $self->{column_prev} = $self->{column};
3697     $self->{column}++;
3698     $self->{nc}
3699     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3700     } else {
3701     $self->{set_nc}->($self);
3702     }
3703    
3704    
3705     return ($self->{ct}); # DOCTYPE
3706    
3707     redo A;
3708     } elsif ($self->{nc} == -1) {
3709    
3710     $self->{state} = DATA_STATE;
3711 wakaba 1.5 $self->{s_kwd} = '';
3712 wakaba 1.1 ## reconsume
3713    
3714     return ($self->{ct}); # DOCTYPE
3715    
3716     redo A;
3717     } else {
3718    
3719     my $s = '';
3720     $self->{read_until}->($s, q[>], 0);
3721    
3722     ## Stay in the state
3723    
3724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3725     $self->{line_prev} = $self->{line};
3726     $self->{column_prev} = $self->{column};
3727     $self->{column}++;
3728     $self->{nc}
3729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3730     } else {
3731     $self->{set_nc}->($self);
3732     }
3733    
3734     redo A;
3735     }
3736     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3737     ## NOTE: "CDATA section state" in the state is jointly implemented
3738     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3739     ## and |CDATA_SECTION_MSE2_STATE|.
3740 wakaba 1.10
3741     ## XML5: "CDATA state".
3742 wakaba 1.1
3743     if ($self->{nc} == 0x005D) { # ]
3744    
3745     $self->{state} = CDATA_SECTION_MSE1_STATE;
3746    
3747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3748     $self->{line_prev} = $self->{line};
3749     $self->{column_prev} = $self->{column};
3750     $self->{column}++;
3751     $self->{nc}
3752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3753     } else {
3754     $self->{set_nc}->($self);
3755     }
3756    
3757     redo A;
3758     } elsif ($self->{nc} == -1) {
3759 wakaba 1.6 if ($self->{is_xml}) {
3760 wakaba 1.8
3761 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3762 wakaba 1.8 } else {
3763    
3764 wakaba 1.6 }
3765    
3766 wakaba 1.1 $self->{state} = DATA_STATE;
3767 wakaba 1.5 $self->{s_kwd} = '';
3768 wakaba 1.10 ## Reconsume.
3769 wakaba 1.1 if (length $self->{ct}->{data}) { # character
3770    
3771     return ($self->{ct}); # character
3772     } else {
3773    
3774     ## No token to emit. $self->{ct} is discarded.
3775     }
3776     redo A;
3777     } else {
3778    
3779     $self->{ct}->{data} .= chr $self->{nc};
3780     $self->{read_until}->($self->{ct}->{data},
3781     q<]>,
3782     length $self->{ct}->{data});
3783    
3784     ## Stay in the state.
3785    
3786     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3787     $self->{line_prev} = $self->{line};
3788     $self->{column_prev} = $self->{column};
3789     $self->{column}++;
3790     $self->{nc}
3791     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3792     } else {
3793     $self->{set_nc}->($self);
3794     }
3795    
3796     redo A;
3797     }
3798    
3799     ## ISSUE: "text tokens" in spec.
3800     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3801 wakaba 1.10 ## XML5: "CDATA bracket state".
3802    
3803 wakaba 1.1 if ($self->{nc} == 0x005D) { # ]
3804    
3805     $self->{state} = CDATA_SECTION_MSE2_STATE;
3806    
3807     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3808     $self->{line_prev} = $self->{line};
3809     $self->{column_prev} = $self->{column};
3810     $self->{column}++;
3811     $self->{nc}
3812     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3813     } else {
3814     $self->{set_nc}->($self);
3815     }
3816    
3817     redo A;
3818     } else {
3819    
3820 wakaba 1.10 ## XML5: If EOF, "]" is not appended and changed to the data state.
3821 wakaba 1.1 $self->{ct}->{data} .= ']';
3822 wakaba 1.10 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state.
3823 wakaba 1.1 ## Reconsume.
3824     redo A;
3825     }
3826     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3827 wakaba 1.10 ## XML5: "CDATA end state".
3828    
3829 wakaba 1.1 if ($self->{nc} == 0x003E) { # >
3830     $self->{state} = DATA_STATE;
3831 wakaba 1.5 $self->{s_kwd} = '';
3832 wakaba 1.1
3833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3834     $self->{line_prev} = $self->{line};
3835     $self->{column_prev} = $self->{column};
3836     $self->{column}++;
3837     $self->{nc}
3838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3839     } else {
3840     $self->{set_nc}->($self);
3841     }
3842    
3843     if (length $self->{ct}->{data}) { # character
3844    
3845     return ($self->{ct}); # character
3846     } else {
3847    
3848     ## No token to emit. $self->{ct} is discarded.
3849     }
3850     redo A;
3851     } elsif ($self->{nc} == 0x005D) { # ]
3852     # character
3853     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3854     ## Stay in the state.
3855    
3856     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3857     $self->{line_prev} = $self->{line};
3858     $self->{column_prev} = $self->{column};
3859     $self->{column}++;
3860     $self->{nc}
3861     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3862     } else {
3863     $self->{set_nc}->($self);
3864     }
3865    
3866     redo A;
3867     } else {
3868    
3869     $self->{ct}->{data} .= ']]'; # character
3870     $self->{state} = CDATA_SECTION_STATE;
3871 wakaba 1.10 ## Reconsume. ## XML5: Emit.
3872 wakaba 1.1 redo A;
3873     }
3874     } elsif ($self->{state} == ENTITY_STATE) {
3875     if ($is_space->{$self->{nc}} or
3876     {
3877     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3878     $self->{entity_add} => 1,
3879     }->{$self->{nc}}) {
3880    
3881     ## Don't consume
3882     ## No error
3883     ## Return nothing.
3884     #
3885     } elsif ($self->{nc} == 0x0023) { # #
3886    
3887     $self->{state} = ENTITY_HASH_STATE;
3888     $self->{s_kwd} = '#';
3889    
3890     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3891     $self->{line_prev} = $self->{line};
3892     $self->{column_prev} = $self->{column};
3893     $self->{column}++;
3894     $self->{nc}
3895     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3896     } else {
3897     $self->{set_nc}->($self);
3898     }
3899    
3900     redo A;
3901     } elsif ((0x0041 <= $self->{nc} and
3902     $self->{nc} <= 0x005A) or # A..Z
3903     (0x0061 <= $self->{nc} and
3904     $self->{nc} <= 0x007A)) { # a..z
3905    
3906     require Whatpm::_NamedEntityList;
3907     $self->{state} = ENTITY_NAME_STATE;
3908     $self->{s_kwd} = chr $self->{nc};
3909     $self->{entity__value} = $self->{s_kwd};
3910     $self->{entity__match} = 0;
3911    
3912     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3913     $self->{line_prev} = $self->{line};
3914     $self->{column_prev} = $self->{column};
3915     $self->{column}++;
3916     $self->{nc}
3917     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3918     } else {
3919     $self->{set_nc}->($self);
3920     }
3921    
3922     redo A;
3923     } else {
3924    
3925     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3926     ## Return nothing.
3927     #
3928     }
3929    
3930     ## NOTE: No character is consumed by the "consume a character
3931     ## reference" algorithm. In other word, there is an "&" character
3932     ## that does not introduce a character reference, which would be
3933     ## appended to the parent element or the attribute value in later
3934     ## process of the tokenizer.
3935    
3936     if ($self->{prev_state} == DATA_STATE) {
3937    
3938     $self->{state} = $self->{prev_state};
3939 wakaba 1.5 $self->{s_kwd} = '';
3940 wakaba 1.1 ## Reconsume.
3941     return ({type => CHARACTER_TOKEN, data => '&',
3942     line => $self->{line_prev},
3943     column => $self->{column_prev},
3944     });
3945     redo A;
3946     } else {
3947    
3948     $self->{ca}->{value} .= '&';
3949     $self->{state} = $self->{prev_state};
3950 wakaba 1.5 $self->{s_kwd} = '';
3951 wakaba 1.1 ## Reconsume.
3952     redo A;
3953     }
3954     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3955     if ($self->{nc} == 0x0078 or # x
3956     $self->{nc} == 0x0058) { # X
3957    
3958     $self->{state} = HEXREF_X_STATE;
3959     $self->{s_kwd} .= chr $self->{nc};
3960    
3961     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3962     $self->{line_prev} = $self->{line};
3963     $self->{column_prev} = $self->{column};
3964     $self->{column}++;
3965     $self->{nc}
3966     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3967     } else {
3968     $self->{set_nc}->($self);
3969     }
3970    
3971     redo A;
3972     } elsif (0x0030 <= $self->{nc} and
3973     $self->{nc} <= 0x0039) { # 0..9
3974    
3975     $self->{state} = NCR_NUM_STATE;
3976     $self->{s_kwd} = $self->{nc} - 0x0030;
3977    
3978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3979     $self->{line_prev} = $self->{line};
3980     $self->{column_prev} = $self->{column};
3981     $self->{column}++;
3982     $self->{nc}
3983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3984     } else {
3985     $self->{set_nc}->($self);
3986     }
3987    
3988     redo A;
3989     } else {
3990     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3991     line => $self->{line_prev},
3992     column => $self->{column_prev} - 1);
3993    
3994     ## NOTE: According to the spec algorithm, nothing is returned,
3995     ## and then "&#" is appended to the parent element or the attribute
3996     ## value in the later processing.
3997    
3998     if ($self->{prev_state} == DATA_STATE) {
3999    
4000     $self->{state} = $self->{prev_state};
4001 wakaba 1.5 $self->{s_kwd} = '';
4002 wakaba 1.1 ## Reconsume.
4003     return ({type => CHARACTER_TOKEN,
4004     data => '&#',
4005     line => $self->{line_prev},
4006     column => $self->{column_prev} - 1,
4007     });
4008     redo A;
4009     } else {
4010    
4011     $self->{ca}->{value} .= '&#';
4012     $self->{state} = $self->{prev_state};
4013 wakaba 1.5 $self->{s_kwd} = '';
4014 wakaba 1.1 ## Reconsume.
4015     redo A;
4016     }
4017     }
4018     } elsif ($self->{state} == NCR_NUM_STATE) {
4019     if (0x0030 <= $self->{nc} and
4020     $self->{nc} <= 0x0039) { # 0..9
4021    
4022     $self->{s_kwd} *= 10;
4023     $self->{s_kwd} += $self->{nc} - 0x0030;
4024    
4025     ## Stay in the state.
4026    
4027     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4028     $self->{line_prev} = $self->{line};
4029     $self->{column_prev} = $self->{column};
4030     $self->{column}++;
4031     $self->{nc}
4032     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4033     } else {
4034     $self->{set_nc}->($self);
4035     }
4036    
4037     redo A;
4038     } elsif ($self->{nc} == 0x003B) { # ;
4039    
4040    
4041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4042     $self->{line_prev} = $self->{line};
4043     $self->{column_prev} = $self->{column};
4044     $self->{column}++;
4045     $self->{nc}
4046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4047     } else {
4048     $self->{set_nc}->($self);
4049     }
4050    
4051     #
4052     } else {
4053    
4054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4055     ## Reconsume.
4056     #
4057     }
4058    
4059     my $code = $self->{s_kwd};
4060     my $l = $self->{line_prev};
4061     my $c = $self->{column_prev};
4062     if ($charref_map->{$code}) {
4063    
4064     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4065     text => (sprintf 'U+%04X', $code),
4066     line => $l, column => $c);
4067     $code = $charref_map->{$code};
4068     } elsif ($code > 0x10FFFF) {
4069    
4070     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4071     text => (sprintf 'U-%08X', $code),
4072     line => $l, column => $c);
4073     $code = 0xFFFD;
4074     }
4075    
4076     if ($self->{prev_state} == DATA_STATE) {
4077    
4078     $self->{state} = $self->{prev_state};
4079 wakaba 1.5 $self->{s_kwd} = '';
4080 wakaba 1.1 ## Reconsume.
4081     return ({type => CHARACTER_TOKEN, data => chr $code,
4082 wakaba 1.7 has_reference => 1,
4083 wakaba 1.1 line => $l, column => $c,
4084     });
4085     redo A;
4086     } else {
4087    
4088     $self->{ca}->{value} .= chr $code;
4089     $self->{ca}->{has_reference} = 1;
4090     $self->{state} = $self->{prev_state};
4091 wakaba 1.5 $self->{s_kwd} = '';
4092 wakaba 1.1 ## Reconsume.
4093     redo A;
4094     }
4095     } elsif ($self->{state} == HEXREF_X_STATE) {
4096     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4097     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4098     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4099     # 0..9, A..F, a..f
4100    
4101     $self->{state} = HEXREF_HEX_STATE;
4102     $self->{s_kwd} = 0;
4103     ## Reconsume.
4104     redo A;
4105     } else {
4106     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4107     line => $self->{line_prev},
4108     column => $self->{column_prev} - 2);
4109    
4110     ## NOTE: According to the spec algorithm, nothing is returned,
4111     ## and then "&#" followed by "X" or "x" is appended to the parent
4112     ## element or the attribute value in the later processing.
4113    
4114     if ($self->{prev_state} == DATA_STATE) {
4115    
4116     $self->{state} = $self->{prev_state};
4117 wakaba 1.5 $self->{s_kwd} = '';
4118 wakaba 1.1 ## Reconsume.
4119     return ({type => CHARACTER_TOKEN,
4120     data => '&' . $self->{s_kwd},
4121     line => $self->{line_prev},
4122     column => $self->{column_prev} - length $self->{s_kwd},
4123     });
4124     redo A;
4125     } else {
4126    
4127     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4128     $self->{state} = $self->{prev_state};
4129 wakaba 1.5 $self->{s_kwd} = '';
4130 wakaba 1.1 ## Reconsume.
4131     redo A;
4132     }
4133     }
4134     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4135     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4136     # 0..9
4137    
4138     $self->{s_kwd} *= 0x10;
4139     $self->{s_kwd} += $self->{nc} - 0x0030;
4140     ## Stay in the state.
4141    
4142     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4143     $self->{line_prev} = $self->{line};
4144     $self->{column_prev} = $self->{column};
4145     $self->{column}++;
4146     $self->{nc}
4147     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4148     } else {
4149     $self->{set_nc}->($self);
4150     }
4151    
4152     redo A;
4153     } elsif (0x0061 <= $self->{nc} and
4154     $self->{nc} <= 0x0066) { # a..f
4155    
4156     $self->{s_kwd} *= 0x10;
4157     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4158     ## Stay in the state.
4159    
4160     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4161     $self->{line_prev} = $self->{line};
4162     $self->{column_prev} = $self->{column};
4163     $self->{column}++;
4164     $self->{nc}
4165     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4166     } else {
4167     $self->{set_nc}->($self);
4168     }
4169    
4170     redo A;
4171     } elsif (0x0041 <= $self->{nc} and
4172     $self->{nc} <= 0x0046) { # A..F
4173    
4174     $self->{s_kwd} *= 0x10;
4175     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4176     ## Stay in the state.
4177    
4178     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4179     $self->{line_prev} = $self->{line};
4180     $self->{column_prev} = $self->{column};
4181     $self->{column}++;
4182     $self->{nc}
4183     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4184     } else {
4185     $self->{set_nc}->($self);
4186     }
4187    
4188     redo A;
4189     } elsif ($self->{nc} == 0x003B) { # ;
4190    
4191    
4192     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4193     $self->{line_prev} = $self->{line};
4194     $self->{column_prev} = $self->{column};
4195     $self->{column}++;
4196     $self->{nc}
4197     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4198     } else {
4199     $self->{set_nc}->($self);
4200     }
4201    
4202     #
4203     } else {
4204    
4205     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4206     line => $self->{line},
4207     column => $self->{column});
4208     ## Reconsume.
4209     #
4210     }
4211    
4212     my $code = $self->{s_kwd};
4213     my $l = $self->{line_prev};
4214     my $c = $self->{column_prev};
4215     if ($charref_map->{$code}) {
4216    
4217     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4218     text => (sprintf 'U+%04X', $code),
4219     line => $l, column => $c);
4220     $code = $charref_map->{$code};
4221     } elsif ($code > 0x10FFFF) {
4222    
4223     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4224     text => (sprintf 'U-%08X', $code),
4225     line => $l, column => $c);
4226     $code = 0xFFFD;
4227     }
4228    
4229     if ($self->{prev_state} == DATA_STATE) {
4230    
4231     $self->{state} = $self->{prev_state};
4232 wakaba 1.5 $self->{s_kwd} = '';
4233 wakaba 1.1 ## Reconsume.
4234     return ({type => CHARACTER_TOKEN, data => chr $code,
4235 wakaba 1.7 has_reference => 1,
4236 wakaba 1.1 line => $l, column => $c,
4237     });
4238     redo A;
4239     } else {
4240    
4241     $self->{ca}->{value} .= chr $code;
4242     $self->{ca}->{has_reference} = 1;
4243     $self->{state} = $self->{prev_state};
4244 wakaba 1.5 $self->{s_kwd} = '';
4245 wakaba 1.1 ## Reconsume.
4246     redo A;
4247     }
4248     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4249     if (length $self->{s_kwd} < 30 and
4250     ## NOTE: Some number greater than the maximum length of entity name
4251     ((0x0041 <= $self->{nc} and # a
4252     $self->{nc} <= 0x005A) or # x
4253     (0x0061 <= $self->{nc} and # a
4254     $self->{nc} <= 0x007A) or # z
4255     (0x0030 <= $self->{nc} and # 0
4256     $self->{nc} <= 0x0039) or # 9
4257     $self->{nc} == 0x003B)) { # ;
4258     our $EntityChar;
4259     $self->{s_kwd} .= chr $self->{nc};
4260     if (defined $EntityChar->{$self->{s_kwd}}) {
4261     if ($self->{nc} == 0x003B) { # ;
4262    
4263     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4264     $self->{entity__match} = 1;
4265    
4266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4267     $self->{line_prev} = $self->{line};
4268     $self->{column_prev} = $self->{column};
4269     $self->{column}++;
4270     $self->{nc}
4271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4272     } else {
4273     $self->{set_nc}->($self);
4274     }
4275    
4276     #
4277     } else {
4278    
4279     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4280     $self->{entity__match} = -1;
4281     ## Stay in the state.
4282    
4283     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4284     $self->{line_prev} = $self->{line};
4285     $self->{column_prev} = $self->{column};
4286     $self->{column}++;
4287     $self->{nc}
4288     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4289     } else {
4290     $self->{set_nc}->($self);
4291     }
4292    
4293     redo A;
4294     }
4295     } else {
4296    
4297     $self->{entity__value} .= chr $self->{nc};
4298     $self->{entity__match} *= 2;
4299     ## Stay in the state.
4300    
4301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4302     $self->{line_prev} = $self->{line};
4303     $self->{column_prev} = $self->{column};
4304     $self->{column}++;
4305     $self->{nc}
4306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4307     } else {
4308     $self->{set_nc}->($self);
4309     }
4310    
4311     redo A;
4312     }
4313     }
4314    
4315     my $data;
4316     my $has_ref;
4317     if ($self->{entity__match} > 0) {
4318    
4319     $data = $self->{entity__value};
4320     $has_ref = 1;
4321     #
4322     } elsif ($self->{entity__match} < 0) {
4323     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4324     if ($self->{prev_state} != DATA_STATE and # in attribute
4325     $self->{entity__match} < -1) {
4326    
4327     $data = '&' . $self->{s_kwd};
4328     #
4329     } else {
4330    
4331     $data = $self->{entity__value};
4332     $has_ref = 1;
4333     #
4334     }
4335     } else {
4336    
4337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4338     line => $self->{line_prev},
4339     column => $self->{column_prev} - length $self->{s_kwd});
4340     $data = '&' . $self->{s_kwd};
4341     #
4342     }
4343    
4344     ## NOTE: In these cases, when a character reference is found,
4345     ## it is consumed and a character token is returned, or, otherwise,
4346     ## nothing is consumed and returned, according to the spec algorithm.
4347     ## In this implementation, anything that has been examined by the
4348     ## tokenizer is appended to the parent element or the attribute value
4349     ## as string, either literal string when no character reference or
4350     ## entity-replaced string otherwise, in this stage, since any characters
4351     ## that would not be consumed are appended in the data state or in an
4352     ## appropriate attribute value state anyway.
4353    
4354     if ($self->{prev_state} == DATA_STATE) {
4355    
4356     $self->{state} = $self->{prev_state};
4357 wakaba 1.5 $self->{s_kwd} = '';
4358 wakaba 1.1 ## Reconsume.
4359     return ({type => CHARACTER_TOKEN,
4360     data => $data,
4361 wakaba 1.7 has_reference => $has_ref,
4362 wakaba 1.1 line => $self->{line_prev},
4363     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4364     });
4365     redo A;
4366     } else {
4367    
4368     $self->{ca}->{value} .= $data;
4369     $self->{ca}->{has_reference} = 1 if $has_ref;
4370     $self->{state} = $self->{prev_state};
4371 wakaba 1.5 $self->{s_kwd} = '';
4372 wakaba 1.1 ## Reconsume.
4373     redo A;
4374     }
4375 wakaba 1.8
4376     ## XML-only states
4377    
4378     } elsif ($self->{state} == PI_STATE) {
4379     if ($is_space->{$self->{nc}} or
4380     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4381     $self->{nc} == -1) {
4382     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4383     line => $self->{line_prev},
4384     column => $self->{column_prev}
4385     - 1 * ($self->{nc} != -1));
4386     $self->{state} = BOGUS_COMMENT_STATE;
4387     ## Reconsume.
4388     $self->{ct} = {type => COMMENT_TOKEN,
4389     data => '?',
4390     line => $self->{line_prev},
4391     column => $self->{column_prev}
4392     - 1 * ($self->{nc} != -1),
4393     };
4394     redo A;
4395     } else {
4396     $self->{ct} = {type => PI_TOKEN,
4397     target => chr $self->{nc},
4398     data => '',
4399     line => $self->{line_prev},
4400     column => $self->{column_prev} - 1,
4401     };
4402     $self->{state} = PI_TARGET_STATE;
4403    
4404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4405     $self->{line_prev} = $self->{line};
4406     $self->{column_prev} = $self->{column};
4407     $self->{column}++;
4408     $self->{nc}
4409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4410     } else {
4411     $self->{set_nc}->($self);
4412     }
4413    
4414     redo A;
4415     }
4416     } elsif ($self->{state} == PI_TARGET_STATE) {
4417     if ($is_space->{$self->{nc}}) {
4418     $self->{state} = PI_TARGET_AFTER_STATE;
4419    
4420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4421     $self->{line_prev} = $self->{line};
4422     $self->{column_prev} = $self->{column};
4423     $self->{column}++;
4424     $self->{nc}
4425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4426     } else {
4427     $self->{set_nc}->($self);
4428     }
4429    
4430     redo A;
4431     } elsif ($self->{nc} == -1) {
4432     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4433     $self->{state} = DATA_STATE;
4434     $self->{s_kwd} = '';
4435     ## Reconsume.
4436     return ($self->{ct}); # pi
4437     redo A;
4438     } elsif ($self->{nc} == 0x003F) { # ?
4439     $self->{state} = PI_AFTER_STATE;
4440    
4441     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4442     $self->{line_prev} = $self->{line};
4443     $self->{column_prev} = $self->{column};
4444     $self->{column}++;
4445     $self->{nc}
4446     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4447     } else {
4448     $self->{set_nc}->($self);
4449     }
4450    
4451     redo A;
4452     } else {
4453     ## XML5: typo ("tag name" -> "target")
4454     $self->{ct}->{target} .= chr $self->{nc}; # pi
4455    
4456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4457     $self->{line_prev} = $self->{line};
4458     $self->{column_prev} = $self->{column};
4459     $self->{column}++;
4460     $self->{nc}
4461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4462     } else {
4463     $self->{set_nc}->($self);
4464     }
4465    
4466     redo A;
4467     }
4468     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4469     if ($is_space->{$self->{nc}}) {
4470     ## Stay in the state.
4471    
4472     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4473     $self->{line_prev} = $self->{line};
4474     $self->{column_prev} = $self->{column};
4475     $self->{column}++;
4476     $self->{nc}
4477     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4478     } else {
4479     $self->{set_nc}->($self);
4480     }
4481    
4482     redo A;
4483     } else {
4484     $self->{state} = PI_DATA_STATE;
4485     ## Reprocess.
4486     redo A;
4487     }
4488     } elsif ($self->{state} == PI_DATA_STATE) {
4489     if ($self->{nc} == 0x003F) { # ?
4490     $self->{state} = PI_DATA_AFTER_STATE;
4491    
4492     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4493     $self->{line_prev} = $self->{line};
4494     $self->{column_prev} = $self->{column};
4495     $self->{column}++;
4496     $self->{nc}
4497     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4498     } else {
4499     $self->{set_nc}->($self);
4500     }
4501    
4502     redo A;
4503     } elsif ($self->{nc} == -1) {
4504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4505     $self->{state} = DATA_STATE;
4506     $self->{s_kwd} = '';
4507     ## Reprocess.
4508     return ($self->{ct}); # pi
4509     redo A;
4510     } else {
4511     $self->{ct}->{data} .= chr $self->{nc}; # pi
4512     $self->{read_until}->($self->{ct}->{data}, q[?],
4513     length $self->{ct}->{data});
4514     ## Stay in the state.
4515    
4516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4517     $self->{line_prev} = $self->{line};
4518     $self->{column_prev} = $self->{column};
4519     $self->{column}++;
4520     $self->{nc}
4521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4522     } else {
4523     $self->{set_nc}->($self);
4524     }
4525    
4526     ## Reprocess.
4527     redo A;
4528     }
4529     } elsif ($self->{state} == PI_AFTER_STATE) {
4530     if ($self->{nc} == 0x003E) { # >
4531     $self->{state} = DATA_STATE;
4532     $self->{s_kwd} = '';
4533    
4534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4535     $self->{line_prev} = $self->{line};
4536     $self->{column_prev} = $self->{column};
4537     $self->{column}++;
4538     $self->{nc}
4539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4540     } else {
4541     $self->{set_nc}->($self);
4542     }
4543    
4544     return ($self->{ct}); # pi
4545     redo A;
4546     } elsif ($self->{nc} == 0x003F) { # ?
4547     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4548     line => $self->{line_prev},
4549     column => $self->{column_prev}); ## XML5: no error
4550     $self->{ct}->{data} .= '?';
4551     $self->{state} = PI_DATA_AFTER_STATE;
4552    
4553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4554     $self->{line_prev} = $self->{line};
4555     $self->{column_prev} = $self->{column};
4556     $self->{column}++;
4557     $self->{nc}
4558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4559     } else {
4560     $self->{set_nc}->($self);
4561     }
4562    
4563     redo A;
4564     } else {
4565     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4566     line => $self->{line_prev},
4567     column => $self->{column_prev}
4568     + 1 * ($self->{nc} == -1)); ## XML5: no error
4569     $self->{ct}->{data} .= '?'; ## XML5: not appended
4570     $self->{state} = PI_DATA_STATE;
4571     ## Reprocess.
4572     redo A;
4573     }
4574     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4575     ## XML5: Same as "pi after state" in XML5
4576     if ($self->{nc} == 0x003E) { # >
4577     $self->{state} = DATA_STATE;
4578     $self->{s_kwd} = '';
4579    
4580     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4581     $self->{line_prev} = $self->{line};
4582     $self->{column_prev} = $self->{column};
4583     $self->{column}++;
4584     $self->{nc}
4585     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4586     } else {
4587     $self->{set_nc}->($self);
4588     }
4589    
4590     return ($self->{ct}); # pi
4591     redo A;
4592     } elsif ($self->{nc} == 0x003F) { # ?
4593     $self->{ct}->{data} .= '?';
4594     ## Stay in the state.
4595    
4596     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4597     $self->{line_prev} = $self->{line};
4598     $self->{column_prev} = $self->{column};
4599     $self->{column}++;
4600     $self->{nc}
4601     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4602     } else {
4603     $self->{set_nc}->($self);
4604     }
4605    
4606     redo A;
4607     } else {
4608     $self->{ct}->{data} .= '?'; ## XML5: not appended
4609     $self->{state} = PI_DATA_STATE;
4610     ## Reprocess.
4611     redo A;
4612     }
4613    
4614 wakaba 1.1 } else {
4615     die "$0: $self->{state}: Unknown state";
4616     }
4617     } # A
4618    
4619     die "$0: _get_next_token: unexpected case";
4620     } # _get_next_token
4621    
4622     1;
4623 wakaba 1.10 ## $Date: 2008/10/15 08:05:47 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24