/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations) (download)
Wed Oct 15 08:05:47 2008 UTC (16 years ago) by wakaba
Branch: MAIN
Changes since 1.8: +23 -3 lines
++ whatpm/t/ChangeLog	15 Oct 2008 08:04:32 -0000
	* XML-Parser.t: "xml/ns-elements-1.dat" added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 08:05:44 -0000
	* ns-elements-1.dat: New test data file.

	* ns-attrs-1.dat: New test data added.

	* elements-1.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 08:03:32 -0000
	* Tokenizer.pm.src: XML tag name start charcter support for start
	tags.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 08:04:01 -0000
	* Parser.pm.src: Bug fixes for the handling of ":" in the element
	type names and attribute names.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3 wakaba 1.9 our $VERSION=do{my @r=(q$Revision: 1.8 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4 wakaba 1.2
5     BEGIN {
6     require Exporter;
7     push our @ISA, 'Exporter';
8    
9     our @EXPORT_OK = qw(
10     DOCTYPE_TOKEN
11     COMMENT_TOKEN
12     START_TAG_TOKEN
13     END_TAG_TOKEN
14     END_OF_FILE_TOKEN
15     CHARACTER_TOKEN
16     PI_TOKEN
17     ABORT_TOKEN
18     );
19    
20     our %EXPORT_TAGS = (
21     token => [qw(
22     DOCTYPE_TOKEN
23     COMMENT_TOKEN
24     START_TAG_TOKEN
25     END_TAG_TOKEN
26     END_OF_FILE_TOKEN
27     CHARACTER_TOKEN
28     PI_TOKEN
29     ABORT_TOKEN
30     )],
31     );
32     }
33    
34     ## Token types
35    
36     sub DOCTYPE_TOKEN () { 1 }
37     sub COMMENT_TOKEN () { 2 }
38     sub START_TAG_TOKEN () { 3 }
39     sub END_TAG_TOKEN () { 4 }
40     sub END_OF_FILE_TOKEN () { 5 }
41     sub CHARACTER_TOKEN () { 6 }
42     sub PI_TOKEN () { 7 } # XML5
43     sub ABORT_TOKEN () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
49 wakaba 1.1 ## Content model flags
50    
51     sub CM_ENTITY () { 0b001 } # & markup in data
52     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
53     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
54    
55     sub PLAINTEXT_CONTENT_MODEL () { 0 }
56     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
57     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
58     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
59    
60     ## Tokenizer states
61    
62     sub DATA_STATE () { 0 }
63     #sub ENTITY_DATA_STATE () { 1 }
64     sub TAG_OPEN_STATE () { 2 }
65     sub CLOSE_TAG_OPEN_STATE () { 3 }
66     sub TAG_NAME_STATE () { 4 }
67     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
68     sub ATTRIBUTE_NAME_STATE () { 6 }
69     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
70     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
71     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
72     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
73     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
74     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
75     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
76     sub COMMENT_START_STATE () { 14 }
77     sub COMMENT_START_DASH_STATE () { 15 }
78     sub COMMENT_STATE () { 16 }
79     sub COMMENT_END_STATE () { 17 }
80     sub COMMENT_END_DASH_STATE () { 18 }
81     sub BOGUS_COMMENT_STATE () { 19 }
82     sub DOCTYPE_STATE () { 20 }
83     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
84     sub DOCTYPE_NAME_STATE () { 22 }
85     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
86     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
87     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
88     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
89     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
90     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
91     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
92     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
93     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
94     sub BOGUS_DOCTYPE_STATE () { 32 }
95     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
96     sub SELF_CLOSING_START_TAG_STATE () { 34 }
97     sub CDATA_SECTION_STATE () { 35 }
98     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
99     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
100     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
101     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
102     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
103     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
104     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
105     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
106     ## NOTE: "Entity data state", "entity in attribute value state", and
107     ## "consume a character reference" algorithm are jointly implemented
108     ## using the following six states:
109     sub ENTITY_STATE () { 44 }
110     sub ENTITY_HASH_STATE () { 45 }
111     sub NCR_NUM_STATE () { 46 }
112     sub HEXREF_X_STATE () { 47 }
113     sub HEXREF_HEX_STATE () { 48 }
114     sub ENTITY_NAME_STATE () { 49 }
115     sub PCDATA_STATE () { 50 } # "data state" in the spec
116    
117 wakaba 1.8 ## XML states
118     sub PI_STATE () { 51 }
119     sub PI_TARGET_STATE () { 52 }
120     sub PI_TARGET_AFTER_STATE () { 53 }
121     sub PI_DATA_STATE () { 54 }
122     sub PI_AFTER_STATE () { 55 }
123     sub PI_DATA_AFTER_STATE () { 56 }
124    
125 wakaba 1.1 ## Tree constructor state constants (see Whatpm::HTML for the full
126     ## list and descriptions)
127    
128     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
129     sub FOREIGN_EL () { 0b1_00000000000 }
130    
131     ## Character reference mappings
132    
133     my $charref_map = {
134     0x0D => 0x000A,
135     0x80 => 0x20AC,
136     0x81 => 0xFFFD,
137     0x82 => 0x201A,
138     0x83 => 0x0192,
139     0x84 => 0x201E,
140     0x85 => 0x2026,
141     0x86 => 0x2020,
142     0x87 => 0x2021,
143     0x88 => 0x02C6,
144     0x89 => 0x2030,
145     0x8A => 0x0160,
146     0x8B => 0x2039,
147     0x8C => 0x0152,
148     0x8D => 0xFFFD,
149     0x8E => 0x017D,
150     0x8F => 0xFFFD,
151     0x90 => 0xFFFD,
152     0x91 => 0x2018,
153     0x92 => 0x2019,
154     0x93 => 0x201C,
155     0x94 => 0x201D,
156     0x95 => 0x2022,
157     0x96 => 0x2013,
158     0x97 => 0x2014,
159     0x98 => 0x02DC,
160     0x99 => 0x2122,
161     0x9A => 0x0161,
162     0x9B => 0x203A,
163     0x9C => 0x0153,
164     0x9D => 0xFFFD,
165     0x9E => 0x017E,
166     0x9F => 0x0178,
167     }; # $charref_map
168     $charref_map->{$_} = 0xFFFD
169     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
170     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
171     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
172     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
173     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
174     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
175     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
176    
177     ## Implementations MUST act as if state machine in the spec
178    
179     sub _initialize_tokenizer ($) {
180     my $self = shift;
181    
182     ## NOTE: Fields set by |new| constructor:
183     #$self->{level}
184     #$self->{set_nc}
185     #$self->{parse_error}
186 wakaba 1.3 #$self->{is_xml} (if XML)
187 wakaba 1.1
188     $self->{state} = DATA_STATE; # MUST
189 wakaba 1.5 $self->{s_kwd} = ''; # state keyword
190 wakaba 1.1 #$self->{entity__value}; # initialized when used
191     #$self->{entity__match}; # initialized when used
192     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
193     undef $self->{ct}; # current token
194     undef $self->{ca}; # current attribute
195     undef $self->{last_stag_name}; # last emitted start tag name
196     #$self->{prev_state}; # initialized when used
197     delete $self->{self_closing};
198     $self->{char_buffer} = '';
199     $self->{char_buffer_pos} = 0;
200     $self->{nc} = -1; # next input character
201     #$self->{next_nc}
202    
203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
204     $self->{line_prev} = $self->{line};
205     $self->{column_prev} = $self->{column};
206     $self->{column}++;
207     $self->{nc}
208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
209     } else {
210     $self->{set_nc}->($self);
211     }
212    
213     $self->{token} = [];
214     # $self->{escape}
215     } # _initialize_tokenizer
216    
217     ## A token has:
218     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
219     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
220     ## ->{name} (DOCTYPE_TOKEN)
221     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
222     ## ->{pubid} (DOCTYPE_TOKEN)
223     ## ->{sysid} (DOCTYPE_TOKEN)
224     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
225     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
226     ## ->{name}
227     ## ->{value}
228     ## ->{has_reference} == 1 or 0
229     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
230 wakaba 1.7 ## ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
231 wakaba 1.1 ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
232     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
233     ## while the token is pushed back to the stack.
234    
235     ## Emitted token MUST immediately be handled by the tree construction state.
236    
237     ## Before each step, UA MAY check to see if either one of the scripts in
238     ## "list of scripts that will execute as soon as possible" or the first
239     ## script in the "list of scripts that will execute asynchronously",
240     ## has completed loading. If one has, then it MUST be executed
241     ## and removed from the list.
242    
243     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
244     ## (This requirement was dropped from HTML5 spec, unfortunately.)
245    
246     my $is_space = {
247     0x0009 => 1, # CHARACTER TABULATION (HT)
248     0x000A => 1, # LINE FEED (LF)
249     #0x000B => 0, # LINE TABULATION (VT)
250     0x000C => 1, # FORM FEED (FF)
251     #0x000D => 1, # CARRIAGE RETURN (CR)
252     0x0020 => 1, # SPACE (SP)
253     };
254    
255     sub _get_next_token ($) {
256     my $self = shift;
257    
258     if ($self->{self_closing}) {
259     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
260     ## NOTE: The |self_closing| flag is only set by start tag token.
261     ## In addition, when a start tag token is emitted, it is always set to
262     ## |ct|.
263     delete $self->{self_closing};
264     }
265    
266     if (@{$self->{token}}) {
267     $self->{self_closing} = $self->{token}->[0]->{self_closing};
268     return shift @{$self->{token}};
269     }
270    
271     A: {
272     if ($self->{state} == PCDATA_STATE) {
273     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
274    
275     if ($self->{nc} == 0x0026) { # &
276    
277     ## NOTE: In the spec, the tokenizer is switched to the
278     ## "entity data state". In this implementation, the tokenizer
279     ## is switched to the |ENTITY_STATE|, which is an implementation
280     ## of the "consume a character reference" algorithm.
281     $self->{entity_add} = -1;
282     $self->{prev_state} = DATA_STATE;
283     $self->{state} = ENTITY_STATE;
284    
285     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
286     $self->{line_prev} = $self->{line};
287     $self->{column_prev} = $self->{column};
288     $self->{column}++;
289     $self->{nc}
290     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
291     } else {
292     $self->{set_nc}->($self);
293     }
294    
295     redo A;
296     } elsif ($self->{nc} == 0x003C) { # <
297    
298     $self->{state} = TAG_OPEN_STATE;
299    
300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
301     $self->{line_prev} = $self->{line};
302     $self->{column_prev} = $self->{column};
303     $self->{column}++;
304     $self->{nc}
305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
306     } else {
307     $self->{set_nc}->($self);
308     }
309    
310     redo A;
311     } elsif ($self->{nc} == -1) {
312    
313     return ({type => END_OF_FILE_TOKEN,
314     line => $self->{line}, column => $self->{column}});
315     last A; ## TODO: ok?
316     } else {
317    
318     #
319     }
320    
321     # Anything else
322     my $token = {type => CHARACTER_TOKEN,
323     data => chr $self->{nc},
324     line => $self->{line}, column => $self->{column},
325     };
326     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
327    
328     ## Stay in the state.
329    
330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
331     $self->{line_prev} = $self->{line};
332     $self->{column_prev} = $self->{column};
333     $self->{column}++;
334     $self->{nc}
335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
336     } else {
337     $self->{set_nc}->($self);
338     }
339    
340     return ($token);
341     redo A;
342     } elsif ($self->{state} == DATA_STATE) {
343     $self->{s_kwd} = '' unless defined $self->{s_kwd};
344     if ($self->{nc} == 0x0026) { # &
345     $self->{s_kwd} = '';
346     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
347     not $self->{escape}) {
348    
349     ## NOTE: In the spec, the tokenizer is switched to the
350     ## "entity data state". In this implementation, the tokenizer
351     ## is switched to the |ENTITY_STATE|, which is an implementation
352     ## of the "consume a character reference" algorithm.
353     $self->{entity_add} = -1;
354     $self->{prev_state} = DATA_STATE;
355     $self->{state} = ENTITY_STATE;
356    
357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
358     $self->{line_prev} = $self->{line};
359     $self->{column_prev} = $self->{column};
360     $self->{column}++;
361     $self->{nc}
362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
363     } else {
364     $self->{set_nc}->($self);
365     }
366    
367     redo A;
368     } else {
369    
370     #
371     }
372     } elsif ($self->{nc} == 0x002D) { # -
373     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
374 wakaba 1.5 if ($self->{s_kwd} eq '<!-') {
375 wakaba 1.1
376     $self->{escape} = 1; # unless $self->{escape};
377     $self->{s_kwd} = '--';
378     #
379 wakaba 1.5 } elsif ($self->{s_kwd} eq '-') {
380 wakaba 1.1
381     $self->{s_kwd} = '--';
382     #
383 wakaba 1.5 } elsif ($self->{s_kwd} eq '<!' or $self->{s_kwd} eq '-') {
384    
385     $self->{s_kwd} .= '-';
386     #
387 wakaba 1.1 } else {
388    
389 wakaba 1.5 $self->{s_kwd} = '-';
390 wakaba 1.1 #
391     }
392     }
393    
394     #
395     } elsif ($self->{nc} == 0x0021) { # !
396     if (length $self->{s_kwd}) {
397    
398     $self->{s_kwd} .= '!';
399     #
400     } else {
401    
402     #$self->{s_kwd} = '';
403     #
404     }
405     #
406     } elsif ($self->{nc} == 0x003C) { # <
407     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
408     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
409     not $self->{escape})) {
410    
411     $self->{state} = TAG_OPEN_STATE;
412    
413     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
414     $self->{line_prev} = $self->{line};
415     $self->{column_prev} = $self->{column};
416     $self->{column}++;
417     $self->{nc}
418     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
419     } else {
420     $self->{set_nc}->($self);
421     }
422    
423     redo A;
424     } else {
425    
426     $self->{s_kwd} = '';
427     #
428     }
429     } elsif ($self->{nc} == 0x003E) { # >
430     if ($self->{escape} and
431     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
432     if ($self->{s_kwd} eq '--') {
433    
434     delete $self->{escape};
435 wakaba 1.5 #
436 wakaba 1.1 } else {
437    
438 wakaba 1.5 #
439 wakaba 1.1 }
440 wakaba 1.5 } elsif ($self->{is_xml} and $self->{s_kwd} eq ']]') {
441    
442     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched mse', ## TODO: type
443     line => $self->{line_prev},
444     column => $self->{column_prev} - 1);
445     #
446 wakaba 1.1 } else {
447    
448 wakaba 1.5 #
449 wakaba 1.1 }
450    
451     $self->{s_kwd} = '';
452     #
453 wakaba 1.5 } elsif ($self->{nc} == 0x005D) { # ]
454     if ($self->{s_kwd} eq ']' or $self->{s_kwd} eq '') {
455    
456     $self->{s_kwd} .= ']';
457     } elsif ($self->{s_kwd} eq ']]') {
458    
459     #
460     } else {
461    
462     $self->{s_kwd} = '';
463     }
464     #
465 wakaba 1.1 } elsif ($self->{nc} == -1) {
466    
467     $self->{s_kwd} = '';
468     return ({type => END_OF_FILE_TOKEN,
469     line => $self->{line}, column => $self->{column}});
470     last A; ## TODO: ok?
471     } else {
472    
473     $self->{s_kwd} = '';
474     #
475     }
476    
477     # Anything else
478     my $token = {type => CHARACTER_TOKEN,
479     data => chr $self->{nc},
480     line => $self->{line}, column => $self->{column},
481     };
482 wakaba 1.5 if ($self->{read_until}->($token->{data}, q{-!<>&\]},
483 wakaba 1.1 length $token->{data})) {
484     $self->{s_kwd} = '';
485     }
486    
487     ## Stay in the data state.
488 wakaba 1.5 if (not $self->{is_xml} and
489     $self->{content_model} == PCDATA_CONTENT_MODEL) {
490 wakaba 1.1
491     $self->{state} = PCDATA_STATE;
492     } else {
493    
494     ## Stay in the state.
495     }
496    
497     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
498     $self->{line_prev} = $self->{line};
499     $self->{column_prev} = $self->{column};
500     $self->{column}++;
501     $self->{nc}
502     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
503     } else {
504     $self->{set_nc}->($self);
505     }
506    
507     return ($token);
508     redo A;
509     } elsif ($self->{state} == TAG_OPEN_STATE) {
510     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
511     if ($self->{nc} == 0x002F) { # /
512    
513    
514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
515     $self->{line_prev} = $self->{line};
516     $self->{column_prev} = $self->{column};
517     $self->{column}++;
518     $self->{nc}
519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
520     } else {
521     $self->{set_nc}->($self);
522     }
523    
524     $self->{state} = CLOSE_TAG_OPEN_STATE;
525     redo A;
526     } elsif ($self->{nc} == 0x0021) { # !
527    
528     $self->{s_kwd} = '<' unless $self->{escape};
529     #
530     } else {
531    
532     #
533     }
534    
535     ## reconsume
536     $self->{state} = DATA_STATE;
537 wakaba 1.5 $self->{s_kwd} = '';
538 wakaba 1.1 return ({type => CHARACTER_TOKEN, data => '<',
539     line => $self->{line_prev},
540     column => $self->{column_prev},
541     });
542     redo A;
543     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
544     if ($self->{nc} == 0x0021) { # !
545    
546     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
547    
548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
549     $self->{line_prev} = $self->{line};
550     $self->{column_prev} = $self->{column};
551     $self->{column}++;
552     $self->{nc}
553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
554     } else {
555     $self->{set_nc}->($self);
556     }
557    
558     redo A;
559     } elsif ($self->{nc} == 0x002F) { # /
560    
561     $self->{state} = CLOSE_TAG_OPEN_STATE;
562    
563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
564     $self->{line_prev} = $self->{line};
565     $self->{column_prev} = $self->{column};
566     $self->{column}++;
567     $self->{nc}
568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
569     } else {
570     $self->{set_nc}->($self);
571     }
572    
573     redo A;
574     } elsif (0x0041 <= $self->{nc} and
575     $self->{nc} <= 0x005A) { # A..Z
576    
577     $self->{ct}
578     = {type => START_TAG_TOKEN,
579 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
580 wakaba 1.1 line => $self->{line_prev},
581     column => $self->{column_prev}};
582     $self->{state} = TAG_NAME_STATE;
583    
584     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
585     $self->{line_prev} = $self->{line};
586     $self->{column_prev} = $self->{column};
587     $self->{column}++;
588     $self->{nc}
589     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
590     } else {
591     $self->{set_nc}->($self);
592     }
593    
594     redo A;
595     } elsif (0x0061 <= $self->{nc} and
596     $self->{nc} <= 0x007A) { # a..z
597    
598     $self->{ct} = {type => START_TAG_TOKEN,
599     tag_name => chr ($self->{nc}),
600     line => $self->{line_prev},
601     column => $self->{column_prev}};
602     $self->{state} = TAG_NAME_STATE;
603    
604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
605     $self->{line_prev} = $self->{line};
606     $self->{column_prev} = $self->{column};
607     $self->{column}++;
608     $self->{nc}
609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
610     } else {
611     $self->{set_nc}->($self);
612     }
613    
614     redo A;
615     } elsif ($self->{nc} == 0x003E) { # >
616    
617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
618     line => $self->{line_prev},
619     column => $self->{column_prev});
620     $self->{state} = DATA_STATE;
621 wakaba 1.5 $self->{s_kwd} = '';
622 wakaba 1.1
623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
624     $self->{line_prev} = $self->{line};
625     $self->{column_prev} = $self->{column};
626     $self->{column}++;
627     $self->{nc}
628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
629     } else {
630     $self->{set_nc}->($self);
631     }
632    
633    
634     return ({type => CHARACTER_TOKEN, data => '<>',
635     line => $self->{line_prev},
636     column => $self->{column_prev},
637     });
638    
639     redo A;
640     } elsif ($self->{nc} == 0x003F) { # ?
641 wakaba 1.8 if ($self->{is_xml}) {
642    
643     $self->{state} = PI_STATE;
644    
645     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
646     $self->{line_prev} = $self->{line};
647     $self->{column_prev} = $self->{column};
648     $self->{column}++;
649     $self->{nc}
650     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
651     } else {
652     $self->{set_nc}->($self);
653     }
654    
655     redo A;
656     } else {
657    
658     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
659     line => $self->{line_prev},
660     column => $self->{column_prev});
661     $self->{state} = BOGUS_COMMENT_STATE;
662     $self->{ct} = {type => COMMENT_TOKEN, data => '',
663     line => $self->{line_prev},
664     column => $self->{column_prev},
665     };
666     ## $self->{nc} is intentionally left as is
667     redo A;
668     }
669 wakaba 1.9 } elsif (not $self->{is_xml} or $is_space->{$self->{nc}}) {
670 wakaba 1.1
671     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
672     line => $self->{line_prev},
673     column => $self->{column_prev});
674     $self->{state} = DATA_STATE;
675 wakaba 1.5 $self->{s_kwd} = '';
676 wakaba 1.1 ## reconsume
677    
678     return ({type => CHARACTER_TOKEN, data => '<',
679     line => $self->{line_prev},
680     column => $self->{column_prev},
681     });
682    
683     redo A;
684 wakaba 1.9 } else {
685     ## XML5: "<:" is a parse error.
686    
687     $self->{ct} = {type => START_TAG_TOKEN,
688     tag_name => chr ($self->{nc}),
689     line => $self->{line_prev},
690     column => $self->{column_prev}};
691     $self->{state} = TAG_NAME_STATE;
692    
693     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
694     $self->{line_prev} = $self->{line};
695     $self->{column_prev} = $self->{column};
696     $self->{column}++;
697     $self->{nc}
698     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
699     } else {
700     $self->{set_nc}->($self);
701     }
702    
703     redo A;
704 wakaba 1.1 }
705     } else {
706     die "$0: $self->{content_model} in tag open";
707     }
708     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
709     ## NOTE: The "close tag open state" in the spec is implemented as
710     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
711    
712     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
713     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
714     if (defined $self->{last_stag_name}) {
715     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
716     $self->{s_kwd} = '';
717     ## Reconsume.
718     redo A;
719     } else {
720     ## No start tag token has ever been emitted
721     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
722    
723     $self->{state} = DATA_STATE;
724 wakaba 1.5 $self->{s_kwd} = '';
725 wakaba 1.1 ## Reconsume.
726     return ({type => CHARACTER_TOKEN, data => '</',
727     line => $l, column => $c,
728     });
729     redo A;
730     }
731     }
732    
733     if (0x0041 <= $self->{nc} and
734     $self->{nc} <= 0x005A) { # A..Z
735    
736     $self->{ct}
737     = {type => END_TAG_TOKEN,
738 wakaba 1.4 tag_name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
739 wakaba 1.1 line => $l, column => $c};
740     $self->{state} = TAG_NAME_STATE;
741    
742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
743     $self->{line_prev} = $self->{line};
744     $self->{column_prev} = $self->{column};
745     $self->{column}++;
746     $self->{nc}
747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
748     } else {
749     $self->{set_nc}->($self);
750     }
751    
752     redo A;
753     } elsif (0x0061 <= $self->{nc} and
754     $self->{nc} <= 0x007A) { # a..z
755    
756     $self->{ct} = {type => END_TAG_TOKEN,
757     tag_name => chr ($self->{nc}),
758     line => $l, column => $c};
759     $self->{state} = TAG_NAME_STATE;
760    
761     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
762     $self->{line_prev} = $self->{line};
763     $self->{column_prev} = $self->{column};
764     $self->{column}++;
765     $self->{nc}
766     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
767     } else {
768     $self->{set_nc}->($self);
769     }
770    
771     redo A;
772     } elsif ($self->{nc} == 0x003E) { # >
773    
774     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
775     line => $self->{line_prev}, ## "<" in "</>"
776     column => $self->{column_prev} - 1);
777     $self->{state} = DATA_STATE;
778 wakaba 1.5 $self->{s_kwd} = '';
779 wakaba 1.1
780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
781     $self->{line_prev} = $self->{line};
782     $self->{column_prev} = $self->{column};
783     $self->{column}++;
784     $self->{nc}
785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
786     } else {
787     $self->{set_nc}->($self);
788     }
789    
790     redo A;
791     } elsif ($self->{nc} == -1) {
792    
793     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
794 wakaba 1.5 $self->{s_kwd} = '';
795 wakaba 1.1 $self->{state} = DATA_STATE;
796     # reconsume
797    
798     return ({type => CHARACTER_TOKEN, data => '</',
799     line => $l, column => $c,
800     });
801    
802     redo A;
803     } else {
804    
805     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
806     $self->{state} = BOGUS_COMMENT_STATE;
807     $self->{ct} = {type => COMMENT_TOKEN, data => '',
808     line => $self->{line_prev}, # "<" of "</"
809     column => $self->{column_prev} - 1,
810     };
811     ## NOTE: $self->{nc} is intentionally left as is.
812     ## Although the "anything else" case of the spec not explicitly
813     ## states that the next input character is to be reconsumed,
814     ## it will be included to the |data| of the comment token
815     ## generated from the bogus end tag, as defined in the
816     ## "bogus comment state" entry.
817     redo A;
818     }
819     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
820     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
821     if (length $ch) {
822     my $CH = $ch;
823     $ch =~ tr/a-z/A-Z/;
824     my $nch = chr $self->{nc};
825     if ($nch eq $ch or $nch eq $CH) {
826    
827     ## Stay in the state.
828     $self->{s_kwd} .= $nch;
829    
830     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
831     $self->{line_prev} = $self->{line};
832     $self->{column_prev} = $self->{column};
833     $self->{column}++;
834     $self->{nc}
835     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
836     } else {
837     $self->{set_nc}->($self);
838     }
839    
840     redo A;
841     } else {
842    
843     $self->{state} = DATA_STATE;
844 wakaba 1.5 $self->{s_kwd} = '';
845 wakaba 1.1 ## Reconsume.
846     return ({type => CHARACTER_TOKEN,
847     data => '</' . $self->{s_kwd},
848     line => $self->{line_prev},
849     column => $self->{column_prev} - 1 - length $self->{s_kwd},
850     });
851     redo A;
852     }
853     } else { # after "<{tag-name}"
854     unless ($is_space->{$self->{nc}} or
855     {
856     0x003E => 1, # >
857     0x002F => 1, # /
858     -1 => 1, # EOF
859     }->{$self->{nc}}) {
860    
861     ## Reconsume.
862     $self->{state} = DATA_STATE;
863 wakaba 1.5 $self->{s_kwd} = '';
864 wakaba 1.1 return ({type => CHARACTER_TOKEN,
865     data => '</' . $self->{s_kwd},
866     line => $self->{line_prev},
867     column => $self->{column_prev} - 1 - length $self->{s_kwd},
868     });
869     redo A;
870     } else {
871    
872     $self->{ct}
873     = {type => END_TAG_TOKEN,
874     tag_name => $self->{last_stag_name},
875     line => $self->{line_prev},
876     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
877     $self->{state} = TAG_NAME_STATE;
878     ## Reconsume.
879     redo A;
880     }
881     }
882     } elsif ($self->{state} == TAG_NAME_STATE) {
883     if ($is_space->{$self->{nc}}) {
884    
885     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
886    
887     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
888     $self->{line_prev} = $self->{line};
889     $self->{column_prev} = $self->{column};
890     $self->{column}++;
891     $self->{nc}
892     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
893     } else {
894     $self->{set_nc}->($self);
895     }
896    
897     redo A;
898     } elsif ($self->{nc} == 0x003E) { # >
899     if ($self->{ct}->{type} == START_TAG_TOKEN) {
900    
901     $self->{last_stag_name} = $self->{ct}->{tag_name};
902     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
903     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
904     #if ($self->{ct}->{attributes}) {
905     # ## NOTE: This should never be reached.
906     # !!! cp (36);
907     # !!! parse-error (type => 'end tag attribute');
908     #} else {
909    
910     #}
911     } else {
912     die "$0: $self->{ct}->{type}: Unknown token type";
913     }
914     $self->{state} = DATA_STATE;
915 wakaba 1.5 $self->{s_kwd} = '';
916 wakaba 1.1
917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
918     $self->{line_prev} = $self->{line};
919     $self->{column_prev} = $self->{column};
920     $self->{column}++;
921     $self->{nc}
922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
923     } else {
924     $self->{set_nc}->($self);
925     }
926    
927    
928     return ($self->{ct}); # start tag or end tag
929    
930     redo A;
931     } elsif (0x0041 <= $self->{nc} and
932     $self->{nc} <= 0x005A) { # A..Z
933    
934 wakaba 1.4 $self->{ct}->{tag_name}
935     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
936 wakaba 1.1 # start tag or end tag
937     ## Stay in this state
938    
939     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
940     $self->{line_prev} = $self->{line};
941     $self->{column_prev} = $self->{column};
942     $self->{column}++;
943     $self->{nc}
944     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
945     } else {
946     $self->{set_nc}->($self);
947     }
948    
949     redo A;
950     } elsif ($self->{nc} == -1) {
951     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
952     if ($self->{ct}->{type} == START_TAG_TOKEN) {
953    
954     $self->{last_stag_name} = $self->{ct}->{tag_name};
955     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
956     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
957     #if ($self->{ct}->{attributes}) {
958     # ## NOTE: This state should never be reached.
959     # !!! cp (40);
960     # !!! parse-error (type => 'end tag attribute');
961     #} else {
962    
963     #}
964     } else {
965     die "$0: $self->{ct}->{type}: Unknown token type";
966     }
967     $self->{state} = DATA_STATE;
968 wakaba 1.5 $self->{s_kwd} = '';
969 wakaba 1.1 # reconsume
970    
971     return ($self->{ct}); # start tag or end tag
972    
973     redo A;
974     } elsif ($self->{nc} == 0x002F) { # /
975    
976     $self->{state} = SELF_CLOSING_START_TAG_STATE;
977    
978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
979     $self->{line_prev} = $self->{line};
980     $self->{column_prev} = $self->{column};
981     $self->{column}++;
982     $self->{nc}
983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
984     } else {
985     $self->{set_nc}->($self);
986     }
987    
988     redo A;
989     } else {
990    
991     $self->{ct}->{tag_name} .= chr $self->{nc};
992     # start tag or end tag
993     ## Stay in the state
994    
995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
996     $self->{line_prev} = $self->{line};
997     $self->{column_prev} = $self->{column};
998     $self->{column}++;
999     $self->{nc}
1000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1001     } else {
1002     $self->{set_nc}->($self);
1003     }
1004    
1005     redo A;
1006     }
1007     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
1008     if ($is_space->{$self->{nc}}) {
1009    
1010     ## Stay in the state
1011    
1012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1013     $self->{line_prev} = $self->{line};
1014     $self->{column_prev} = $self->{column};
1015     $self->{column}++;
1016     $self->{nc}
1017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1018     } else {
1019     $self->{set_nc}->($self);
1020     }
1021    
1022     redo A;
1023     } elsif ($self->{nc} == 0x003E) { # >
1024     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1025    
1026     $self->{last_stag_name} = $self->{ct}->{tag_name};
1027     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1028     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1029     if ($self->{ct}->{attributes}) {
1030    
1031     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1032     } else {
1033    
1034     }
1035     } else {
1036     die "$0: $self->{ct}->{type}: Unknown token type";
1037     }
1038     $self->{state} = DATA_STATE;
1039 wakaba 1.5 $self->{s_kwd} = '';
1040 wakaba 1.1
1041     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1042     $self->{line_prev} = $self->{line};
1043     $self->{column_prev} = $self->{column};
1044     $self->{column}++;
1045     $self->{nc}
1046     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1047     } else {
1048     $self->{set_nc}->($self);
1049     }
1050    
1051    
1052     return ($self->{ct}); # start tag or end tag
1053    
1054     redo A;
1055     } elsif (0x0041 <= $self->{nc} and
1056     $self->{nc} <= 0x005A) { # A..Z
1057    
1058     $self->{ca}
1059 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1060 wakaba 1.1 value => '',
1061     line => $self->{line}, column => $self->{column}};
1062     $self->{state} = ATTRIBUTE_NAME_STATE;
1063    
1064     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1065     $self->{line_prev} = $self->{line};
1066     $self->{column_prev} = $self->{column};
1067     $self->{column}++;
1068     $self->{nc}
1069     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1070     } else {
1071     $self->{set_nc}->($self);
1072     }
1073    
1074     redo A;
1075     } elsif ($self->{nc} == 0x002F) { # /
1076    
1077     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1078    
1079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1080     $self->{line_prev} = $self->{line};
1081     $self->{column_prev} = $self->{column};
1082     $self->{column}++;
1083     $self->{nc}
1084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1085     } else {
1086     $self->{set_nc}->($self);
1087     }
1088    
1089     redo A;
1090     } elsif ($self->{nc} == -1) {
1091     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1092     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1093    
1094     $self->{last_stag_name} = $self->{ct}->{tag_name};
1095     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1096     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1097     if ($self->{ct}->{attributes}) {
1098    
1099     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1100     } else {
1101    
1102     }
1103     } else {
1104     die "$0: $self->{ct}->{type}: Unknown token type";
1105     }
1106     $self->{state} = DATA_STATE;
1107 wakaba 1.5 $self->{s_kwd} = '';
1108 wakaba 1.1 # reconsume
1109    
1110     return ($self->{ct}); # start tag or end tag
1111    
1112     redo A;
1113     } else {
1114     if ({
1115     0x0022 => 1, # "
1116     0x0027 => 1, # '
1117     0x003D => 1, # =
1118     }->{$self->{nc}}) {
1119    
1120     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1121     } else {
1122    
1123     }
1124     $self->{ca}
1125     = {name => chr ($self->{nc}),
1126     value => '',
1127     line => $self->{line}, column => $self->{column}};
1128     $self->{state} = ATTRIBUTE_NAME_STATE;
1129    
1130     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1131     $self->{line_prev} = $self->{line};
1132     $self->{column_prev} = $self->{column};
1133     $self->{column}++;
1134     $self->{nc}
1135     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1136     } else {
1137     $self->{set_nc}->($self);
1138     }
1139    
1140     redo A;
1141     }
1142     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1143     my $before_leave = sub {
1144     if (exists $self->{ct}->{attributes} # start tag or end tag
1145     ->{$self->{ca}->{name}}) { # MUST
1146    
1147     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1148     ## Discard $self->{ca} # MUST
1149     } else {
1150    
1151     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1152     = $self->{ca};
1153     }
1154     }; # $before_leave
1155    
1156     if ($is_space->{$self->{nc}}) {
1157    
1158     $before_leave->();
1159     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1160    
1161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1162     $self->{line_prev} = $self->{line};
1163     $self->{column_prev} = $self->{column};
1164     $self->{column}++;
1165     $self->{nc}
1166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1167     } else {
1168     $self->{set_nc}->($self);
1169     }
1170    
1171     redo A;
1172     } elsif ($self->{nc} == 0x003D) { # =
1173    
1174     $before_leave->();
1175     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1176    
1177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1178     $self->{line_prev} = $self->{line};
1179     $self->{column_prev} = $self->{column};
1180     $self->{column}++;
1181     $self->{nc}
1182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1183     } else {
1184     $self->{set_nc}->($self);
1185     }
1186    
1187     redo A;
1188     } elsif ($self->{nc} == 0x003E) { # >
1189     $before_leave->();
1190     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1191    
1192     $self->{last_stag_name} = $self->{ct}->{tag_name};
1193     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1194    
1195     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1196     if ($self->{ct}->{attributes}) {
1197     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1198     }
1199     } else {
1200     die "$0: $self->{ct}->{type}: Unknown token type";
1201     }
1202     $self->{state} = DATA_STATE;
1203 wakaba 1.5 $self->{s_kwd} = '';
1204 wakaba 1.1
1205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1206     $self->{line_prev} = $self->{line};
1207     $self->{column_prev} = $self->{column};
1208     $self->{column}++;
1209     $self->{nc}
1210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1211     } else {
1212     $self->{set_nc}->($self);
1213     }
1214    
1215    
1216     return ($self->{ct}); # start tag or end tag
1217    
1218     redo A;
1219     } elsif (0x0041 <= $self->{nc} and
1220     $self->{nc} <= 0x005A) { # A..Z
1221    
1222 wakaba 1.4 $self->{ca}->{name}
1223     .= chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020));
1224 wakaba 1.1 ## Stay in the state
1225    
1226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1227     $self->{line_prev} = $self->{line};
1228     $self->{column_prev} = $self->{column};
1229     $self->{column}++;
1230     $self->{nc}
1231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1232     } else {
1233     $self->{set_nc}->($self);
1234     }
1235    
1236     redo A;
1237     } elsif ($self->{nc} == 0x002F) { # /
1238    
1239     $before_leave->();
1240     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1241    
1242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1243     $self->{line_prev} = $self->{line};
1244     $self->{column_prev} = $self->{column};
1245     $self->{column}++;
1246     $self->{nc}
1247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1248     } else {
1249     $self->{set_nc}->($self);
1250     }
1251    
1252     redo A;
1253     } elsif ($self->{nc} == -1) {
1254     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1255     $before_leave->();
1256     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1257    
1258     $self->{last_stag_name} = $self->{ct}->{tag_name};
1259     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1260     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1261     if ($self->{ct}->{attributes}) {
1262    
1263     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1264     } else {
1265     ## NOTE: This state should never be reached.
1266    
1267     }
1268     } else {
1269     die "$0: $self->{ct}->{type}: Unknown token type";
1270     }
1271     $self->{state} = DATA_STATE;
1272 wakaba 1.5 $self->{s_kwd} = '';
1273 wakaba 1.1 # reconsume
1274    
1275     return ($self->{ct}); # start tag or end tag
1276    
1277     redo A;
1278     } else {
1279     if ($self->{nc} == 0x0022 or # "
1280     $self->{nc} == 0x0027) { # '
1281    
1282     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1283     } else {
1284    
1285     }
1286     $self->{ca}->{name} .= chr ($self->{nc});
1287     ## Stay in the state
1288    
1289     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1290     $self->{line_prev} = $self->{line};
1291     $self->{column_prev} = $self->{column};
1292     $self->{column}++;
1293     $self->{nc}
1294     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1295     } else {
1296     $self->{set_nc}->($self);
1297     }
1298    
1299     redo A;
1300     }
1301     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1302     if ($is_space->{$self->{nc}}) {
1303    
1304     ## Stay in the state
1305    
1306     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1307     $self->{line_prev} = $self->{line};
1308     $self->{column_prev} = $self->{column};
1309     $self->{column}++;
1310     $self->{nc}
1311     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1312     } else {
1313     $self->{set_nc}->($self);
1314     }
1315    
1316     redo A;
1317     } elsif ($self->{nc} == 0x003D) { # =
1318    
1319     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1320    
1321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1322     $self->{line_prev} = $self->{line};
1323     $self->{column_prev} = $self->{column};
1324     $self->{column}++;
1325     $self->{nc}
1326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1327     } else {
1328     $self->{set_nc}->($self);
1329     }
1330    
1331     redo A;
1332     } elsif ($self->{nc} == 0x003E) { # >
1333     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1334    
1335     $self->{last_stag_name} = $self->{ct}->{tag_name};
1336     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1337     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1338     if ($self->{ct}->{attributes}) {
1339    
1340     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1341     } else {
1342     ## NOTE: This state should never be reached.
1343    
1344     }
1345     } else {
1346     die "$0: $self->{ct}->{type}: Unknown token type";
1347     }
1348     $self->{state} = DATA_STATE;
1349 wakaba 1.5 $self->{s_kwd} = '';
1350 wakaba 1.1
1351     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1352     $self->{line_prev} = $self->{line};
1353     $self->{column_prev} = $self->{column};
1354     $self->{column}++;
1355     $self->{nc}
1356     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1357     } else {
1358     $self->{set_nc}->($self);
1359     }
1360    
1361    
1362     return ($self->{ct}); # start tag or end tag
1363    
1364     redo A;
1365     } elsif (0x0041 <= $self->{nc} and
1366     $self->{nc} <= 0x005A) { # A..Z
1367    
1368     $self->{ca}
1369 wakaba 1.4 = {name => chr ($self->{nc} + ($self->{is_xml} ? 0 : 0x0020)),
1370 wakaba 1.1 value => '',
1371     line => $self->{line}, column => $self->{column}};
1372     $self->{state} = ATTRIBUTE_NAME_STATE;
1373    
1374     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1375     $self->{line_prev} = $self->{line};
1376     $self->{column_prev} = $self->{column};
1377     $self->{column}++;
1378     $self->{nc}
1379     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1380     } else {
1381     $self->{set_nc}->($self);
1382     }
1383    
1384     redo A;
1385     } elsif ($self->{nc} == 0x002F) { # /
1386    
1387     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1388    
1389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1390     $self->{line_prev} = $self->{line};
1391     $self->{column_prev} = $self->{column};
1392     $self->{column}++;
1393     $self->{nc}
1394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1395     } else {
1396     $self->{set_nc}->($self);
1397     }
1398    
1399     redo A;
1400     } elsif ($self->{nc} == -1) {
1401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1402     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1403    
1404     $self->{last_stag_name} = $self->{ct}->{tag_name};
1405     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1406     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1407     if ($self->{ct}->{attributes}) {
1408    
1409     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1410     } else {
1411     ## NOTE: This state should never be reached.
1412    
1413     }
1414     } else {
1415     die "$0: $self->{ct}->{type}: Unknown token type";
1416     }
1417 wakaba 1.5 $self->{s_kwd} = '';
1418 wakaba 1.1 $self->{state} = DATA_STATE;
1419     # reconsume
1420    
1421     return ($self->{ct}); # start tag or end tag
1422    
1423     redo A;
1424     } else {
1425     if ($self->{nc} == 0x0022 or # "
1426     $self->{nc} == 0x0027) { # '
1427    
1428     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1429     } else {
1430    
1431     }
1432     $self->{ca}
1433     = {name => chr ($self->{nc}),
1434     value => '',
1435     line => $self->{line}, column => $self->{column}};
1436     $self->{state} = ATTRIBUTE_NAME_STATE;
1437    
1438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1439     $self->{line_prev} = $self->{line};
1440     $self->{column_prev} = $self->{column};
1441     $self->{column}++;
1442     $self->{nc}
1443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1444     } else {
1445     $self->{set_nc}->($self);
1446     }
1447    
1448     redo A;
1449     }
1450     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1451     if ($is_space->{$self->{nc}}) {
1452    
1453     ## Stay in the state
1454    
1455     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1456     $self->{line_prev} = $self->{line};
1457     $self->{column_prev} = $self->{column};
1458     $self->{column}++;
1459     $self->{nc}
1460     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1461     } else {
1462     $self->{set_nc}->($self);
1463     }
1464    
1465     redo A;
1466     } elsif ($self->{nc} == 0x0022) { # "
1467    
1468     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1469    
1470     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1471     $self->{line_prev} = $self->{line};
1472     $self->{column_prev} = $self->{column};
1473     $self->{column}++;
1474     $self->{nc}
1475     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1476     } else {
1477     $self->{set_nc}->($self);
1478     }
1479    
1480     redo A;
1481     } elsif ($self->{nc} == 0x0026) { # &
1482    
1483     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1484     ## reconsume
1485     redo A;
1486     } elsif ($self->{nc} == 0x0027) { # '
1487    
1488     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1489    
1490     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1491     $self->{line_prev} = $self->{line};
1492     $self->{column_prev} = $self->{column};
1493     $self->{column}++;
1494     $self->{nc}
1495     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1496     } else {
1497     $self->{set_nc}->($self);
1498     }
1499    
1500     redo A;
1501     } elsif ($self->{nc} == 0x003E) { # >
1502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1503     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1504    
1505     $self->{last_stag_name} = $self->{ct}->{tag_name};
1506     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1507     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1508     if ($self->{ct}->{attributes}) {
1509    
1510     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1511     } else {
1512     ## NOTE: This state should never be reached.
1513    
1514     }
1515     } else {
1516     die "$0: $self->{ct}->{type}: Unknown token type";
1517     }
1518     $self->{state} = DATA_STATE;
1519 wakaba 1.5 $self->{s_kwd} = '';
1520 wakaba 1.1
1521     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1522     $self->{line_prev} = $self->{line};
1523     $self->{column_prev} = $self->{column};
1524     $self->{column}++;
1525     $self->{nc}
1526     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1527     } else {
1528     $self->{set_nc}->($self);
1529     }
1530    
1531    
1532     return ($self->{ct}); # start tag or end tag
1533    
1534     redo A;
1535     } elsif ($self->{nc} == -1) {
1536     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1537     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1538    
1539     $self->{last_stag_name} = $self->{ct}->{tag_name};
1540     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1541     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1542     if ($self->{ct}->{attributes}) {
1543    
1544     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1545     } else {
1546     ## NOTE: This state should never be reached.
1547    
1548     }
1549     } else {
1550     die "$0: $self->{ct}->{type}: Unknown token type";
1551     }
1552     $self->{state} = DATA_STATE;
1553 wakaba 1.5 $self->{s_kwd} = '';
1554 wakaba 1.1 ## reconsume
1555    
1556     return ($self->{ct}); # start tag or end tag
1557    
1558     redo A;
1559     } else {
1560     if ($self->{nc} == 0x003D) { # =
1561    
1562     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1563     } else {
1564    
1565     }
1566     $self->{ca}->{value} .= chr ($self->{nc});
1567     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1568    
1569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1570     $self->{line_prev} = $self->{line};
1571     $self->{column_prev} = $self->{column};
1572     $self->{column}++;
1573     $self->{nc}
1574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1575     } else {
1576     $self->{set_nc}->($self);
1577     }
1578    
1579     redo A;
1580     }
1581     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1582     if ($self->{nc} == 0x0022) { # "
1583    
1584     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1585    
1586     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1587     $self->{line_prev} = $self->{line};
1588     $self->{column_prev} = $self->{column};
1589     $self->{column}++;
1590     $self->{nc}
1591     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1592     } else {
1593     $self->{set_nc}->($self);
1594     }
1595    
1596     redo A;
1597     } elsif ($self->{nc} == 0x0026) { # &
1598    
1599     ## NOTE: In the spec, the tokenizer is switched to the
1600     ## "entity in attribute value state". In this implementation, the
1601     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1602     ## implementation of the "consume a character reference" algorithm.
1603     $self->{prev_state} = $self->{state};
1604     $self->{entity_add} = 0x0022; # "
1605     $self->{state} = ENTITY_STATE;
1606    
1607     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1608     $self->{line_prev} = $self->{line};
1609     $self->{column_prev} = $self->{column};
1610     $self->{column}++;
1611     $self->{nc}
1612     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1613     } else {
1614     $self->{set_nc}->($self);
1615     }
1616    
1617     redo A;
1618     } elsif ($self->{nc} == -1) {
1619     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1620     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1621    
1622     $self->{last_stag_name} = $self->{ct}->{tag_name};
1623     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1624     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1625     if ($self->{ct}->{attributes}) {
1626    
1627     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1628     } else {
1629     ## NOTE: This state should never be reached.
1630    
1631     }
1632     } else {
1633     die "$0: $self->{ct}->{type}: Unknown token type";
1634     }
1635     $self->{state} = DATA_STATE;
1636 wakaba 1.5 $self->{s_kwd} = '';
1637 wakaba 1.1 ## reconsume
1638    
1639     return ($self->{ct}); # start tag or end tag
1640    
1641     redo A;
1642     } else {
1643    
1644     $self->{ca}->{value} .= chr ($self->{nc});
1645     $self->{read_until}->($self->{ca}->{value},
1646     q["&],
1647     length $self->{ca}->{value});
1648    
1649     ## Stay in the state
1650    
1651     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1652     $self->{line_prev} = $self->{line};
1653     $self->{column_prev} = $self->{column};
1654     $self->{column}++;
1655     $self->{nc}
1656     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1657     } else {
1658     $self->{set_nc}->($self);
1659     }
1660    
1661     redo A;
1662     }
1663     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1664     if ($self->{nc} == 0x0027) { # '
1665    
1666     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1667    
1668     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1669     $self->{line_prev} = $self->{line};
1670     $self->{column_prev} = $self->{column};
1671     $self->{column}++;
1672     $self->{nc}
1673     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1674     } else {
1675     $self->{set_nc}->($self);
1676     }
1677    
1678     redo A;
1679     } elsif ($self->{nc} == 0x0026) { # &
1680    
1681     ## NOTE: In the spec, the tokenizer is switched to the
1682     ## "entity in attribute value state". In this implementation, the
1683     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1684     ## implementation of the "consume a character reference" algorithm.
1685     $self->{entity_add} = 0x0027; # '
1686     $self->{prev_state} = $self->{state};
1687     $self->{state} = ENTITY_STATE;
1688    
1689     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1690     $self->{line_prev} = $self->{line};
1691     $self->{column_prev} = $self->{column};
1692     $self->{column}++;
1693     $self->{nc}
1694     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1695     } else {
1696     $self->{set_nc}->($self);
1697     }
1698    
1699     redo A;
1700     } elsif ($self->{nc} == -1) {
1701     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1702     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1703    
1704     $self->{last_stag_name} = $self->{ct}->{tag_name};
1705     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1706     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1707     if ($self->{ct}->{attributes}) {
1708    
1709     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1710     } else {
1711     ## NOTE: This state should never be reached.
1712    
1713     }
1714     } else {
1715     die "$0: $self->{ct}->{type}: Unknown token type";
1716     }
1717     $self->{state} = DATA_STATE;
1718 wakaba 1.5 $self->{s_kwd} = '';
1719 wakaba 1.1 ## reconsume
1720    
1721     return ($self->{ct}); # start tag or end tag
1722    
1723     redo A;
1724     } else {
1725    
1726     $self->{ca}->{value} .= chr ($self->{nc});
1727     $self->{read_until}->($self->{ca}->{value},
1728     q['&],
1729     length $self->{ca}->{value});
1730    
1731     ## Stay in the state
1732    
1733     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1734     $self->{line_prev} = $self->{line};
1735     $self->{column_prev} = $self->{column};
1736     $self->{column}++;
1737     $self->{nc}
1738     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1739     } else {
1740     $self->{set_nc}->($self);
1741     }
1742    
1743     redo A;
1744     }
1745     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1746     if ($is_space->{$self->{nc}}) {
1747    
1748     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1749    
1750     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1751     $self->{line_prev} = $self->{line};
1752     $self->{column_prev} = $self->{column};
1753     $self->{column}++;
1754     $self->{nc}
1755     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1756     } else {
1757     $self->{set_nc}->($self);
1758     }
1759    
1760     redo A;
1761     } elsif ($self->{nc} == 0x0026) { # &
1762    
1763     ## NOTE: In the spec, the tokenizer is switched to the
1764     ## "entity in attribute value state". In this implementation, the
1765     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1766     ## implementation of the "consume a character reference" algorithm.
1767     $self->{entity_add} = -1;
1768     $self->{prev_state} = $self->{state};
1769     $self->{state} = ENTITY_STATE;
1770    
1771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1772     $self->{line_prev} = $self->{line};
1773     $self->{column_prev} = $self->{column};
1774     $self->{column}++;
1775     $self->{nc}
1776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1777     } else {
1778     $self->{set_nc}->($self);
1779     }
1780    
1781     redo A;
1782     } elsif ($self->{nc} == 0x003E) { # >
1783     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1784    
1785     $self->{last_stag_name} = $self->{ct}->{tag_name};
1786     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1787     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1788     if ($self->{ct}->{attributes}) {
1789    
1790     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1791     } else {
1792     ## NOTE: This state should never be reached.
1793    
1794     }
1795     } else {
1796     die "$0: $self->{ct}->{type}: Unknown token type";
1797     }
1798     $self->{state} = DATA_STATE;
1799 wakaba 1.5 $self->{s_kwd} = '';
1800 wakaba 1.1
1801     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1802     $self->{line_prev} = $self->{line};
1803     $self->{column_prev} = $self->{column};
1804     $self->{column}++;
1805     $self->{nc}
1806     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1807     } else {
1808     $self->{set_nc}->($self);
1809     }
1810    
1811    
1812     return ($self->{ct}); # start tag or end tag
1813    
1814     redo A;
1815     } elsif ($self->{nc} == -1) {
1816     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1817     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1818    
1819     $self->{last_stag_name} = $self->{ct}->{tag_name};
1820     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1821     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1822     if ($self->{ct}->{attributes}) {
1823    
1824     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1825     } else {
1826     ## NOTE: This state should never be reached.
1827    
1828     }
1829     } else {
1830     die "$0: $self->{ct}->{type}: Unknown token type";
1831     }
1832     $self->{state} = DATA_STATE;
1833 wakaba 1.5 $self->{s_kwd} = '';
1834 wakaba 1.1 ## reconsume
1835    
1836     return ($self->{ct}); # start tag or end tag
1837    
1838     redo A;
1839     } else {
1840     if ({
1841     0x0022 => 1, # "
1842     0x0027 => 1, # '
1843     0x003D => 1, # =
1844     }->{$self->{nc}}) {
1845    
1846     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1847     } else {
1848    
1849     }
1850     $self->{ca}->{value} .= chr ($self->{nc});
1851     $self->{read_until}->($self->{ca}->{value},
1852     q["'=& >],
1853     length $self->{ca}->{value});
1854    
1855     ## Stay in the state
1856    
1857     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1858     $self->{line_prev} = $self->{line};
1859     $self->{column_prev} = $self->{column};
1860     $self->{column}++;
1861     $self->{nc}
1862     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1863     } else {
1864     $self->{set_nc}->($self);
1865     }
1866    
1867     redo A;
1868     }
1869     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1870     if ($is_space->{$self->{nc}}) {
1871    
1872     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1873    
1874     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1875     $self->{line_prev} = $self->{line};
1876     $self->{column_prev} = $self->{column};
1877     $self->{column}++;
1878     $self->{nc}
1879     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1880     } else {
1881     $self->{set_nc}->($self);
1882     }
1883    
1884     redo A;
1885     } elsif ($self->{nc} == 0x003E) { # >
1886     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1887    
1888     $self->{last_stag_name} = $self->{ct}->{tag_name};
1889     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1890     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1891     if ($self->{ct}->{attributes}) {
1892    
1893     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1894     } else {
1895     ## NOTE: This state should never be reached.
1896    
1897     }
1898     } else {
1899     die "$0: $self->{ct}->{type}: Unknown token type";
1900     }
1901     $self->{state} = DATA_STATE;
1902 wakaba 1.5 $self->{s_kwd} = '';
1903 wakaba 1.1
1904     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1905     $self->{line_prev} = $self->{line};
1906     $self->{column_prev} = $self->{column};
1907     $self->{column}++;
1908     $self->{nc}
1909     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1910     } else {
1911     $self->{set_nc}->($self);
1912     }
1913    
1914    
1915     return ($self->{ct}); # start tag or end tag
1916    
1917     redo A;
1918     } elsif ($self->{nc} == 0x002F) { # /
1919    
1920     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1921    
1922     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1923     $self->{line_prev} = $self->{line};
1924     $self->{column_prev} = $self->{column};
1925     $self->{column}++;
1926     $self->{nc}
1927     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1928     } else {
1929     $self->{set_nc}->($self);
1930     }
1931    
1932     redo A;
1933     } elsif ($self->{nc} == -1) {
1934     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1935     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1936    
1937     $self->{last_stag_name} = $self->{ct}->{tag_name};
1938     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1939     if ($self->{ct}->{attributes}) {
1940    
1941     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1942     } else {
1943     ## NOTE: This state should never be reached.
1944    
1945     }
1946     } else {
1947     die "$0: $self->{ct}->{type}: Unknown token type";
1948     }
1949     $self->{state} = DATA_STATE;
1950 wakaba 1.5 $self->{s_kwd} = '';
1951 wakaba 1.1 ## Reconsume.
1952     return ($self->{ct}); # start tag or end tag
1953     redo A;
1954     } else {
1955    
1956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1957     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1958     ## reconsume
1959     redo A;
1960     }
1961     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1962     if ($self->{nc} == 0x003E) { # >
1963     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1964    
1965     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1966     ## TODO: Different type than slash in start tag
1967     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1968     if ($self->{ct}->{attributes}) {
1969    
1970     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1971     } else {
1972    
1973     }
1974     ## TODO: Test |<title></title/>|
1975     } else {
1976    
1977     $self->{self_closing} = 1;
1978     }
1979    
1980     $self->{state} = DATA_STATE;
1981 wakaba 1.5 $self->{s_kwd} = '';
1982 wakaba 1.1
1983     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1984     $self->{line_prev} = $self->{line};
1985     $self->{column_prev} = $self->{column};
1986     $self->{column}++;
1987     $self->{nc}
1988     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1989     } else {
1990     $self->{set_nc}->($self);
1991     }
1992    
1993    
1994     return ($self->{ct}); # start tag or end tag
1995    
1996     redo A;
1997     } elsif ($self->{nc} == -1) {
1998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1999     if ($self->{ct}->{type} == START_TAG_TOKEN) {
2000    
2001     $self->{last_stag_name} = $self->{ct}->{tag_name};
2002     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
2003     if ($self->{ct}->{attributes}) {
2004    
2005     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
2006     } else {
2007     ## NOTE: This state should never be reached.
2008    
2009     }
2010     } else {
2011     die "$0: $self->{ct}->{type}: Unknown token type";
2012     }
2013     $self->{state} = DATA_STATE;
2014 wakaba 1.5 $self->{s_kwd} = '';
2015 wakaba 1.1 ## Reconsume.
2016     return ($self->{ct}); # start tag or end tag
2017     redo A;
2018     } else {
2019    
2020     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
2021     ## TODO: This error type is wrong.
2022     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
2023     ## Reconsume.
2024     redo A;
2025     }
2026     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
2027     ## (only happen if PCDATA state)
2028    
2029     ## NOTE: Unlike spec's "bogus comment state", this implementation
2030     ## consumes characters one-by-one basis.
2031    
2032     if ($self->{nc} == 0x003E) { # >
2033    
2034     $self->{state} = DATA_STATE;
2035 wakaba 1.5 $self->{s_kwd} = '';
2036 wakaba 1.1
2037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2038     $self->{line_prev} = $self->{line};
2039     $self->{column_prev} = $self->{column};
2040     $self->{column}++;
2041     $self->{nc}
2042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2043     } else {
2044     $self->{set_nc}->($self);
2045     }
2046    
2047    
2048     return ($self->{ct}); # comment
2049     redo A;
2050     } elsif ($self->{nc} == -1) {
2051    
2052     $self->{state} = DATA_STATE;
2053 wakaba 1.5 $self->{s_kwd} = '';
2054 wakaba 1.1 ## reconsume
2055    
2056     return ($self->{ct}); # comment
2057     redo A;
2058     } else {
2059    
2060     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2061     $self->{read_until}->($self->{ct}->{data},
2062     q[>],
2063     length $self->{ct}->{data});
2064    
2065     ## Stay in the state.
2066    
2067     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2068     $self->{line_prev} = $self->{line};
2069     $self->{column_prev} = $self->{column};
2070     $self->{column}++;
2071     $self->{nc}
2072     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2073     } else {
2074     $self->{set_nc}->($self);
2075     }
2076    
2077     redo A;
2078     }
2079     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
2080     ## (only happen if PCDATA state)
2081    
2082     if ($self->{nc} == 0x002D) { # -
2083    
2084     $self->{state} = MD_HYPHEN_STATE;
2085    
2086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2087     $self->{line_prev} = $self->{line};
2088     $self->{column_prev} = $self->{column};
2089     $self->{column}++;
2090     $self->{nc}
2091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2092     } else {
2093     $self->{set_nc}->($self);
2094     }
2095    
2096     redo A;
2097     } elsif ($self->{nc} == 0x0044 or # D
2098     $self->{nc} == 0x0064) { # d
2099     ## ASCII case-insensitive.
2100    
2101     $self->{state} = MD_DOCTYPE_STATE;
2102     $self->{s_kwd} = chr $self->{nc};
2103    
2104     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2105     $self->{line_prev} = $self->{line};
2106     $self->{column_prev} = $self->{column};
2107     $self->{column}++;
2108     $self->{nc}
2109     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2110     } else {
2111     $self->{set_nc}->($self);
2112     }
2113    
2114     redo A;
2115 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2116     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2117     $self->{is_xml}) and
2118 wakaba 1.1 $self->{nc} == 0x005B) { # [
2119    
2120     $self->{state} = MD_CDATA_STATE;
2121     $self->{s_kwd} = '[';
2122    
2123     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2124     $self->{line_prev} = $self->{line};
2125     $self->{column_prev} = $self->{column};
2126     $self->{column}++;
2127     $self->{nc}
2128     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2129     } else {
2130     $self->{set_nc}->($self);
2131     }
2132    
2133     redo A;
2134     } else {
2135    
2136     }
2137    
2138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2139     line => $self->{line_prev},
2140     column => $self->{column_prev} - 1);
2141     ## Reconsume.
2142     $self->{state} = BOGUS_COMMENT_STATE;
2143     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2144     line => $self->{line_prev},
2145     column => $self->{column_prev} - 1,
2146     };
2147     redo A;
2148     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2149     if ($self->{nc} == 0x002D) { # -
2150    
2151     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2152     line => $self->{line_prev},
2153     column => $self->{column_prev} - 2,
2154     };
2155     $self->{state} = COMMENT_START_STATE;
2156    
2157     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2158     $self->{line_prev} = $self->{line};
2159     $self->{column_prev} = $self->{column};
2160     $self->{column}++;
2161     $self->{nc}
2162     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2163     } else {
2164     $self->{set_nc}->($self);
2165     }
2166    
2167     redo A;
2168     } else {
2169    
2170     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2171     line => $self->{line_prev},
2172     column => $self->{column_prev} - 2);
2173     $self->{state} = BOGUS_COMMENT_STATE;
2174     ## Reconsume.
2175     $self->{ct} = {type => COMMENT_TOKEN,
2176     data => '-',
2177     line => $self->{line_prev},
2178     column => $self->{column_prev} - 2,
2179     };
2180     redo A;
2181     }
2182     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2183     ## ASCII case-insensitive.
2184     if ($self->{nc} == [
2185     undef,
2186     0x004F, # O
2187     0x0043, # C
2188     0x0054, # T
2189     0x0059, # Y
2190     0x0050, # P
2191     ]->[length $self->{s_kwd}] or
2192     $self->{nc} == [
2193     undef,
2194     0x006F, # o
2195     0x0063, # c
2196     0x0074, # t
2197     0x0079, # y
2198     0x0070, # p
2199     ]->[length $self->{s_kwd}]) {
2200    
2201     ## Stay in the state.
2202     $self->{s_kwd} .= chr $self->{nc};
2203    
2204     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2205     $self->{line_prev} = $self->{line};
2206     $self->{column_prev} = $self->{column};
2207     $self->{column}++;
2208     $self->{nc}
2209     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2210     } else {
2211     $self->{set_nc}->($self);
2212     }
2213    
2214     redo A;
2215     } elsif ((length $self->{s_kwd}) == 6 and
2216     ($self->{nc} == 0x0045 or # E
2217     $self->{nc} == 0x0065)) { # e
2218    
2219     $self->{state} = DOCTYPE_STATE;
2220     $self->{ct} = {type => DOCTYPE_TOKEN,
2221     quirks => 1,
2222     line => $self->{line_prev},
2223     column => $self->{column_prev} - 7,
2224     };
2225    
2226     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2227     $self->{line_prev} = $self->{line};
2228     $self->{column_prev} = $self->{column};
2229     $self->{column}++;
2230     $self->{nc}
2231     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2232     } else {
2233     $self->{set_nc}->($self);
2234     }
2235    
2236     redo A;
2237     } else {
2238    
2239     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2240     line => $self->{line_prev},
2241     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2242     $self->{state} = BOGUS_COMMENT_STATE;
2243     ## Reconsume.
2244     $self->{ct} = {type => COMMENT_TOKEN,
2245     data => $self->{s_kwd},
2246     line => $self->{line_prev},
2247     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2248     };
2249     redo A;
2250     }
2251     } elsif ($self->{state} == MD_CDATA_STATE) {
2252     if ($self->{nc} == {
2253     '[' => 0x0043, # C
2254     '[C' => 0x0044, # D
2255     '[CD' => 0x0041, # A
2256     '[CDA' => 0x0054, # T
2257     '[CDAT' => 0x0041, # A
2258     }->{$self->{s_kwd}}) {
2259    
2260     ## Stay in the state.
2261     $self->{s_kwd} .= chr $self->{nc};
2262    
2263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2264     $self->{line_prev} = $self->{line};
2265     $self->{column_prev} = $self->{column};
2266     $self->{column}++;
2267     $self->{nc}
2268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2269     } else {
2270     $self->{set_nc}->($self);
2271     }
2272    
2273     redo A;
2274     } elsif ($self->{s_kwd} eq '[CDATA' and
2275     $self->{nc} == 0x005B) { # [
2276 wakaba 1.6 if ($self->{is_xml} and
2277     not $self->{tainted} and
2278     @{$self->{open_elements} or []} == 0) {
2279 wakaba 1.8
2280 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element',
2281     line => $self->{line_prev},
2282     column => $self->{column_prev} - 7);
2283     $self->{tainted} = 1;
2284 wakaba 1.8 } else {
2285    
2286 wakaba 1.6 }
2287    
2288 wakaba 1.1 $self->{ct} = {type => CHARACTER_TOKEN,
2289     data => '',
2290     line => $self->{line_prev},
2291     column => $self->{column_prev} - 7};
2292     $self->{state} = CDATA_SECTION_STATE;
2293    
2294     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2295     $self->{line_prev} = $self->{line};
2296     $self->{column_prev} = $self->{column};
2297     $self->{column}++;
2298     $self->{nc}
2299     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2300     } else {
2301     $self->{set_nc}->($self);
2302     }
2303    
2304     redo A;
2305     } else {
2306    
2307     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2308     line => $self->{line_prev},
2309     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2310     $self->{state} = BOGUS_COMMENT_STATE;
2311     ## Reconsume.
2312     $self->{ct} = {type => COMMENT_TOKEN,
2313     data => $self->{s_kwd},
2314     line => $self->{line_prev},
2315     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2316     };
2317     redo A;
2318     }
2319     } elsif ($self->{state} == COMMENT_START_STATE) {
2320     if ($self->{nc} == 0x002D) { # -
2321    
2322     $self->{state} = COMMENT_START_DASH_STATE;
2323    
2324     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2325     $self->{line_prev} = $self->{line};
2326     $self->{column_prev} = $self->{column};
2327     $self->{column}++;
2328     $self->{nc}
2329     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2330     } else {
2331     $self->{set_nc}->($self);
2332     }
2333    
2334     redo A;
2335     } elsif ($self->{nc} == 0x003E) { # >
2336    
2337     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2338     $self->{state} = DATA_STATE;
2339 wakaba 1.5 $self->{s_kwd} = '';
2340 wakaba 1.1
2341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2342     $self->{line_prev} = $self->{line};
2343     $self->{column_prev} = $self->{column};
2344     $self->{column}++;
2345     $self->{nc}
2346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2347     } else {
2348     $self->{set_nc}->($self);
2349     }
2350    
2351    
2352     return ($self->{ct}); # comment
2353    
2354     redo A;
2355     } elsif ($self->{nc} == -1) {
2356    
2357     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2358     $self->{state} = DATA_STATE;
2359 wakaba 1.5 $self->{s_kwd} = '';
2360 wakaba 1.1 ## reconsume
2361    
2362     return ($self->{ct}); # comment
2363    
2364     redo A;
2365     } else {
2366    
2367     $self->{ct}->{data} # comment
2368     .= chr ($self->{nc});
2369     $self->{state} = COMMENT_STATE;
2370    
2371     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2372     $self->{line_prev} = $self->{line};
2373     $self->{column_prev} = $self->{column};
2374     $self->{column}++;
2375     $self->{nc}
2376     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2377     } else {
2378     $self->{set_nc}->($self);
2379     }
2380    
2381     redo A;
2382     }
2383     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2384     if ($self->{nc} == 0x002D) { # -
2385    
2386     $self->{state} = COMMENT_END_STATE;
2387    
2388     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2389     $self->{line_prev} = $self->{line};
2390     $self->{column_prev} = $self->{column};
2391     $self->{column}++;
2392     $self->{nc}
2393     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2394     } else {
2395     $self->{set_nc}->($self);
2396     }
2397    
2398     redo A;
2399     } elsif ($self->{nc} == 0x003E) { # >
2400    
2401     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2402     $self->{state} = DATA_STATE;
2403 wakaba 1.5 $self->{s_kwd} = '';
2404 wakaba 1.1
2405     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2406     $self->{line_prev} = $self->{line};
2407     $self->{column_prev} = $self->{column};
2408     $self->{column}++;
2409     $self->{nc}
2410     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2411     } else {
2412     $self->{set_nc}->($self);
2413     }
2414    
2415    
2416     return ($self->{ct}); # comment
2417    
2418     redo A;
2419     } elsif ($self->{nc} == -1) {
2420    
2421     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2422     $self->{state} = DATA_STATE;
2423 wakaba 1.5 $self->{s_kwd} = '';
2424 wakaba 1.1 ## reconsume
2425    
2426     return ($self->{ct}); # comment
2427    
2428     redo A;
2429     } else {
2430    
2431     $self->{ct}->{data} # comment
2432     .= '-' . chr ($self->{nc});
2433     $self->{state} = COMMENT_STATE;
2434    
2435     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2436     $self->{line_prev} = $self->{line};
2437     $self->{column_prev} = $self->{column};
2438     $self->{column}++;
2439     $self->{nc}
2440     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2441     } else {
2442     $self->{set_nc}->($self);
2443     }
2444    
2445     redo A;
2446     }
2447     } elsif ($self->{state} == COMMENT_STATE) {
2448     if ($self->{nc} == 0x002D) { # -
2449    
2450     $self->{state} = COMMENT_END_DASH_STATE;
2451    
2452     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2453     $self->{line_prev} = $self->{line};
2454     $self->{column_prev} = $self->{column};
2455     $self->{column}++;
2456     $self->{nc}
2457     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2458     } else {
2459     $self->{set_nc}->($self);
2460     }
2461    
2462     redo A;
2463     } elsif ($self->{nc} == -1) {
2464    
2465     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2466     $self->{state} = DATA_STATE;
2467 wakaba 1.5 $self->{s_kwd} = '';
2468 wakaba 1.1 ## reconsume
2469    
2470     return ($self->{ct}); # comment
2471    
2472     redo A;
2473     } else {
2474    
2475     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2476     $self->{read_until}->($self->{ct}->{data},
2477     q[-],
2478     length $self->{ct}->{data});
2479    
2480     ## Stay in the state
2481    
2482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2483     $self->{line_prev} = $self->{line};
2484     $self->{column_prev} = $self->{column};
2485     $self->{column}++;
2486     $self->{nc}
2487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2488     } else {
2489     $self->{set_nc}->($self);
2490     }
2491    
2492     redo A;
2493     }
2494     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2495     if ($self->{nc} == 0x002D) { # -
2496    
2497     $self->{state} = COMMENT_END_STATE;
2498    
2499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2500     $self->{line_prev} = $self->{line};
2501     $self->{column_prev} = $self->{column};
2502     $self->{column}++;
2503     $self->{nc}
2504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2505     } else {
2506     $self->{set_nc}->($self);
2507     }
2508    
2509     redo A;
2510     } elsif ($self->{nc} == -1) {
2511    
2512     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2513 wakaba 1.5 $self->{s_kwd} = '';
2514 wakaba 1.1 $self->{state} = DATA_STATE;
2515 wakaba 1.5 $self->{s_kwd} = '';
2516 wakaba 1.1 ## reconsume
2517    
2518     return ($self->{ct}); # comment
2519    
2520     redo A;
2521     } else {
2522    
2523     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2524     $self->{state} = COMMENT_STATE;
2525    
2526     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2527     $self->{line_prev} = $self->{line};
2528     $self->{column_prev} = $self->{column};
2529     $self->{column}++;
2530     $self->{nc}
2531     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2532     } else {
2533     $self->{set_nc}->($self);
2534     }
2535    
2536     redo A;
2537     }
2538     } elsif ($self->{state} == COMMENT_END_STATE) {
2539     if ($self->{nc} == 0x003E) { # >
2540    
2541     $self->{state} = DATA_STATE;
2542 wakaba 1.5 $self->{s_kwd} = '';
2543 wakaba 1.1
2544     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2545     $self->{line_prev} = $self->{line};
2546     $self->{column_prev} = $self->{column};
2547     $self->{column}++;
2548     $self->{nc}
2549     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2550     } else {
2551     $self->{set_nc}->($self);
2552     }
2553    
2554    
2555     return ($self->{ct}); # comment
2556    
2557     redo A;
2558     } elsif ($self->{nc} == 0x002D) { # -
2559    
2560     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2561     line => $self->{line_prev},
2562     column => $self->{column_prev});
2563     $self->{ct}->{data} .= '-'; # comment
2564     ## Stay in the state
2565    
2566     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2567     $self->{line_prev} = $self->{line};
2568     $self->{column_prev} = $self->{column};
2569     $self->{column}++;
2570     $self->{nc}
2571     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2572     } else {
2573     $self->{set_nc}->($self);
2574     }
2575    
2576     redo A;
2577     } elsif ($self->{nc} == -1) {
2578    
2579     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2580     $self->{state} = DATA_STATE;
2581 wakaba 1.5 $self->{s_kwd} = '';
2582 wakaba 1.1 ## reconsume
2583    
2584     return ($self->{ct}); # comment
2585    
2586     redo A;
2587     } else {
2588    
2589     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2590     line => $self->{line_prev},
2591     column => $self->{column_prev});
2592     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2593     $self->{state} = COMMENT_STATE;
2594    
2595     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2596     $self->{line_prev} = $self->{line};
2597     $self->{column_prev} = $self->{column};
2598     $self->{column}++;
2599     $self->{nc}
2600     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2601     } else {
2602     $self->{set_nc}->($self);
2603     }
2604    
2605     redo A;
2606     }
2607     } elsif ($self->{state} == DOCTYPE_STATE) {
2608     if ($is_space->{$self->{nc}}) {
2609    
2610     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2611    
2612     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2613     $self->{line_prev} = $self->{line};
2614     $self->{column_prev} = $self->{column};
2615     $self->{column}++;
2616     $self->{nc}
2617     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2618     } else {
2619     $self->{set_nc}->($self);
2620     }
2621    
2622     redo A;
2623     } else {
2624    
2625     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2626     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2627     ## reconsume
2628     redo A;
2629     }
2630     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2631     if ($is_space->{$self->{nc}}) {
2632    
2633     ## Stay in the state
2634    
2635     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2636     $self->{line_prev} = $self->{line};
2637     $self->{column_prev} = $self->{column};
2638     $self->{column}++;
2639     $self->{nc}
2640     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2641     } else {
2642     $self->{set_nc}->($self);
2643     }
2644    
2645     redo A;
2646     } elsif ($self->{nc} == 0x003E) { # >
2647    
2648     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2649     $self->{state} = DATA_STATE;
2650 wakaba 1.5 $self->{s_kwd} = '';
2651 wakaba 1.1
2652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2653     $self->{line_prev} = $self->{line};
2654     $self->{column_prev} = $self->{column};
2655     $self->{column}++;
2656     $self->{nc}
2657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2658     } else {
2659     $self->{set_nc}->($self);
2660     }
2661    
2662    
2663     return ($self->{ct}); # DOCTYPE (quirks)
2664    
2665     redo A;
2666     } elsif ($self->{nc} == -1) {
2667    
2668     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2669     $self->{state} = DATA_STATE;
2670 wakaba 1.5 $self->{s_kwd} = '';
2671 wakaba 1.1 ## reconsume
2672    
2673     return ($self->{ct}); # DOCTYPE (quirks)
2674    
2675     redo A;
2676     } else {
2677    
2678     $self->{ct}->{name} = chr $self->{nc};
2679     delete $self->{ct}->{quirks};
2680     $self->{state} = DOCTYPE_NAME_STATE;
2681    
2682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2683     $self->{line_prev} = $self->{line};
2684     $self->{column_prev} = $self->{column};
2685     $self->{column}++;
2686     $self->{nc}
2687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2688     } else {
2689     $self->{set_nc}->($self);
2690     }
2691    
2692     redo A;
2693     }
2694     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2695     ## ISSUE: Redundant "First," in the spec.
2696     if ($is_space->{$self->{nc}}) {
2697    
2698     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2699    
2700     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2701     $self->{line_prev} = $self->{line};
2702     $self->{column_prev} = $self->{column};
2703     $self->{column}++;
2704     $self->{nc}
2705     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2706     } else {
2707     $self->{set_nc}->($self);
2708     }
2709    
2710     redo A;
2711     } elsif ($self->{nc} == 0x003E) { # >
2712    
2713     $self->{state} = DATA_STATE;
2714 wakaba 1.5 $self->{s_kwd} = '';
2715 wakaba 1.1
2716     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2717     $self->{line_prev} = $self->{line};
2718     $self->{column_prev} = $self->{column};
2719     $self->{column}++;
2720     $self->{nc}
2721     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2722     } else {
2723     $self->{set_nc}->($self);
2724     }
2725    
2726    
2727     return ($self->{ct}); # DOCTYPE
2728    
2729     redo A;
2730     } elsif ($self->{nc} == -1) {
2731    
2732     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2733     $self->{state} = DATA_STATE;
2734 wakaba 1.5 $self->{s_kwd} = '';
2735 wakaba 1.1 ## reconsume
2736    
2737     $self->{ct}->{quirks} = 1;
2738     return ($self->{ct}); # DOCTYPE
2739    
2740     redo A;
2741     } else {
2742    
2743     $self->{ct}->{name}
2744     .= chr ($self->{nc}); # DOCTYPE
2745     ## Stay in the state
2746    
2747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2748     $self->{line_prev} = $self->{line};
2749     $self->{column_prev} = $self->{column};
2750     $self->{column}++;
2751     $self->{nc}
2752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2753     } else {
2754     $self->{set_nc}->($self);
2755     }
2756    
2757     redo A;
2758     }
2759     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2760     if ($is_space->{$self->{nc}}) {
2761    
2762     ## Stay in the state
2763    
2764     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2765     $self->{line_prev} = $self->{line};
2766     $self->{column_prev} = $self->{column};
2767     $self->{column}++;
2768     $self->{nc}
2769     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2770     } else {
2771     $self->{set_nc}->($self);
2772     }
2773    
2774     redo A;
2775     } elsif ($self->{nc} == 0x003E) { # >
2776    
2777     $self->{state} = DATA_STATE;
2778 wakaba 1.5 $self->{s_kwd} = '';
2779 wakaba 1.1
2780     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2781     $self->{line_prev} = $self->{line};
2782     $self->{column_prev} = $self->{column};
2783     $self->{column}++;
2784     $self->{nc}
2785     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2786     } else {
2787     $self->{set_nc}->($self);
2788     }
2789    
2790    
2791     return ($self->{ct}); # DOCTYPE
2792    
2793     redo A;
2794     } elsif ($self->{nc} == -1) {
2795    
2796     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2797     $self->{state} = DATA_STATE;
2798 wakaba 1.5 $self->{s_kwd} = '';
2799 wakaba 1.1 ## reconsume
2800    
2801     $self->{ct}->{quirks} = 1;
2802     return ($self->{ct}); # DOCTYPE
2803    
2804     redo A;
2805     } elsif ($self->{nc} == 0x0050 or # P
2806     $self->{nc} == 0x0070) { # p
2807     $self->{state} = PUBLIC_STATE;
2808     $self->{s_kwd} = chr $self->{nc};
2809    
2810     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2811     $self->{line_prev} = $self->{line};
2812     $self->{column_prev} = $self->{column};
2813     $self->{column}++;
2814     $self->{nc}
2815     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2816     } else {
2817     $self->{set_nc}->($self);
2818     }
2819    
2820     redo A;
2821     } elsif ($self->{nc} == 0x0053 or # S
2822     $self->{nc} == 0x0073) { # s
2823     $self->{state} = SYSTEM_STATE;
2824     $self->{s_kwd} = chr $self->{nc};
2825    
2826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2827     $self->{line_prev} = $self->{line};
2828     $self->{column_prev} = $self->{column};
2829     $self->{column}++;
2830     $self->{nc}
2831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2832     } else {
2833     $self->{set_nc}->($self);
2834     }
2835    
2836     redo A;
2837     } else {
2838    
2839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2840     $self->{ct}->{quirks} = 1;
2841    
2842     $self->{state} = BOGUS_DOCTYPE_STATE;
2843    
2844     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2845     $self->{line_prev} = $self->{line};
2846     $self->{column_prev} = $self->{column};
2847     $self->{column}++;
2848     $self->{nc}
2849     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2850     } else {
2851     $self->{set_nc}->($self);
2852     }
2853    
2854     redo A;
2855     }
2856     } elsif ($self->{state} == PUBLIC_STATE) {
2857     ## ASCII case-insensitive
2858     if ($self->{nc} == [
2859     undef,
2860     0x0055, # U
2861     0x0042, # B
2862     0x004C, # L
2863     0x0049, # I
2864     ]->[length $self->{s_kwd}] or
2865     $self->{nc} == [
2866     undef,
2867     0x0075, # u
2868     0x0062, # b
2869     0x006C, # l
2870     0x0069, # i
2871     ]->[length $self->{s_kwd}]) {
2872    
2873     ## Stay in the state.
2874     $self->{s_kwd} .= chr $self->{nc};
2875    
2876     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2877     $self->{line_prev} = $self->{line};
2878     $self->{column_prev} = $self->{column};
2879     $self->{column}++;
2880     $self->{nc}
2881     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2882     } else {
2883     $self->{set_nc}->($self);
2884     }
2885    
2886     redo A;
2887     } elsif ((length $self->{s_kwd}) == 5 and
2888     ($self->{nc} == 0x0043 or # C
2889     $self->{nc} == 0x0063)) { # c
2890    
2891     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2892    
2893     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2894     $self->{line_prev} = $self->{line};
2895     $self->{column_prev} = $self->{column};
2896     $self->{column}++;
2897     $self->{nc}
2898     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2899     } else {
2900     $self->{set_nc}->($self);
2901     }
2902    
2903     redo A;
2904     } else {
2905    
2906     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2907     line => $self->{line_prev},
2908     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2909     $self->{ct}->{quirks} = 1;
2910    
2911     $self->{state} = BOGUS_DOCTYPE_STATE;
2912     ## Reconsume.
2913     redo A;
2914     }
2915     } elsif ($self->{state} == SYSTEM_STATE) {
2916     ## ASCII case-insensitive
2917     if ($self->{nc} == [
2918     undef,
2919     0x0059, # Y
2920     0x0053, # S
2921     0x0054, # T
2922     0x0045, # E
2923     ]->[length $self->{s_kwd}] or
2924     $self->{nc} == [
2925     undef,
2926     0x0079, # y
2927     0x0073, # s
2928     0x0074, # t
2929     0x0065, # e
2930     ]->[length $self->{s_kwd}]) {
2931    
2932     ## Stay in the state.
2933     $self->{s_kwd} .= chr $self->{nc};
2934    
2935     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2936     $self->{line_prev} = $self->{line};
2937     $self->{column_prev} = $self->{column};
2938     $self->{column}++;
2939     $self->{nc}
2940     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2941     } else {
2942     $self->{set_nc}->($self);
2943     }
2944    
2945     redo A;
2946     } elsif ((length $self->{s_kwd}) == 5 and
2947     ($self->{nc} == 0x004D or # M
2948     $self->{nc} == 0x006D)) { # m
2949    
2950     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2951    
2952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2953     $self->{line_prev} = $self->{line};
2954     $self->{column_prev} = $self->{column};
2955     $self->{column}++;
2956     $self->{nc}
2957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2958     } else {
2959     $self->{set_nc}->($self);
2960     }
2961    
2962     redo A;
2963     } else {
2964    
2965     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2966     line => $self->{line_prev},
2967     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2968     $self->{ct}->{quirks} = 1;
2969    
2970     $self->{state} = BOGUS_DOCTYPE_STATE;
2971     ## Reconsume.
2972     redo A;
2973     }
2974     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2975     if ($is_space->{$self->{nc}}) {
2976    
2977     ## Stay in the state
2978    
2979     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2980     $self->{line_prev} = $self->{line};
2981     $self->{column_prev} = $self->{column};
2982     $self->{column}++;
2983     $self->{nc}
2984     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2985     } else {
2986     $self->{set_nc}->($self);
2987     }
2988    
2989     redo A;
2990     } elsif ($self->{nc} eq 0x0022) { # "
2991    
2992     $self->{ct}->{pubid} = ''; # DOCTYPE
2993     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2994    
2995     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2996     $self->{line_prev} = $self->{line};
2997     $self->{column_prev} = $self->{column};
2998     $self->{column}++;
2999     $self->{nc}
3000     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3001     } else {
3002     $self->{set_nc}->($self);
3003     }
3004    
3005     redo A;
3006     } elsif ($self->{nc} eq 0x0027) { # '
3007    
3008     $self->{ct}->{pubid} = ''; # DOCTYPE
3009     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
3010    
3011     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3012     $self->{line_prev} = $self->{line};
3013     $self->{column_prev} = $self->{column};
3014     $self->{column}++;
3015     $self->{nc}
3016     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3017     } else {
3018     $self->{set_nc}->($self);
3019     }
3020    
3021     redo A;
3022     } elsif ($self->{nc} eq 0x003E) { # >
3023    
3024     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
3025    
3026     $self->{state} = DATA_STATE;
3027 wakaba 1.5 $self->{s_kwd} = '';
3028 wakaba 1.1
3029     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3030     $self->{line_prev} = $self->{line};
3031     $self->{column_prev} = $self->{column};
3032     $self->{column}++;
3033     $self->{nc}
3034     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3035     } else {
3036     $self->{set_nc}->($self);
3037     }
3038    
3039    
3040     $self->{ct}->{quirks} = 1;
3041     return ($self->{ct}); # DOCTYPE
3042    
3043     redo A;
3044     } elsif ($self->{nc} == -1) {
3045    
3046     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3047    
3048     $self->{state} = DATA_STATE;
3049 wakaba 1.5 $self->{s_kwd} = '';
3050 wakaba 1.1 ## reconsume
3051    
3052     $self->{ct}->{quirks} = 1;
3053     return ($self->{ct}); # DOCTYPE
3054    
3055     redo A;
3056     } else {
3057    
3058     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
3059     $self->{ct}->{quirks} = 1;
3060    
3061     $self->{state} = BOGUS_DOCTYPE_STATE;
3062    
3063     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3064     $self->{line_prev} = $self->{line};
3065     $self->{column_prev} = $self->{column};
3066     $self->{column}++;
3067     $self->{nc}
3068     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3069     } else {
3070     $self->{set_nc}->($self);
3071     }
3072    
3073     redo A;
3074     }
3075     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3076     if ($self->{nc} == 0x0022) { # "
3077    
3078     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3079    
3080     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3081     $self->{line_prev} = $self->{line};
3082     $self->{column_prev} = $self->{column};
3083     $self->{column}++;
3084     $self->{nc}
3085     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3086     } else {
3087     $self->{set_nc}->($self);
3088     }
3089    
3090     redo A;
3091     } elsif ($self->{nc} == 0x003E) { # >
3092    
3093     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3094    
3095     $self->{state} = DATA_STATE;
3096 wakaba 1.5 $self->{s_kwd} = '';
3097 wakaba 1.1
3098     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3099     $self->{line_prev} = $self->{line};
3100     $self->{column_prev} = $self->{column};
3101     $self->{column}++;
3102     $self->{nc}
3103     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3104     } else {
3105     $self->{set_nc}->($self);
3106     }
3107    
3108    
3109     $self->{ct}->{quirks} = 1;
3110     return ($self->{ct}); # DOCTYPE
3111    
3112     redo A;
3113     } elsif ($self->{nc} == -1) {
3114    
3115     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3116    
3117     $self->{state} = DATA_STATE;
3118 wakaba 1.5 $self->{s_kwd} = '';
3119 wakaba 1.1 ## reconsume
3120    
3121     $self->{ct}->{quirks} = 1;
3122     return ($self->{ct}); # DOCTYPE
3123    
3124     redo A;
3125     } else {
3126    
3127     $self->{ct}->{pubid} # DOCTYPE
3128     .= chr $self->{nc};
3129     $self->{read_until}->($self->{ct}->{pubid}, q[">],
3130     length $self->{ct}->{pubid});
3131    
3132     ## Stay in the state
3133    
3134     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3135     $self->{line_prev} = $self->{line};
3136     $self->{column_prev} = $self->{column};
3137     $self->{column}++;
3138     $self->{nc}
3139     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3140     } else {
3141     $self->{set_nc}->($self);
3142     }
3143    
3144     redo A;
3145     }
3146     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3147     if ($self->{nc} == 0x0027) { # '
3148    
3149     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3150    
3151     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3152     $self->{line_prev} = $self->{line};
3153     $self->{column_prev} = $self->{column};
3154     $self->{column}++;
3155     $self->{nc}
3156     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3157     } else {
3158     $self->{set_nc}->($self);
3159     }
3160    
3161     redo A;
3162     } elsif ($self->{nc} == 0x003E) { # >
3163    
3164     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3165    
3166     $self->{state} = DATA_STATE;
3167 wakaba 1.5 $self->{s_kwd} = '';
3168 wakaba 1.1
3169     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3170     $self->{line_prev} = $self->{line};
3171     $self->{column_prev} = $self->{column};
3172     $self->{column}++;
3173     $self->{nc}
3174     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3175     } else {
3176     $self->{set_nc}->($self);
3177     }
3178    
3179    
3180     $self->{ct}->{quirks} = 1;
3181     return ($self->{ct}); # DOCTYPE
3182    
3183     redo A;
3184     } elsif ($self->{nc} == -1) {
3185    
3186     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3187    
3188     $self->{state} = DATA_STATE;
3189 wakaba 1.5 $self->{s_kwd} = '';
3190 wakaba 1.1 ## reconsume
3191    
3192     $self->{ct}->{quirks} = 1;
3193     return ($self->{ct}); # DOCTYPE
3194    
3195     redo A;
3196     } else {
3197    
3198     $self->{ct}->{pubid} # DOCTYPE
3199     .= chr $self->{nc};
3200     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3201     length $self->{ct}->{pubid});
3202    
3203     ## Stay in the state
3204    
3205     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3206     $self->{line_prev} = $self->{line};
3207     $self->{column_prev} = $self->{column};
3208     $self->{column}++;
3209     $self->{nc}
3210     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3211     } else {
3212     $self->{set_nc}->($self);
3213     }
3214    
3215     redo A;
3216     }
3217     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3218     if ($is_space->{$self->{nc}}) {
3219    
3220     ## Stay in the state
3221    
3222     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3223     $self->{line_prev} = $self->{line};
3224     $self->{column_prev} = $self->{column};
3225     $self->{column}++;
3226     $self->{nc}
3227     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3228     } else {
3229     $self->{set_nc}->($self);
3230     }
3231    
3232     redo A;
3233     } elsif ($self->{nc} == 0x0022) { # "
3234    
3235     $self->{ct}->{sysid} = ''; # DOCTYPE
3236     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3237    
3238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3239     $self->{line_prev} = $self->{line};
3240     $self->{column_prev} = $self->{column};
3241     $self->{column}++;
3242     $self->{nc}
3243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3244     } else {
3245     $self->{set_nc}->($self);
3246     }
3247    
3248     redo A;
3249     } elsif ($self->{nc} == 0x0027) { # '
3250    
3251     $self->{ct}->{sysid} = ''; # DOCTYPE
3252     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3253    
3254     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3255     $self->{line_prev} = $self->{line};
3256     $self->{column_prev} = $self->{column};
3257     $self->{column}++;
3258     $self->{nc}
3259     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3260     } else {
3261     $self->{set_nc}->($self);
3262     }
3263    
3264     redo A;
3265     } elsif ($self->{nc} == 0x003E) { # >
3266    
3267     $self->{state} = DATA_STATE;
3268 wakaba 1.5 $self->{s_kwd} = '';
3269 wakaba 1.1
3270     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3271     $self->{line_prev} = $self->{line};
3272     $self->{column_prev} = $self->{column};
3273     $self->{column}++;
3274     $self->{nc}
3275     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3276     } else {
3277     $self->{set_nc}->($self);
3278     }
3279    
3280    
3281     return ($self->{ct}); # DOCTYPE
3282    
3283     redo A;
3284     } elsif ($self->{nc} == -1) {
3285    
3286     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3287    
3288     $self->{state} = DATA_STATE;
3289 wakaba 1.5 $self->{s_kwd} = '';
3290 wakaba 1.1 ## reconsume
3291    
3292     $self->{ct}->{quirks} = 1;
3293     return ($self->{ct}); # DOCTYPE
3294    
3295     redo A;
3296     } else {
3297    
3298     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3299     $self->{ct}->{quirks} = 1;
3300    
3301     $self->{state} = BOGUS_DOCTYPE_STATE;
3302    
3303     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3304     $self->{line_prev} = $self->{line};
3305     $self->{column_prev} = $self->{column};
3306     $self->{column}++;
3307     $self->{nc}
3308     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3309     } else {
3310     $self->{set_nc}->($self);
3311     }
3312    
3313     redo A;
3314     }
3315     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3316     if ($is_space->{$self->{nc}}) {
3317    
3318     ## Stay in the state
3319    
3320     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3321     $self->{line_prev} = $self->{line};
3322     $self->{column_prev} = $self->{column};
3323     $self->{column}++;
3324     $self->{nc}
3325     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3326     } else {
3327     $self->{set_nc}->($self);
3328     }
3329    
3330     redo A;
3331     } elsif ($self->{nc} == 0x0022) { # "
3332    
3333     $self->{ct}->{sysid} = ''; # DOCTYPE
3334     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3335    
3336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3337     $self->{line_prev} = $self->{line};
3338     $self->{column_prev} = $self->{column};
3339     $self->{column}++;
3340     $self->{nc}
3341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3342     } else {
3343     $self->{set_nc}->($self);
3344     }
3345    
3346     redo A;
3347     } elsif ($self->{nc} == 0x0027) { # '
3348    
3349     $self->{ct}->{sysid} = ''; # DOCTYPE
3350     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3351    
3352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3353     $self->{line_prev} = $self->{line};
3354     $self->{column_prev} = $self->{column};
3355     $self->{column}++;
3356     $self->{nc}
3357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3358     } else {
3359     $self->{set_nc}->($self);
3360     }
3361    
3362     redo A;
3363     } elsif ($self->{nc} == 0x003E) { # >
3364    
3365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3366     $self->{state} = DATA_STATE;
3367 wakaba 1.5 $self->{s_kwd} = '';
3368 wakaba 1.1
3369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370     $self->{line_prev} = $self->{line};
3371     $self->{column_prev} = $self->{column};
3372     $self->{column}++;
3373     $self->{nc}
3374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375     } else {
3376     $self->{set_nc}->($self);
3377     }
3378    
3379    
3380     $self->{ct}->{quirks} = 1;
3381     return ($self->{ct}); # DOCTYPE
3382    
3383     redo A;
3384     } elsif ($self->{nc} == -1) {
3385    
3386     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3387    
3388     $self->{state} = DATA_STATE;
3389 wakaba 1.5 $self->{s_kwd} = '';
3390 wakaba 1.1 ## reconsume
3391    
3392     $self->{ct}->{quirks} = 1;
3393     return ($self->{ct}); # DOCTYPE
3394    
3395     redo A;
3396     } else {
3397    
3398     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3399     $self->{ct}->{quirks} = 1;
3400    
3401     $self->{state} = BOGUS_DOCTYPE_STATE;
3402    
3403     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3404     $self->{line_prev} = $self->{line};
3405     $self->{column_prev} = $self->{column};
3406     $self->{column}++;
3407     $self->{nc}
3408     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3409     } else {
3410     $self->{set_nc}->($self);
3411     }
3412    
3413     redo A;
3414     }
3415     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3416     if ($self->{nc} == 0x0022) { # "
3417    
3418     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3419    
3420     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3421     $self->{line_prev} = $self->{line};
3422     $self->{column_prev} = $self->{column};
3423     $self->{column}++;
3424     $self->{nc}
3425     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3426     } else {
3427     $self->{set_nc}->($self);
3428     }
3429    
3430     redo A;
3431     } elsif ($self->{nc} == 0x003E) { # >
3432    
3433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3434    
3435     $self->{state} = DATA_STATE;
3436 wakaba 1.5 $self->{s_kwd} = '';
3437 wakaba 1.1
3438     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3439     $self->{line_prev} = $self->{line};
3440     $self->{column_prev} = $self->{column};
3441     $self->{column}++;
3442     $self->{nc}
3443     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3444     } else {
3445     $self->{set_nc}->($self);
3446     }
3447    
3448    
3449     $self->{ct}->{quirks} = 1;
3450     return ($self->{ct}); # DOCTYPE
3451    
3452     redo A;
3453     } elsif ($self->{nc} == -1) {
3454    
3455     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3456    
3457     $self->{state} = DATA_STATE;
3458 wakaba 1.5 $self->{s_kwd} = '';
3459 wakaba 1.1 ## reconsume
3460    
3461     $self->{ct}->{quirks} = 1;
3462     return ($self->{ct}); # DOCTYPE
3463    
3464     redo A;
3465     } else {
3466    
3467     $self->{ct}->{sysid} # DOCTYPE
3468     .= chr $self->{nc};
3469     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3470     length $self->{ct}->{sysid});
3471    
3472     ## Stay in the state
3473    
3474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3475     $self->{line_prev} = $self->{line};
3476     $self->{column_prev} = $self->{column};
3477     $self->{column}++;
3478     $self->{nc}
3479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3480     } else {
3481     $self->{set_nc}->($self);
3482     }
3483    
3484     redo A;
3485     }
3486     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3487     if ($self->{nc} == 0x0027) { # '
3488    
3489     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3490    
3491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3492     $self->{line_prev} = $self->{line};
3493     $self->{column_prev} = $self->{column};
3494     $self->{column}++;
3495     $self->{nc}
3496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3497     } else {
3498     $self->{set_nc}->($self);
3499     }
3500    
3501     redo A;
3502     } elsif ($self->{nc} == 0x003E) { # >
3503    
3504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3505    
3506     $self->{state} = DATA_STATE;
3507 wakaba 1.5 $self->{s_kwd} = '';
3508 wakaba 1.1
3509     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3510     $self->{line_prev} = $self->{line};
3511     $self->{column_prev} = $self->{column};
3512     $self->{column}++;
3513     $self->{nc}
3514     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3515     } else {
3516     $self->{set_nc}->($self);
3517     }
3518    
3519    
3520     $self->{ct}->{quirks} = 1;
3521     return ($self->{ct}); # DOCTYPE
3522    
3523     redo A;
3524     } elsif ($self->{nc} == -1) {
3525    
3526     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3527    
3528     $self->{state} = DATA_STATE;
3529 wakaba 1.5 $self->{s_kwd} = '';
3530 wakaba 1.1 ## reconsume
3531    
3532     $self->{ct}->{quirks} = 1;
3533     return ($self->{ct}); # DOCTYPE
3534    
3535     redo A;
3536     } else {
3537    
3538     $self->{ct}->{sysid} # DOCTYPE
3539     .= chr $self->{nc};
3540     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3541     length $self->{ct}->{sysid});
3542    
3543     ## Stay in the state
3544    
3545     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3546     $self->{line_prev} = $self->{line};
3547     $self->{column_prev} = $self->{column};
3548     $self->{column}++;
3549     $self->{nc}
3550     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3551     } else {
3552     $self->{set_nc}->($self);
3553     }
3554    
3555     redo A;
3556     }
3557     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3558     if ($is_space->{$self->{nc}}) {
3559    
3560     ## Stay in the state
3561    
3562     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3563     $self->{line_prev} = $self->{line};
3564     $self->{column_prev} = $self->{column};
3565     $self->{column}++;
3566     $self->{nc}
3567     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3568     } else {
3569     $self->{set_nc}->($self);
3570     }
3571    
3572     redo A;
3573     } elsif ($self->{nc} == 0x003E) { # >
3574    
3575     $self->{state} = DATA_STATE;
3576 wakaba 1.5 $self->{s_kwd} = '';
3577 wakaba 1.1
3578     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3579     $self->{line_prev} = $self->{line};
3580     $self->{column_prev} = $self->{column};
3581     $self->{column}++;
3582     $self->{nc}
3583     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3584     } else {
3585     $self->{set_nc}->($self);
3586     }
3587    
3588    
3589     return ($self->{ct}); # DOCTYPE
3590    
3591     redo A;
3592     } elsif ($self->{nc} == -1) {
3593    
3594     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3595     $self->{state} = DATA_STATE;
3596 wakaba 1.5 $self->{s_kwd} = '';
3597 wakaba 1.1 ## reconsume
3598    
3599     $self->{ct}->{quirks} = 1;
3600     return ($self->{ct}); # DOCTYPE
3601    
3602     redo A;
3603     } else {
3604    
3605     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3606     #$self->{ct}->{quirks} = 1;
3607    
3608     $self->{state} = BOGUS_DOCTYPE_STATE;
3609    
3610     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3611     $self->{line_prev} = $self->{line};
3612     $self->{column_prev} = $self->{column};
3613     $self->{column}++;
3614     $self->{nc}
3615     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3616     } else {
3617     $self->{set_nc}->($self);
3618     }
3619    
3620     redo A;
3621     }
3622     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3623     if ($self->{nc} == 0x003E) { # >
3624    
3625     $self->{state} = DATA_STATE;
3626 wakaba 1.5 $self->{s_kwd} = '';
3627 wakaba 1.1
3628     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3629     $self->{line_prev} = $self->{line};
3630     $self->{column_prev} = $self->{column};
3631     $self->{column}++;
3632     $self->{nc}
3633     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3634     } else {
3635     $self->{set_nc}->($self);
3636     }
3637    
3638    
3639     return ($self->{ct}); # DOCTYPE
3640    
3641     redo A;
3642     } elsif ($self->{nc} == -1) {
3643    
3644     $self->{state} = DATA_STATE;
3645 wakaba 1.5 $self->{s_kwd} = '';
3646 wakaba 1.1 ## reconsume
3647    
3648     return ($self->{ct}); # DOCTYPE
3649    
3650     redo A;
3651     } else {
3652    
3653     my $s = '';
3654     $self->{read_until}->($s, q[>], 0);
3655    
3656     ## Stay in the state
3657    
3658     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3659     $self->{line_prev} = $self->{line};
3660     $self->{column_prev} = $self->{column};
3661     $self->{column}++;
3662     $self->{nc}
3663     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3664     } else {
3665     $self->{set_nc}->($self);
3666     }
3667    
3668     redo A;
3669     }
3670     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3671     ## NOTE: "CDATA section state" in the state is jointly implemented
3672     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3673     ## and |CDATA_SECTION_MSE2_STATE|.
3674    
3675     if ($self->{nc} == 0x005D) { # ]
3676    
3677     $self->{state} = CDATA_SECTION_MSE1_STATE;
3678    
3679     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3680     $self->{line_prev} = $self->{line};
3681     $self->{column_prev} = $self->{column};
3682     $self->{column}++;
3683     $self->{nc}
3684     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3685     } else {
3686     $self->{set_nc}->($self);
3687     }
3688    
3689     redo A;
3690     } elsif ($self->{nc} == -1) {
3691 wakaba 1.6 if ($self->{is_xml}) {
3692 wakaba 1.8
3693 wakaba 1.6 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type
3694 wakaba 1.8 } else {
3695    
3696 wakaba 1.6 }
3697    
3698 wakaba 1.1 $self->{state} = DATA_STATE;
3699 wakaba 1.5 $self->{s_kwd} = '';
3700 wakaba 1.1
3701     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3702     $self->{line_prev} = $self->{line};
3703     $self->{column_prev} = $self->{column};
3704     $self->{column}++;
3705     $self->{nc}
3706     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3707     } else {
3708     $self->{set_nc}->($self);
3709     }
3710    
3711     if (length $self->{ct}->{data}) { # character
3712    
3713     return ($self->{ct}); # character
3714     } else {
3715    
3716     ## No token to emit. $self->{ct} is discarded.
3717     }
3718     redo A;
3719     } else {
3720    
3721     $self->{ct}->{data} .= chr $self->{nc};
3722     $self->{read_until}->($self->{ct}->{data},
3723     q<]>,
3724     length $self->{ct}->{data});
3725    
3726     ## Stay in the state.
3727    
3728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3729     $self->{line_prev} = $self->{line};
3730     $self->{column_prev} = $self->{column};
3731     $self->{column}++;
3732     $self->{nc}
3733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3734     } else {
3735     $self->{set_nc}->($self);
3736     }
3737    
3738     redo A;
3739     }
3740    
3741     ## ISSUE: "text tokens" in spec.
3742     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3743     if ($self->{nc} == 0x005D) { # ]
3744    
3745     $self->{state} = CDATA_SECTION_MSE2_STATE;
3746    
3747     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3748     $self->{line_prev} = $self->{line};
3749     $self->{column_prev} = $self->{column};
3750     $self->{column}++;
3751     $self->{nc}
3752     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3753     } else {
3754     $self->{set_nc}->($self);
3755     }
3756    
3757     redo A;
3758     } else {
3759    
3760     $self->{ct}->{data} .= ']';
3761     $self->{state} = CDATA_SECTION_STATE;
3762     ## Reconsume.
3763     redo A;
3764     }
3765     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3766     if ($self->{nc} == 0x003E) { # >
3767     $self->{state} = DATA_STATE;
3768 wakaba 1.5 $self->{s_kwd} = '';
3769 wakaba 1.1
3770     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3771     $self->{line_prev} = $self->{line};
3772     $self->{column_prev} = $self->{column};
3773     $self->{column}++;
3774     $self->{nc}
3775     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3776     } else {
3777     $self->{set_nc}->($self);
3778     }
3779    
3780     if (length $self->{ct}->{data}) { # character
3781    
3782     return ($self->{ct}); # character
3783     } else {
3784    
3785     ## No token to emit. $self->{ct} is discarded.
3786     }
3787     redo A;
3788     } elsif ($self->{nc} == 0x005D) { # ]
3789     # character
3790     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3791     ## Stay in the state.
3792    
3793     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3794     $self->{line_prev} = $self->{line};
3795     $self->{column_prev} = $self->{column};
3796     $self->{column}++;
3797     $self->{nc}
3798     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3799     } else {
3800     $self->{set_nc}->($self);
3801     }
3802    
3803     redo A;
3804     } else {
3805    
3806     $self->{ct}->{data} .= ']]'; # character
3807     $self->{state} = CDATA_SECTION_STATE;
3808     ## Reconsume.
3809     redo A;
3810     }
3811     } elsif ($self->{state} == ENTITY_STATE) {
3812     if ($is_space->{$self->{nc}} or
3813     {
3814     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3815     $self->{entity_add} => 1,
3816     }->{$self->{nc}}) {
3817    
3818     ## Don't consume
3819     ## No error
3820     ## Return nothing.
3821     #
3822     } elsif ($self->{nc} == 0x0023) { # #
3823    
3824     $self->{state} = ENTITY_HASH_STATE;
3825     $self->{s_kwd} = '#';
3826    
3827     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3828     $self->{line_prev} = $self->{line};
3829     $self->{column_prev} = $self->{column};
3830     $self->{column}++;
3831     $self->{nc}
3832     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3833     } else {
3834     $self->{set_nc}->($self);
3835     }
3836    
3837     redo A;
3838     } elsif ((0x0041 <= $self->{nc} and
3839     $self->{nc} <= 0x005A) or # A..Z
3840     (0x0061 <= $self->{nc} and
3841     $self->{nc} <= 0x007A)) { # a..z
3842    
3843     require Whatpm::_NamedEntityList;
3844     $self->{state} = ENTITY_NAME_STATE;
3845     $self->{s_kwd} = chr $self->{nc};
3846     $self->{entity__value} = $self->{s_kwd};
3847     $self->{entity__match} = 0;
3848    
3849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3850     $self->{line_prev} = $self->{line};
3851     $self->{column_prev} = $self->{column};
3852     $self->{column}++;
3853     $self->{nc}
3854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3855     } else {
3856     $self->{set_nc}->($self);
3857     }
3858    
3859     redo A;
3860     } else {
3861    
3862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3863     ## Return nothing.
3864     #
3865     }
3866    
3867     ## NOTE: No character is consumed by the "consume a character
3868     ## reference" algorithm. In other word, there is an "&" character
3869     ## that does not introduce a character reference, which would be
3870     ## appended to the parent element or the attribute value in later
3871     ## process of the tokenizer.
3872    
3873     if ($self->{prev_state} == DATA_STATE) {
3874    
3875     $self->{state} = $self->{prev_state};
3876 wakaba 1.5 $self->{s_kwd} = '';
3877 wakaba 1.1 ## Reconsume.
3878     return ({type => CHARACTER_TOKEN, data => '&',
3879     line => $self->{line_prev},
3880     column => $self->{column_prev},
3881     });
3882     redo A;
3883     } else {
3884    
3885     $self->{ca}->{value} .= '&';
3886     $self->{state} = $self->{prev_state};
3887 wakaba 1.5 $self->{s_kwd} = '';
3888 wakaba 1.1 ## Reconsume.
3889     redo A;
3890     }
3891     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3892     if ($self->{nc} == 0x0078 or # x
3893     $self->{nc} == 0x0058) { # X
3894    
3895     $self->{state} = HEXREF_X_STATE;
3896     $self->{s_kwd} .= chr $self->{nc};
3897    
3898     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3899     $self->{line_prev} = $self->{line};
3900     $self->{column_prev} = $self->{column};
3901     $self->{column}++;
3902     $self->{nc}
3903     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3904     } else {
3905     $self->{set_nc}->($self);
3906     }
3907    
3908     redo A;
3909     } elsif (0x0030 <= $self->{nc} and
3910     $self->{nc} <= 0x0039) { # 0..9
3911    
3912     $self->{state} = NCR_NUM_STATE;
3913     $self->{s_kwd} = $self->{nc} - 0x0030;
3914    
3915     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3916     $self->{line_prev} = $self->{line};
3917     $self->{column_prev} = $self->{column};
3918     $self->{column}++;
3919     $self->{nc}
3920     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3921     } else {
3922     $self->{set_nc}->($self);
3923     }
3924    
3925     redo A;
3926     } else {
3927     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3928     line => $self->{line_prev},
3929     column => $self->{column_prev} - 1);
3930    
3931     ## NOTE: According to the spec algorithm, nothing is returned,
3932     ## and then "&#" is appended to the parent element or the attribute
3933     ## value in the later processing.
3934    
3935     if ($self->{prev_state} == DATA_STATE) {
3936    
3937     $self->{state} = $self->{prev_state};
3938 wakaba 1.5 $self->{s_kwd} = '';
3939 wakaba 1.1 ## Reconsume.
3940     return ({type => CHARACTER_TOKEN,
3941     data => '&#',
3942     line => $self->{line_prev},
3943     column => $self->{column_prev} - 1,
3944     });
3945     redo A;
3946     } else {
3947    
3948     $self->{ca}->{value} .= '&#';
3949     $self->{state} = $self->{prev_state};
3950 wakaba 1.5 $self->{s_kwd} = '';
3951 wakaba 1.1 ## Reconsume.
3952     redo A;
3953     }
3954     }
3955     } elsif ($self->{state} == NCR_NUM_STATE) {
3956     if (0x0030 <= $self->{nc} and
3957     $self->{nc} <= 0x0039) { # 0..9
3958    
3959     $self->{s_kwd} *= 10;
3960     $self->{s_kwd} += $self->{nc} - 0x0030;
3961    
3962     ## Stay in the state.
3963    
3964     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3965     $self->{line_prev} = $self->{line};
3966     $self->{column_prev} = $self->{column};
3967     $self->{column}++;
3968     $self->{nc}
3969     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3970     } else {
3971     $self->{set_nc}->($self);
3972     }
3973    
3974     redo A;
3975     } elsif ($self->{nc} == 0x003B) { # ;
3976    
3977    
3978     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3979     $self->{line_prev} = $self->{line};
3980     $self->{column_prev} = $self->{column};
3981     $self->{column}++;
3982     $self->{nc}
3983     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3984     } else {
3985     $self->{set_nc}->($self);
3986     }
3987    
3988     #
3989     } else {
3990    
3991     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3992     ## Reconsume.
3993     #
3994     }
3995    
3996     my $code = $self->{s_kwd};
3997     my $l = $self->{line_prev};
3998     my $c = $self->{column_prev};
3999     if ($charref_map->{$code}) {
4000    
4001     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4002     text => (sprintf 'U+%04X', $code),
4003     line => $l, column => $c);
4004     $code = $charref_map->{$code};
4005     } elsif ($code > 0x10FFFF) {
4006    
4007     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4008     text => (sprintf 'U-%08X', $code),
4009     line => $l, column => $c);
4010     $code = 0xFFFD;
4011     }
4012    
4013     if ($self->{prev_state} == DATA_STATE) {
4014    
4015     $self->{state} = $self->{prev_state};
4016 wakaba 1.5 $self->{s_kwd} = '';
4017 wakaba 1.1 ## Reconsume.
4018     return ({type => CHARACTER_TOKEN, data => chr $code,
4019 wakaba 1.7 has_reference => 1,
4020 wakaba 1.1 line => $l, column => $c,
4021     });
4022     redo A;
4023     } else {
4024    
4025     $self->{ca}->{value} .= chr $code;
4026     $self->{ca}->{has_reference} = 1;
4027     $self->{state} = $self->{prev_state};
4028 wakaba 1.5 $self->{s_kwd} = '';
4029 wakaba 1.1 ## Reconsume.
4030     redo A;
4031     }
4032     } elsif ($self->{state} == HEXREF_X_STATE) {
4033     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
4034     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
4035     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
4036     # 0..9, A..F, a..f
4037    
4038     $self->{state} = HEXREF_HEX_STATE;
4039     $self->{s_kwd} = 0;
4040     ## Reconsume.
4041     redo A;
4042     } else {
4043     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
4044     line => $self->{line_prev},
4045     column => $self->{column_prev} - 2);
4046    
4047     ## NOTE: According to the spec algorithm, nothing is returned,
4048     ## and then "&#" followed by "X" or "x" is appended to the parent
4049     ## element or the attribute value in the later processing.
4050    
4051     if ($self->{prev_state} == DATA_STATE) {
4052    
4053     $self->{state} = $self->{prev_state};
4054 wakaba 1.5 $self->{s_kwd} = '';
4055 wakaba 1.1 ## Reconsume.
4056     return ({type => CHARACTER_TOKEN,
4057     data => '&' . $self->{s_kwd},
4058     line => $self->{line_prev},
4059     column => $self->{column_prev} - length $self->{s_kwd},
4060     });
4061     redo A;
4062     } else {
4063    
4064     $self->{ca}->{value} .= '&' . $self->{s_kwd};
4065     $self->{state} = $self->{prev_state};
4066 wakaba 1.5 $self->{s_kwd} = '';
4067 wakaba 1.1 ## Reconsume.
4068     redo A;
4069     }
4070     }
4071     } elsif ($self->{state} == HEXREF_HEX_STATE) {
4072     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
4073     # 0..9
4074    
4075     $self->{s_kwd} *= 0x10;
4076     $self->{s_kwd} += $self->{nc} - 0x0030;
4077     ## Stay in the state.
4078    
4079     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4080     $self->{line_prev} = $self->{line};
4081     $self->{column_prev} = $self->{column};
4082     $self->{column}++;
4083     $self->{nc}
4084     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4085     } else {
4086     $self->{set_nc}->($self);
4087     }
4088    
4089     redo A;
4090     } elsif (0x0061 <= $self->{nc} and
4091     $self->{nc} <= 0x0066) { # a..f
4092    
4093     $self->{s_kwd} *= 0x10;
4094     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
4095     ## Stay in the state.
4096    
4097     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4098     $self->{line_prev} = $self->{line};
4099     $self->{column_prev} = $self->{column};
4100     $self->{column}++;
4101     $self->{nc}
4102     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4103     } else {
4104     $self->{set_nc}->($self);
4105     }
4106    
4107     redo A;
4108     } elsif (0x0041 <= $self->{nc} and
4109     $self->{nc} <= 0x0046) { # A..F
4110    
4111     $self->{s_kwd} *= 0x10;
4112     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
4113     ## Stay in the state.
4114    
4115     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4116     $self->{line_prev} = $self->{line};
4117     $self->{column_prev} = $self->{column};
4118     $self->{column}++;
4119     $self->{nc}
4120     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4121     } else {
4122     $self->{set_nc}->($self);
4123     }
4124    
4125     redo A;
4126     } elsif ($self->{nc} == 0x003B) { # ;
4127    
4128    
4129     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4130     $self->{line_prev} = $self->{line};
4131     $self->{column_prev} = $self->{column};
4132     $self->{column}++;
4133     $self->{nc}
4134     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4135     } else {
4136     $self->{set_nc}->($self);
4137     }
4138    
4139     #
4140     } else {
4141    
4142     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
4143     line => $self->{line},
4144     column => $self->{column});
4145     ## Reconsume.
4146     #
4147     }
4148    
4149     my $code = $self->{s_kwd};
4150     my $l = $self->{line_prev};
4151     my $c = $self->{column_prev};
4152     if ($charref_map->{$code}) {
4153    
4154     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4155     text => (sprintf 'U+%04X', $code),
4156     line => $l, column => $c);
4157     $code = $charref_map->{$code};
4158     } elsif ($code > 0x10FFFF) {
4159    
4160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
4161     text => (sprintf 'U-%08X', $code),
4162     line => $l, column => $c);
4163     $code = 0xFFFD;
4164     }
4165    
4166     if ($self->{prev_state} == DATA_STATE) {
4167    
4168     $self->{state} = $self->{prev_state};
4169 wakaba 1.5 $self->{s_kwd} = '';
4170 wakaba 1.1 ## Reconsume.
4171     return ({type => CHARACTER_TOKEN, data => chr $code,
4172 wakaba 1.7 has_reference => 1,
4173 wakaba 1.1 line => $l, column => $c,
4174     });
4175     redo A;
4176     } else {
4177    
4178     $self->{ca}->{value} .= chr $code;
4179     $self->{ca}->{has_reference} = 1;
4180     $self->{state} = $self->{prev_state};
4181 wakaba 1.5 $self->{s_kwd} = '';
4182 wakaba 1.1 ## Reconsume.
4183     redo A;
4184     }
4185     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4186     if (length $self->{s_kwd} < 30 and
4187     ## NOTE: Some number greater than the maximum length of entity name
4188     ((0x0041 <= $self->{nc} and # a
4189     $self->{nc} <= 0x005A) or # x
4190     (0x0061 <= $self->{nc} and # a
4191     $self->{nc} <= 0x007A) or # z
4192     (0x0030 <= $self->{nc} and # 0
4193     $self->{nc} <= 0x0039) or # 9
4194     $self->{nc} == 0x003B)) { # ;
4195     our $EntityChar;
4196     $self->{s_kwd} .= chr $self->{nc};
4197     if (defined $EntityChar->{$self->{s_kwd}}) {
4198     if ($self->{nc} == 0x003B) { # ;
4199    
4200     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4201     $self->{entity__match} = 1;
4202    
4203     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4204     $self->{line_prev} = $self->{line};
4205     $self->{column_prev} = $self->{column};
4206     $self->{column}++;
4207     $self->{nc}
4208     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4209     } else {
4210     $self->{set_nc}->($self);
4211     }
4212    
4213     #
4214     } else {
4215    
4216     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4217     $self->{entity__match} = -1;
4218     ## Stay in the state.
4219    
4220     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4221     $self->{line_prev} = $self->{line};
4222     $self->{column_prev} = $self->{column};
4223     $self->{column}++;
4224     $self->{nc}
4225     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4226     } else {
4227     $self->{set_nc}->($self);
4228     }
4229    
4230     redo A;
4231     }
4232     } else {
4233    
4234     $self->{entity__value} .= chr $self->{nc};
4235     $self->{entity__match} *= 2;
4236     ## Stay in the state.
4237    
4238     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4239     $self->{line_prev} = $self->{line};
4240     $self->{column_prev} = $self->{column};
4241     $self->{column}++;
4242     $self->{nc}
4243     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4244     } else {
4245     $self->{set_nc}->($self);
4246     }
4247    
4248     redo A;
4249     }
4250     }
4251    
4252     my $data;
4253     my $has_ref;
4254     if ($self->{entity__match} > 0) {
4255    
4256     $data = $self->{entity__value};
4257     $has_ref = 1;
4258     #
4259     } elsif ($self->{entity__match} < 0) {
4260     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4261     if ($self->{prev_state} != DATA_STATE and # in attribute
4262     $self->{entity__match} < -1) {
4263    
4264     $data = '&' . $self->{s_kwd};
4265     #
4266     } else {
4267    
4268     $data = $self->{entity__value};
4269     $has_ref = 1;
4270     #
4271     }
4272     } else {
4273    
4274     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4275     line => $self->{line_prev},
4276     column => $self->{column_prev} - length $self->{s_kwd});
4277     $data = '&' . $self->{s_kwd};
4278     #
4279     }
4280    
4281     ## NOTE: In these cases, when a character reference is found,
4282     ## it is consumed and a character token is returned, or, otherwise,
4283     ## nothing is consumed and returned, according to the spec algorithm.
4284     ## In this implementation, anything that has been examined by the
4285     ## tokenizer is appended to the parent element or the attribute value
4286     ## as string, either literal string when no character reference or
4287     ## entity-replaced string otherwise, in this stage, since any characters
4288     ## that would not be consumed are appended in the data state or in an
4289     ## appropriate attribute value state anyway.
4290    
4291     if ($self->{prev_state} == DATA_STATE) {
4292    
4293     $self->{state} = $self->{prev_state};
4294 wakaba 1.5 $self->{s_kwd} = '';
4295 wakaba 1.1 ## Reconsume.
4296     return ({type => CHARACTER_TOKEN,
4297     data => $data,
4298 wakaba 1.7 has_reference => $has_ref,
4299 wakaba 1.1 line => $self->{line_prev},
4300     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4301     });
4302     redo A;
4303     } else {
4304    
4305     $self->{ca}->{value} .= $data;
4306     $self->{ca}->{has_reference} = 1 if $has_ref;
4307     $self->{state} = $self->{prev_state};
4308 wakaba 1.5 $self->{s_kwd} = '';
4309 wakaba 1.1 ## Reconsume.
4310     redo A;
4311     }
4312 wakaba 1.8
4313     ## XML-only states
4314    
4315     } elsif ($self->{state} == PI_STATE) {
4316     if ($is_space->{$self->{nc}} or
4317     $self->{nc} == 0x003F or # ? ## XML5: Same as "Anything else"
4318     $self->{nc} == -1) {
4319     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type
4320     line => $self->{line_prev},
4321     column => $self->{column_prev}
4322     - 1 * ($self->{nc} != -1));
4323     $self->{state} = BOGUS_COMMENT_STATE;
4324     ## Reconsume.
4325     $self->{ct} = {type => COMMENT_TOKEN,
4326     data => '?',
4327     line => $self->{line_prev},
4328     column => $self->{column_prev}
4329     - 1 * ($self->{nc} != -1),
4330     };
4331     redo A;
4332     } else {
4333     $self->{ct} = {type => PI_TOKEN,
4334     target => chr $self->{nc},
4335     data => '',
4336     line => $self->{line_prev},
4337     column => $self->{column_prev} - 1,
4338     };
4339     $self->{state} = PI_TARGET_STATE;
4340    
4341     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4342     $self->{line_prev} = $self->{line};
4343     $self->{column_prev} = $self->{column};
4344     $self->{column}++;
4345     $self->{nc}
4346     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4347     } else {
4348     $self->{set_nc}->($self);
4349     }
4350    
4351     redo A;
4352     }
4353     } elsif ($self->{state} == PI_TARGET_STATE) {
4354     if ($is_space->{$self->{nc}}) {
4355     $self->{state} = PI_TARGET_AFTER_STATE;
4356    
4357     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4358     $self->{line_prev} = $self->{line};
4359     $self->{column_prev} = $self->{column};
4360     $self->{column}++;
4361     $self->{nc}
4362     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4363     } else {
4364     $self->{set_nc}->($self);
4365     }
4366    
4367     redo A;
4368     } elsif ($self->{nc} == -1) {
4369     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4370     $self->{state} = DATA_STATE;
4371     $self->{s_kwd} = '';
4372     ## Reconsume.
4373     return ($self->{ct}); # pi
4374     redo A;
4375     } elsif ($self->{nc} == 0x003F) { # ?
4376     $self->{state} = PI_AFTER_STATE;
4377    
4378     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4379     $self->{line_prev} = $self->{line};
4380     $self->{column_prev} = $self->{column};
4381     $self->{column}++;
4382     $self->{nc}
4383     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4384     } else {
4385     $self->{set_nc}->($self);
4386     }
4387    
4388     redo A;
4389     } else {
4390     ## XML5: typo ("tag name" -> "target")
4391     $self->{ct}->{target} .= chr $self->{nc}; # pi
4392    
4393     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4394     $self->{line_prev} = $self->{line};
4395     $self->{column_prev} = $self->{column};
4396     $self->{column}++;
4397     $self->{nc}
4398     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4399     } else {
4400     $self->{set_nc}->($self);
4401     }
4402    
4403     redo A;
4404     }
4405     } elsif ($self->{state} == PI_TARGET_AFTER_STATE) {
4406     if ($is_space->{$self->{nc}}) {
4407     ## Stay in the state.
4408    
4409     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4410     $self->{line_prev} = $self->{line};
4411     $self->{column_prev} = $self->{column};
4412     $self->{column}++;
4413     $self->{nc}
4414     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4415     } else {
4416     $self->{set_nc}->($self);
4417     }
4418    
4419     redo A;
4420     } else {
4421     $self->{state} = PI_DATA_STATE;
4422     ## Reprocess.
4423     redo A;
4424     }
4425     } elsif ($self->{state} == PI_DATA_STATE) {
4426     if ($self->{nc} == 0x003F) { # ?
4427     $self->{state} = PI_DATA_AFTER_STATE;
4428    
4429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4430     $self->{line_prev} = $self->{line};
4431     $self->{column_prev} = $self->{column};
4432     $self->{column}++;
4433     $self->{nc}
4434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4435     } else {
4436     $self->{set_nc}->($self);
4437     }
4438    
4439     redo A;
4440     } elsif ($self->{nc} == -1) {
4441     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type
4442     $self->{state} = DATA_STATE;
4443     $self->{s_kwd} = '';
4444     ## Reprocess.
4445     return ($self->{ct}); # pi
4446     redo A;
4447     } else {
4448     $self->{ct}->{data} .= chr $self->{nc}; # pi
4449     $self->{read_until}->($self->{ct}->{data}, q[?],
4450     length $self->{ct}->{data});
4451     ## Stay in the state.
4452    
4453     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4454     $self->{line_prev} = $self->{line};
4455     $self->{column_prev} = $self->{column};
4456     $self->{column}++;
4457     $self->{nc}
4458     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4459     } else {
4460     $self->{set_nc}->($self);
4461     }
4462    
4463     ## Reprocess.
4464     redo A;
4465     }
4466     } elsif ($self->{state} == PI_AFTER_STATE) {
4467     if ($self->{nc} == 0x003E) { # >
4468     $self->{state} = DATA_STATE;
4469     $self->{s_kwd} = '';
4470    
4471     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4472     $self->{line_prev} = $self->{line};
4473     $self->{column_prev} = $self->{column};
4474     $self->{column}++;
4475     $self->{nc}
4476     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4477     } else {
4478     $self->{set_nc}->($self);
4479     }
4480    
4481     return ($self->{ct}); # pi
4482     redo A;
4483     } elsif ($self->{nc} == 0x003F) { # ?
4484     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4485     line => $self->{line_prev},
4486     column => $self->{column_prev}); ## XML5: no error
4487     $self->{ct}->{data} .= '?';
4488     $self->{state} = PI_DATA_AFTER_STATE;
4489    
4490     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4491     $self->{line_prev} = $self->{line};
4492     $self->{column_prev} = $self->{column};
4493     $self->{column}++;
4494     $self->{nc}
4495     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4496     } else {
4497     $self->{set_nc}->($self);
4498     }
4499    
4500     redo A;
4501     } else {
4502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type
4503     line => $self->{line_prev},
4504     column => $self->{column_prev}
4505     + 1 * ($self->{nc} == -1)); ## XML5: no error
4506     $self->{ct}->{data} .= '?'; ## XML5: not appended
4507     $self->{state} = PI_DATA_STATE;
4508     ## Reprocess.
4509     redo A;
4510     }
4511     } elsif ($self->{state} == PI_DATA_AFTER_STATE) {
4512     ## XML5: Same as "pi after state" in XML5
4513     if ($self->{nc} == 0x003E) { # >
4514     $self->{state} = DATA_STATE;
4515     $self->{s_kwd} = '';
4516    
4517     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4518     $self->{line_prev} = $self->{line};
4519     $self->{column_prev} = $self->{column};
4520     $self->{column}++;
4521     $self->{nc}
4522     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4523     } else {
4524     $self->{set_nc}->($self);
4525     }
4526    
4527     return ($self->{ct}); # pi
4528     redo A;
4529     } elsif ($self->{nc} == 0x003F) { # ?
4530     $self->{ct}->{data} .= '?';
4531     ## Stay in the state.
4532    
4533     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4534     $self->{line_prev} = $self->{line};
4535     $self->{column_prev} = $self->{column};
4536     $self->{column}++;
4537     $self->{nc}
4538     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4539     } else {
4540     $self->{set_nc}->($self);
4541     }
4542    
4543     redo A;
4544     } else {
4545     $self->{ct}->{data} .= '?'; ## XML5: not appended
4546     $self->{state} = PI_DATA_STATE;
4547     ## Reprocess.
4548     redo A;
4549     }
4550    
4551 wakaba 1.1 } else {
4552     die "$0: $self->{state}: Unknown state";
4553     }
4554     } # A
4555    
4556     die "$0: _get_next_token: unexpected case";
4557     } # _get_next_token
4558    
4559     1;
4560 wakaba 1.9 ## $Date: 2008/10/15 04:38:22 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24