/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (hide annotations) (download)
Tue Oct 14 05:34:05 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.2: +6 -4 lines
++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 05:33:48 -0000
	* Tokenizer.pm.src: Introduced "in_xml" flag for CDATA section
	support in XML.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	14 Oct 2008 05:34:00 -0000
	* Parser.pm.src: Set |in_xml| flag for tokenizer.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: A bug on end tag handling fixed.

2008-10-14  Wakaba  <wakaba@suika.fam.cx>

package Whatpm::HTML::Tokenizer;
use strict;

## Module version, derived from the CVS |Revision| keyword: the first
## number is printed as is, each following number as two digits.
our $VERSION = do {
  my @rev = (q$Revision: 1.2 $ =~ /\d+/g);
  sprintf "%d." . "%02d" x $#rev, @rev;
};

## Exporter setup.  Every token type constant can be imported by name
## or all at once through the |:token| tag.
BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## The same names are offered individually and as a tag, so the list
  ## is written only once.
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
  );

  our @EXPORT_OK = @token_types;
  our %EXPORT_TAGS = (token => [@token_types]);
}

## Token types
##
## Each constant is the value stored in the |type| field of a token.

sub DOCTYPE_TOKEN     () { 1 }
sub COMMENT_TOKEN     () { 2 }
sub START_TAG_TOKEN   () { 3 }
sub END_TAG_TOKEN     () { 4 }
sub END_OF_FILE_TOKEN () { 5 }
sub CHARACTER_TOKEN   () { 6 }
sub PI_TOKEN          () { 7 } # XML5
sub ABORT_TOKEN       () { 8 } # Not a token actually
44 wakaba 1.1
45     package Whatpm::HTML;
46    
47 wakaba 1.2 BEGIN { Whatpm::HTML::Tokenizer->import (':token') }
48    
## Content model flags
##
## Each flag is a single bit; a content model is a combination of them.

sub CM_ENTITY         () { 1 << 0 } # & markup in data
sub CM_LIMITED_MARKUP () { 1 << 1 } # < markup in data (limited)
sub CM_FULL_MARKUP    () { 1 << 2 } # < markup in data (any)

## Content models, from the most to the least markup recognized.

sub PCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_FULL_MARKUP }
sub RCDATA_CONTENT_MODEL    () { CM_ENTITY | CM_LIMITED_MARKUP }
sub CDATA_CONTENT_MODEL     () { CM_LIMITED_MARKUP }
sub PLAINTEXT_CONTENT_MODEL () { 0 }
59    
## Tokenizer states
##
## Each constant is a value for |$self->{state}|.  The numbers 1 and 12
## are not in use: the states that had them are commented out below.

## Data and tags
sub DATA_STATE                          () { 0 }
#sub ENTITY_DATA_STATE                  () { 1 }
sub TAG_OPEN_STATE                      () { 2 }
sub CLOSE_TAG_OPEN_STATE                () { 3 }
sub TAG_NAME_STATE                      () { 4 }

## Attributes
sub BEFORE_ATTRIBUTE_NAME_STATE         () { 5 }
sub ATTRIBUTE_NAME_STATE                () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE          () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE        () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE      () { 11 }
#sub ENTITY_IN_ATTRIBUTE_VALUE_STATE    () { 12 }

## Markup declarations and comments
sub MARKUP_DECLARATION_OPEN_STATE       () { 13 }
sub COMMENT_START_STATE                 () { 14 }
sub COMMENT_START_DASH_STATE            () { 15 }
sub COMMENT_STATE                       () { 16 }
sub COMMENT_END_STATE                   () { 17 }
sub COMMENT_END_DASH_STATE              () { 18 }
sub BOGUS_COMMENT_STATE                 () { 19 }

## DOCTYPE
sub DOCTYPE_STATE                                 () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE                     () { 21 }
sub DOCTYPE_NAME_STATE                            () { 22 }
sub AFTER_DOCTYPE_NAME_STATE                      () { 23 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         () { 31 }
sub BOGUS_DOCTYPE_STATE                           () { 32 }

## After a quoted attribute value, self-closing tags, CDATA sections
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE  () { 33 }
sub SELF_CLOSING_START_TAG_STATE        () { 34 }
sub CDATA_SECTION_STATE                 () { 35 }

## States that split a single state of the spec into several steps.
## The trailing comment names the spec state each one belongs to.
sub MD_HYPHEN_STATE              () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE             () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE               () { 38 } # "markup declaration open state" in the spec
sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE     () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE     () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE                 () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE                 () { 43 } # "after DOCTYPE name state" in the spec

## NOTE: The "entity data state", the "entity in attribute value state",
## and the "consume a character reference" algorithm of the spec are
## jointly implemented by the following six states:
sub ENTITY_STATE      () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE     () { 46 }
sub HEXREF_X_STATE    () { 47 }
sub HEXREF_HEX_STATE  () { 48 }
sub ENTITY_NAME_STATE () { 49 }

sub PCDATA_STATE      () { 50 } # "data state" in the spec
116    
## Tree constructor state constants (see Whatpm::HTML for the full
## list and descriptions)
##
## NOTE: Both constants below have the same numeric value (bit 11).

sub IN_FOREIGN_CONTENT_IM () { 1 << 11 }
sub FOREIGN_EL            () { 1 << 11 }
122    
## Character reference mappings
##
## |$charref_map| maps a code point to the code point that replaces it.
## NOTE(review): the code that consults this table is not part of this
## section; presumably it is applied to numeric character references.

my $charref_map = {
  0x0D => 0x000A,

  ## 0x80..0x9F.  The replacements appear to follow Windows-1252, with
  ## U+FFFD for the bytes that have no assignment there.
  0x80 => 0x20AC,
  0x81 => 0xFFFD,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0xFFFD,
  0x8E => 0x017D,
  0x8F => 0xFFFD,
  0x90 => 0xFFFD,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0xFFFD,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map

## Code points that are replaced by U+FFFD REPLACEMENT CHARACTER.
$charref_map->{$_} = 0xFFFD
    for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xD800..0xDFFF,
        ## The Unicode noncharacter block is U+FDD0..U+FDEF (32 code
        ## points).  This range used to stop at 0xFDDF, which left
        ## U+FDE0..U+FDEF unmapped.
        0xFDD0..0xFDEF,
        ## The last two code points of each of the 17 planes
        ## (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ..., U+10FFFE, U+10FFFF).
        (map { (($_ << 16) | 0xFFFE, ($_ << 16) | 0xFFFF) } 0x00..0x10);
168    
169     ## Implementations MUST act as if they used the state machine in the spec.
170    
## Resets the tokenizer fields of |$self| to their initial values and
## loads the first input character through the |set_nc| callback.
##
## Returns the value of the last assignment, i.e. the new (empty)
## |$self->{token}| array reference.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  #$self->{s_kwd}; # state keyword - initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  $self->{content_model} = PCDATA_CONTENT_MODEL; # initial content model
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## Read the first input character.  The character buffer has just
  ## been emptied, so there is never a buffered character to take here
  ## and the |set_nc| callback is always the one to supply it.
  $self->{set_nc}->($self);

  $self->{token} = []; # tokens waiting to be returned
  # $self->{escape}
} # _initialize_tokenizer
208    
209     ## A token has:
210     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
211     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
212     ## ->{name} (DOCTYPE_TOKEN)
213     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
214     ## ->{pubid} (DOCTYPE_TOKEN)
215     ## ->{sysid} (DOCTYPE_TOKEN)
216     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
217     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
218     ## ->{name}
219     ## ->{value}
220     ## ->{has_reference} == 1 or 0
221     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
222     ## NOTE: The "self-closing flag" is held in |$self->{self_closing}|.
223     ## The token's own |->{self_closing}| field is used to save the value of
224     ## |$self->{self_closing}| while the token is pushed back to the stack.
225    
226     ## Emitted token MUST immediately be handled by the tree construction state.
227    
228     ## Before each step, UA MAY check to see if either one of the scripts in
229     ## "list of scripts that will execute as soon as possible" or the first
230     ## script in the "list of scripts that will execute asynchronously",
231     ## has completed loading. If one has, then it MUST be executed
232     ## and removed from the list.
233    
234     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
235     ## (This requirement was dropped from HTML5 spec, unfortunately.)
236    
## Code points treated as white space by the tokenizer.  The hash maps
## each such code point to a true value; any other code point is absent.
my $is_space = +{
  map { ($_ => 1) } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    ## 0x000B LINE TABULATION (VT) is not a space character.
    0x000C, # FORM FEED (FF)
    ## 0x000D CARRIAGE RETURN (CR) is intentionally not listed.
    0x0020, # SPACE (SP)
  )
};
245    
246     sub _get_next_token ($) {
247     my $self = shift;
248    
249     if ($self->{self_closing}) {
250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
251     ## NOTE: The |self_closing| flag is only set by start tag token.
252     ## In addition, when a start tag token is emitted, it is always set to
253     ## |ct|.
254     delete $self->{self_closing};
255     }
256    
257     if (@{$self->{token}}) {
258     $self->{self_closing} = $self->{token}->[0]->{self_closing};
259     return shift @{$self->{token}};
260     }
261    
262     A: {
263     if ($self->{state} == PCDATA_STATE) {
264     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
265    
266     if ($self->{nc} == 0x0026) { # &
267    
268     ## NOTE: In the spec, the tokenizer is switched to the
269     ## "entity data state". In this implementation, the tokenizer
270     ## is switched to the |ENTITY_STATE|, which is an implementation
271     ## of the "consume a character reference" algorithm.
272     $self->{entity_add} = -1;
273     $self->{prev_state} = DATA_STATE;
274     $self->{state} = ENTITY_STATE;
275    
276     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
277     $self->{line_prev} = $self->{line};
278     $self->{column_prev} = $self->{column};
279     $self->{column}++;
280     $self->{nc}
281     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
282     } else {
283     $self->{set_nc}->($self);
284     }
285    
286     redo A;
287     } elsif ($self->{nc} == 0x003C) { # <
288    
289     $self->{state} = TAG_OPEN_STATE;
290    
291     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
292     $self->{line_prev} = $self->{line};
293     $self->{column_prev} = $self->{column};
294     $self->{column}++;
295     $self->{nc}
296     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
297     } else {
298     $self->{set_nc}->($self);
299     }
300    
301     redo A;
302     } elsif ($self->{nc} == -1) {
303    
304     return ({type => END_OF_FILE_TOKEN,
305     line => $self->{line}, column => $self->{column}});
306     last A; ## TODO: ok?
307     } else {
308    
309     #
310     }
311    
312     # Anything else
313     my $token = {type => CHARACTER_TOKEN,
314     data => chr $self->{nc},
315     line => $self->{line}, column => $self->{column},
316     };
317     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
318    
319     ## Stay in the state.
320    
321     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
322     $self->{line_prev} = $self->{line};
323     $self->{column_prev} = $self->{column};
324     $self->{column}++;
325     $self->{nc}
326     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
327     } else {
328     $self->{set_nc}->($self);
329     }
330    
331     return ($token);
332     redo A;
333     } elsif ($self->{state} == DATA_STATE) {
334     $self->{s_kwd} = '' unless defined $self->{s_kwd};
335     if ($self->{nc} == 0x0026) { # &
336     $self->{s_kwd} = '';
337     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
338     not $self->{escape}) {
339    
340     ## NOTE: In the spec, the tokenizer is switched to the
341     ## "entity data state". In this implementation, the tokenizer
342     ## is switched to the |ENTITY_STATE|, which is an implementation
343     ## of the "consume a character reference" algorithm.
344     $self->{entity_add} = -1;
345     $self->{prev_state} = DATA_STATE;
346     $self->{state} = ENTITY_STATE;
347    
348     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
349     $self->{line_prev} = $self->{line};
350     $self->{column_prev} = $self->{column};
351     $self->{column}++;
352     $self->{nc}
353     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
354     } else {
355     $self->{set_nc}->($self);
356     }
357    
358     redo A;
359     } else {
360    
361     #
362     }
363     } elsif ($self->{nc} == 0x002D) { # -
364     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
365     $self->{s_kwd} .= '-';
366    
367     if ($self->{s_kwd} eq '<!--') {
368    
369     $self->{escape} = 1; # unless $self->{escape};
370     $self->{s_kwd} = '--';
371     #
372     } elsif ($self->{s_kwd} eq '---') {
373    
374     $self->{s_kwd} = '--';
375     #
376     } else {
377    
378     #
379     }
380     }
381    
382     #
383     } elsif ($self->{nc} == 0x0021) { # !
384     if (length $self->{s_kwd}) {
385    
386     $self->{s_kwd} .= '!';
387     #
388     } else {
389    
390     #$self->{s_kwd} = '';
391     #
392     }
393     #
394     } elsif ($self->{nc} == 0x003C) { # <
395     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
396     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
397     not $self->{escape})) {
398    
399     $self->{state} = TAG_OPEN_STATE;
400    
401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
402     $self->{line_prev} = $self->{line};
403     $self->{column_prev} = $self->{column};
404     $self->{column}++;
405     $self->{nc}
406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
407     } else {
408     $self->{set_nc}->($self);
409     }
410    
411     redo A;
412     } else {
413    
414     $self->{s_kwd} = '';
415     #
416     }
417     } elsif ($self->{nc} == 0x003E) { # >
418     if ($self->{escape} and
419     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
420     if ($self->{s_kwd} eq '--') {
421    
422     delete $self->{escape};
423     } else {
424    
425     }
426     } else {
427    
428     }
429    
430     $self->{s_kwd} = '';
431     #
432     } elsif ($self->{nc} == -1) {
433    
434     $self->{s_kwd} = '';
435     return ({type => END_OF_FILE_TOKEN,
436     line => $self->{line}, column => $self->{column}});
437     last A; ## TODO: ok?
438     } else {
439    
440     $self->{s_kwd} = '';
441     #
442     }
443    
444     # Anything else
445     my $token = {type => CHARACTER_TOKEN,
446     data => chr $self->{nc},
447     line => $self->{line}, column => $self->{column},
448     };
449     if ($self->{read_until}->($token->{data}, q[-!<>&],
450     length $token->{data})) {
451     $self->{s_kwd} = '';
452     }
453    
454     ## Stay in the data state.
455     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
456    
457     $self->{state} = PCDATA_STATE;
458     } else {
459    
460     ## Stay in the state.
461     }
462    
463     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
464     $self->{line_prev} = $self->{line};
465     $self->{column_prev} = $self->{column};
466     $self->{column}++;
467     $self->{nc}
468     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
469     } else {
470     $self->{set_nc}->($self);
471     }
472    
473     return ($token);
474     redo A;
475     } elsif ($self->{state} == TAG_OPEN_STATE) {
476     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
477     if ($self->{nc} == 0x002F) { # /
478    
479    
480     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
481     $self->{line_prev} = $self->{line};
482     $self->{column_prev} = $self->{column};
483     $self->{column}++;
484     $self->{nc}
485     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
486     } else {
487     $self->{set_nc}->($self);
488     }
489    
490     $self->{state} = CLOSE_TAG_OPEN_STATE;
491     redo A;
492     } elsif ($self->{nc} == 0x0021) { # !
493    
494     $self->{s_kwd} = '<' unless $self->{escape};
495     #
496     } else {
497    
498     #
499     }
500    
501     ## reconsume
502     $self->{state} = DATA_STATE;
503     return ({type => CHARACTER_TOKEN, data => '<',
504     line => $self->{line_prev},
505     column => $self->{column_prev},
506     });
507     redo A;
508     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
509     if ($self->{nc} == 0x0021) { # !
510    
511     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
512    
513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
514     $self->{line_prev} = $self->{line};
515     $self->{column_prev} = $self->{column};
516     $self->{column}++;
517     $self->{nc}
518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
519     } else {
520     $self->{set_nc}->($self);
521     }
522    
523     redo A;
524     } elsif ($self->{nc} == 0x002F) { # /
525    
526     $self->{state} = CLOSE_TAG_OPEN_STATE;
527    
528     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
529     $self->{line_prev} = $self->{line};
530     $self->{column_prev} = $self->{column};
531     $self->{column}++;
532     $self->{nc}
533     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
534     } else {
535     $self->{set_nc}->($self);
536     }
537    
538     redo A;
539     } elsif (0x0041 <= $self->{nc} and
540     $self->{nc} <= 0x005A) { # A..Z
541    
542     $self->{ct}
543     = {type => START_TAG_TOKEN,
544     tag_name => chr ($self->{nc} + 0x0020),
545     line => $self->{line_prev},
546     column => $self->{column_prev}};
547     $self->{state} = TAG_NAME_STATE;
548    
549     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
550     $self->{line_prev} = $self->{line};
551     $self->{column_prev} = $self->{column};
552     $self->{column}++;
553     $self->{nc}
554     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
555     } else {
556     $self->{set_nc}->($self);
557     }
558    
559     redo A;
560     } elsif (0x0061 <= $self->{nc} and
561     $self->{nc} <= 0x007A) { # a..z
562    
563     $self->{ct} = {type => START_TAG_TOKEN,
564     tag_name => chr ($self->{nc}),
565     line => $self->{line_prev},
566     column => $self->{column_prev}};
567     $self->{state} = TAG_NAME_STATE;
568    
569     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
570     $self->{line_prev} = $self->{line};
571     $self->{column_prev} = $self->{column};
572     $self->{column}++;
573     $self->{nc}
574     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
575     } else {
576     $self->{set_nc}->($self);
577     }
578    
579     redo A;
580     } elsif ($self->{nc} == 0x003E) { # >
581    
582     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
583     line => $self->{line_prev},
584     column => $self->{column_prev});
585     $self->{state} = DATA_STATE;
586    
587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
588     $self->{line_prev} = $self->{line};
589     $self->{column_prev} = $self->{column};
590     $self->{column}++;
591     $self->{nc}
592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
593     } else {
594     $self->{set_nc}->($self);
595     }
596    
597    
598     return ({type => CHARACTER_TOKEN, data => '<>',
599     line => $self->{line_prev},
600     column => $self->{column_prev},
601     });
602    
603     redo A;
604     } elsif ($self->{nc} == 0x003F) { # ?
605    
606     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
607     line => $self->{line_prev},
608     column => $self->{column_prev});
609     $self->{state} = BOGUS_COMMENT_STATE;
610     $self->{ct} = {type => COMMENT_TOKEN, data => '',
611     line => $self->{line_prev},
612     column => $self->{column_prev},
613     };
614     ## $self->{nc} is intentionally left as is
615     redo A;
616     } else {
617    
618     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
619     line => $self->{line_prev},
620     column => $self->{column_prev});
621     $self->{state} = DATA_STATE;
622     ## reconsume
623    
624     return ({type => CHARACTER_TOKEN, data => '<',
625     line => $self->{line_prev},
626     column => $self->{column_prev},
627     });
628    
629     redo A;
630     }
631     } else {
632     die "$0: $self->{content_model} in tag open";
633     }
634     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
635     ## NOTE: The "close tag open state" in the spec is implemented as
636     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
637    
638     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
639     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
640     if (defined $self->{last_stag_name}) {
641     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
642     $self->{s_kwd} = '';
643     ## Reconsume.
644     redo A;
645     } else {
646     ## No start tag token has ever been emitted
647     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
648    
649     $self->{state} = DATA_STATE;
650     ## Reconsume.
651     return ({type => CHARACTER_TOKEN, data => '</',
652     line => $l, column => $c,
653     });
654     redo A;
655     }
656     }
657    
658     if (0x0041 <= $self->{nc} and
659     $self->{nc} <= 0x005A) { # A..Z
660    
661     $self->{ct}
662     = {type => END_TAG_TOKEN,
663     tag_name => chr ($self->{nc} + 0x0020),
664     line => $l, column => $c};
665     $self->{state} = TAG_NAME_STATE;
666    
667     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
668     $self->{line_prev} = $self->{line};
669     $self->{column_prev} = $self->{column};
670     $self->{column}++;
671     $self->{nc}
672     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
673     } else {
674     $self->{set_nc}->($self);
675     }
676    
677     redo A;
678     } elsif (0x0061 <= $self->{nc} and
679     $self->{nc} <= 0x007A) { # a..z
680    
681     $self->{ct} = {type => END_TAG_TOKEN,
682     tag_name => chr ($self->{nc}),
683     line => $l, column => $c};
684     $self->{state} = TAG_NAME_STATE;
685    
686     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
687     $self->{line_prev} = $self->{line};
688     $self->{column_prev} = $self->{column};
689     $self->{column}++;
690     $self->{nc}
691     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
692     } else {
693     $self->{set_nc}->($self);
694     }
695    
696     redo A;
697     } elsif ($self->{nc} == 0x003E) { # >
698    
699     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
700     line => $self->{line_prev}, ## "<" in "</>"
701     column => $self->{column_prev} - 1);
702     $self->{state} = DATA_STATE;
703    
704     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
705     $self->{line_prev} = $self->{line};
706     $self->{column_prev} = $self->{column};
707     $self->{column}++;
708     $self->{nc}
709     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
710     } else {
711     $self->{set_nc}->($self);
712     }
713    
714     redo A;
715     } elsif ($self->{nc} == -1) {
716    
717     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
718     $self->{state} = DATA_STATE;
719     # reconsume
720    
721     return ({type => CHARACTER_TOKEN, data => '</',
722     line => $l, column => $c,
723     });
724    
725     redo A;
726     } else {
727    
728     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
729     $self->{state} = BOGUS_COMMENT_STATE;
730     $self->{ct} = {type => COMMENT_TOKEN, data => '',
731     line => $self->{line_prev}, # "<" of "</"
732     column => $self->{column_prev} - 1,
733     };
734     ## NOTE: $self->{nc} is intentionally left as is.
735     ## Although the "anything else" case of the spec not explicitly
736     ## states that the next input character is to be reconsumed,
737     ## it will be included to the |data| of the comment token
738     ## generated from the bogus end tag, as defined in the
739     ## "bogus comment state" entry.
740     redo A;
741     }
742     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
743     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
744     if (length $ch) {
745     my $CH = $ch;
746     $ch =~ tr/a-z/A-Z/;
747     my $nch = chr $self->{nc};
748     if ($nch eq $ch or $nch eq $CH) {
749    
750     ## Stay in the state.
751     $self->{s_kwd} .= $nch;
752    
753     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
754     $self->{line_prev} = $self->{line};
755     $self->{column_prev} = $self->{column};
756     $self->{column}++;
757     $self->{nc}
758     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
759     } else {
760     $self->{set_nc}->($self);
761     }
762    
763     redo A;
764     } else {
765    
766     $self->{state} = DATA_STATE;
767     ## Reconsume.
768     return ({type => CHARACTER_TOKEN,
769     data => '</' . $self->{s_kwd},
770     line => $self->{line_prev},
771     column => $self->{column_prev} - 1 - length $self->{s_kwd},
772     });
773     redo A;
774     }
775     } else { # after "<{tag-name}"
776     unless ($is_space->{$self->{nc}} or
777     {
778     0x003E => 1, # >
779     0x002F => 1, # /
780     -1 => 1, # EOF
781     }->{$self->{nc}}) {
782    
783     ## Reconsume.
784     $self->{state} = DATA_STATE;
785     return ({type => CHARACTER_TOKEN,
786     data => '</' . $self->{s_kwd},
787     line => $self->{line_prev},
788     column => $self->{column_prev} - 1 - length $self->{s_kwd},
789     });
790     redo A;
791     } else {
792    
793     $self->{ct}
794     = {type => END_TAG_TOKEN,
795     tag_name => $self->{last_stag_name},
796     line => $self->{line_prev},
797     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
798     $self->{state} = TAG_NAME_STATE;
799     ## Reconsume.
800     redo A;
801     }
802     }
803     } elsif ($self->{state} == TAG_NAME_STATE) {
804     if ($is_space->{$self->{nc}}) {
805    
806     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
807    
808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
809     $self->{line_prev} = $self->{line};
810     $self->{column_prev} = $self->{column};
811     $self->{column}++;
812     $self->{nc}
813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
814     } else {
815     $self->{set_nc}->($self);
816     }
817    
818     redo A;
819     } elsif ($self->{nc} == 0x003E) { # >
820     if ($self->{ct}->{type} == START_TAG_TOKEN) {
821    
822     $self->{last_stag_name} = $self->{ct}->{tag_name};
823     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
824     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
825     #if ($self->{ct}->{attributes}) {
826     # ## NOTE: This should never be reached.
827     # !!! cp (36);
828     # !!! parse-error (type => 'end tag attribute');
829     #} else {
830    
831     #}
832     } else {
833     die "$0: $self->{ct}->{type}: Unknown token type";
834     }
835     $self->{state} = DATA_STATE;
836    
837     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
838     $self->{line_prev} = $self->{line};
839     $self->{column_prev} = $self->{column};
840     $self->{column}++;
841     $self->{nc}
842     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
843     } else {
844     $self->{set_nc}->($self);
845     }
846    
847    
848     return ($self->{ct}); # start tag or end tag
849    
850     redo A;
851     } elsif (0x0041 <= $self->{nc} and
852     $self->{nc} <= 0x005A) { # A..Z
853    
854     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
855     # start tag or end tag
856     ## Stay in this state
857    
858     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
859     $self->{line_prev} = $self->{line};
860     $self->{column_prev} = $self->{column};
861     $self->{column}++;
862     $self->{nc}
863     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
864     } else {
865     $self->{set_nc}->($self);
866     }
867    
868     redo A;
869     } elsif ($self->{nc} == -1) {
870     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
871     if ($self->{ct}->{type} == START_TAG_TOKEN) {
872    
873     $self->{last_stag_name} = $self->{ct}->{tag_name};
874     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
875     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
876     #if ($self->{ct}->{attributes}) {
877     # ## NOTE: This state should never be reached.
878     # !!! cp (40);
879     # !!! parse-error (type => 'end tag attribute');
880     #} else {
881    
882     #}
883     } else {
884     die "$0: $self->{ct}->{type}: Unknown token type";
885     }
886     $self->{state} = DATA_STATE;
887     # reconsume
888    
889     return ($self->{ct}); # start tag or end tag
890    
891     redo A;
892     } elsif ($self->{nc} == 0x002F) { # /
893    
894     $self->{state} = SELF_CLOSING_START_TAG_STATE;
895    
896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
897     $self->{line_prev} = $self->{line};
898     $self->{column_prev} = $self->{column};
899     $self->{column}++;
900     $self->{nc}
901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
902     } else {
903     $self->{set_nc}->($self);
904     }
905    
906     redo A;
907     } else {
908    
909     $self->{ct}->{tag_name} .= chr $self->{nc};
910     # start tag or end tag
911     ## Stay in the state
912    
913     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
914     $self->{line_prev} = $self->{line};
915     $self->{column_prev} = $self->{column};
916     $self->{column}++;
917     $self->{nc}
918     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
919     } else {
920     $self->{set_nc}->($self);
921     }
922    
923     redo A;
924     }
925     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
926     if ($is_space->{$self->{nc}}) {
927    
928     ## Stay in the state
929    
930     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
931     $self->{line_prev} = $self->{line};
932     $self->{column_prev} = $self->{column};
933     $self->{column}++;
934     $self->{nc}
935     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
936     } else {
937     $self->{set_nc}->($self);
938     }
939    
940     redo A;
941     } elsif ($self->{nc} == 0x003E) { # >
942     if ($self->{ct}->{type} == START_TAG_TOKEN) {
943    
944     $self->{last_stag_name} = $self->{ct}->{tag_name};
945     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
946     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
947     if ($self->{ct}->{attributes}) {
948    
949     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
950     } else {
951    
952     }
953     } else {
954     die "$0: $self->{ct}->{type}: Unknown token type";
955     }
956     $self->{state} = DATA_STATE;
957    
958     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
959     $self->{line_prev} = $self->{line};
960     $self->{column_prev} = $self->{column};
961     $self->{column}++;
962     $self->{nc}
963     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
964     } else {
965     $self->{set_nc}->($self);
966     }
967    
968    
969     return ($self->{ct}); # start tag or end tag
970    
971     redo A;
972     } elsif (0x0041 <= $self->{nc} and
973     $self->{nc} <= 0x005A) { # A..Z
974    
975     $self->{ca}
976     = {name => chr ($self->{nc} + 0x0020),
977     value => '',
978     line => $self->{line}, column => $self->{column}};
979     $self->{state} = ATTRIBUTE_NAME_STATE;
980    
981     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
982     $self->{line_prev} = $self->{line};
983     $self->{column_prev} = $self->{column};
984     $self->{column}++;
985     $self->{nc}
986     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
987     } else {
988     $self->{set_nc}->($self);
989     }
990    
991     redo A;
992     } elsif ($self->{nc} == 0x002F) { # /
993    
994     $self->{state} = SELF_CLOSING_START_TAG_STATE;
995    
996     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
997     $self->{line_prev} = $self->{line};
998     $self->{column_prev} = $self->{column};
999     $self->{column}++;
1000     $self->{nc}
1001     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1002     } else {
1003     $self->{set_nc}->($self);
1004     }
1005    
1006     redo A;
1007     } elsif ($self->{nc} == -1) {
1008     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1009     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1010    
1011     $self->{last_stag_name} = $self->{ct}->{tag_name};
1012     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1013     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1014     if ($self->{ct}->{attributes}) {
1015    
1016     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1017     } else {
1018    
1019     }
1020     } else {
1021     die "$0: $self->{ct}->{type}: Unknown token type";
1022     }
1023     $self->{state} = DATA_STATE;
1024     # reconsume
1025    
1026     return ($self->{ct}); # start tag or end tag
1027    
1028     redo A;
1029     } else {
1030     if ({
1031     0x0022 => 1, # "
1032     0x0027 => 1, # '
1033     0x003D => 1, # =
1034     }->{$self->{nc}}) {
1035    
1036     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1037     } else {
1038    
1039     }
1040     $self->{ca}
1041     = {name => chr ($self->{nc}),
1042     value => '',
1043     line => $self->{line}, column => $self->{column}};
1044     $self->{state} = ATTRIBUTE_NAME_STATE;
1045    
1046     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1047     $self->{line_prev} = $self->{line};
1048     $self->{column_prev} = $self->{column};
1049     $self->{column}++;
1050     $self->{nc}
1051     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1052     } else {
1053     $self->{set_nc}->($self);
1054     }
1055    
1056     redo A;
1057     }
1058     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1059     my $before_leave = sub { # Run on every transition out of this state: files the attribute being built ($self->{ca}) under the current tag token ($self->{ct}).
1060     if (exists $self->{ct}->{attributes} # start tag or end tag
1061     ->{$self->{ca}->{name}}) { # MUST
1062     ## The name is already present on this tag: report it at the position recorded in $self->{ca}; nothing is stored, so the earlier attribute stays.
1063     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1064     ## Discard $self->{ca} # MUST
1065     } else {
1066     ## First occurrence of this name: store the attribute on the tag token, keyed by its name. After this the attributes slot of the token is a non-empty hash.
1067     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1068     = $self->{ca};
1069     }
1070     }; # $before_leave
1071    
1072     if ($is_space->{$self->{nc}}) {
1073    
1074     $before_leave->();
1075     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1076    
1077     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1078     $self->{line_prev} = $self->{line};
1079     $self->{column_prev} = $self->{column};
1080     $self->{column}++;
1081     $self->{nc}
1082     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1083     } else {
1084     $self->{set_nc}->($self);
1085     }
1086    
1087     redo A;
1088     } elsif ($self->{nc} == 0x003D) { # =
1089    
1090     $before_leave->();
1091     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1092    
1093     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1094     $self->{line_prev} = $self->{line};
1095     $self->{column_prev} = $self->{column};
1096     $self->{column}++;
1097     $self->{nc}
1098     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1099     } else {
1100     $self->{set_nc}->($self);
1101     }
1102    
1103     redo A;
1104     } elsif ($self->{nc} == 0x003E) { # >
1105     $before_leave->();
1106     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1107    
1108     $self->{last_stag_name} = $self->{ct}->{tag_name};
1109     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1110    
1111     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1112     if ($self->{ct}->{attributes}) {
1113     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1114     }
1115     } else {
1116     die "$0: $self->{ct}->{type}: Unknown token type";
1117     }
1118     $self->{state} = DATA_STATE;
1119    
1120     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1121     $self->{line_prev} = $self->{line};
1122     $self->{column_prev} = $self->{column};
1123     $self->{column}++;
1124     $self->{nc}
1125     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1126     } else {
1127     $self->{set_nc}->($self);
1128     }
1129    
1130    
1131     return ($self->{ct}); # start tag or end tag
1132    
1133     redo A;
1134     } elsif (0x0041 <= $self->{nc} and
1135     $self->{nc} <= 0x005A) { # A..Z
1136    
1137     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1138     ## Stay in the state
1139    
1140     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1141     $self->{line_prev} = $self->{line};
1142     $self->{column_prev} = $self->{column};
1143     $self->{column}++;
1144     $self->{nc}
1145     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1146     } else {
1147     $self->{set_nc}->($self);
1148     }
1149    
1150     redo A;
1151     } elsif ($self->{nc} == 0x002F) { # /
1152    
1153     $before_leave->();
1154     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1155    
1156     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1157     $self->{line_prev} = $self->{line};
1158     $self->{column_prev} = $self->{column};
1159     $self->{column}++;
1160     $self->{nc}
1161     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1162     } else {
1163     $self->{set_nc}->($self);
1164     }
1165    
1166     redo A;
1167     } elsif ($self->{nc} == -1) {
1168     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1169     $before_leave->();
1170     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1171    
1172     $self->{last_stag_name} = $self->{ct}->{tag_name};
1173     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1174     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1175     if ($self->{ct}->{attributes}) {
1176    
1177     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1178     } else {
1179     ## NOTE: This state should never be reached.
1180    
1181     }
1182     } else {
1183     die "$0: $self->{ct}->{type}: Unknown token type";
1184     }
1185     $self->{state} = DATA_STATE;
1186     # reconsume
1187    
1188     return ($self->{ct}); # start tag or end tag
1189    
1190     redo A;
1191     } else {
1192     if ($self->{nc} == 0x0022 or # "
1193     $self->{nc} == 0x0027) { # '
1194    
1195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1196     } else {
1197    
1198     }
1199     $self->{ca}->{name} .= chr ($self->{nc});
1200     ## Stay in the state
1201    
1202     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1203     $self->{line_prev} = $self->{line};
1204     $self->{column_prev} = $self->{column};
1205     $self->{column}++;
1206     $self->{nc}
1207     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1208     } else {
1209     $self->{set_nc}->($self);
1210     }
1211    
1212     redo A;
1213     }
1214     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1215     if ($is_space->{$self->{nc}}) {
1216    
1217     ## Stay in the state
1218    
1219     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1220     $self->{line_prev} = $self->{line};
1221     $self->{column_prev} = $self->{column};
1222     $self->{column}++;
1223     $self->{nc}
1224     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1225     } else {
1226     $self->{set_nc}->($self);
1227     }
1228    
1229     redo A;
1230     } elsif ($self->{nc} == 0x003D) { # =
1231    
1232     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1233    
1234     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1235     $self->{line_prev} = $self->{line};
1236     $self->{column_prev} = $self->{column};
1237     $self->{column}++;
1238     $self->{nc}
1239     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1240     } else {
1241     $self->{set_nc}->($self);
1242     }
1243    
1244     redo A;
1245     } elsif ($self->{nc} == 0x003E) { # >
1246     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1247    
1248     $self->{last_stag_name} = $self->{ct}->{tag_name};
1249     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1250     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1251     if ($self->{ct}->{attributes}) {
1252    
1253     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1254     } else {
1255     ## NOTE: This state should never be reached.
1256    
1257     }
1258     } else {
1259     die "$0: $self->{ct}->{type}: Unknown token type";
1260     }
1261     $self->{state} = DATA_STATE;
1262    
1263     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1264     $self->{line_prev} = $self->{line};
1265     $self->{column_prev} = $self->{column};
1266     $self->{column}++;
1267     $self->{nc}
1268     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1269     } else {
1270     $self->{set_nc}->($self);
1271     }
1272    
1273    
1274     return ($self->{ct}); # start tag or end tag
1275    
1276     redo A;
1277     } elsif (0x0041 <= $self->{nc} and
1278     $self->{nc} <= 0x005A) { # A..Z
1279    
1280     $self->{ca}
1281     = {name => chr ($self->{nc} + 0x0020),
1282     value => '',
1283     line => $self->{line}, column => $self->{column}};
1284     $self->{state} = ATTRIBUTE_NAME_STATE;
1285    
1286     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1287     $self->{line_prev} = $self->{line};
1288     $self->{column_prev} = $self->{column};
1289     $self->{column}++;
1290     $self->{nc}
1291     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1292     } else {
1293     $self->{set_nc}->($self);
1294     }
1295    
1296     redo A;
1297     } elsif ($self->{nc} == 0x002F) { # /
1298    
1299     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1300    
1301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1302     $self->{line_prev} = $self->{line};
1303     $self->{column_prev} = $self->{column};
1304     $self->{column}++;
1305     $self->{nc}
1306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1307     } else {
1308     $self->{set_nc}->($self);
1309     }
1310    
1311     redo A;
1312     } elsif ($self->{nc} == -1) {
1313     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1314     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1315    
1316     $self->{last_stag_name} = $self->{ct}->{tag_name};
1317     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1318     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1319     if ($self->{ct}->{attributes}) {
1320    
1321     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1322     } else {
1323     ## NOTE: This state should never be reached.
1324    
1325     }
1326     } else {
1327     die "$0: $self->{ct}->{type}: Unknown token type";
1328     }
1329     $self->{state} = DATA_STATE;
1330     # reconsume
1331    
1332     return ($self->{ct}); # start tag or end tag
1333    
1334     redo A;
1335     } else {
1336     if ($self->{nc} == 0x0022 or # "
1337     $self->{nc} == 0x0027) { # '
1338    
1339     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1340     } else {
1341    
1342     }
1343     $self->{ca}
1344     = {name => chr ($self->{nc}),
1345     value => '',
1346     line => $self->{line}, column => $self->{column}};
1347     $self->{state} = ATTRIBUTE_NAME_STATE;
1348    
1349     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1350     $self->{line_prev} = $self->{line};
1351     $self->{column_prev} = $self->{column};
1352     $self->{column}++;
1353     $self->{nc}
1354     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1355     } else {
1356     $self->{set_nc}->($self);
1357     }
1358    
1359     redo A;
1360     }
1361     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1362     if ($is_space->{$self->{nc}}) {
1363    
1364     ## Stay in the state
1365    
1366     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1367     $self->{line_prev} = $self->{line};
1368     $self->{column_prev} = $self->{column};
1369     $self->{column}++;
1370     $self->{nc}
1371     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1372     } else {
1373     $self->{set_nc}->($self);
1374     }
1375    
1376     redo A;
1377     } elsif ($self->{nc} == 0x0022) { # "
1378    
1379     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1380    
1381     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1382     $self->{line_prev} = $self->{line};
1383     $self->{column_prev} = $self->{column};
1384     $self->{column}++;
1385     $self->{nc}
1386     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1387     } else {
1388     $self->{set_nc}->($self);
1389     }
1390    
1391     redo A;
1392     } elsif ($self->{nc} == 0x0026) { # &
1393    
1394     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1395     ## reconsume
1396     redo A;
1397     } elsif ($self->{nc} == 0x0027) { # '
1398    
1399     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1400    
1401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1402     $self->{line_prev} = $self->{line};
1403     $self->{column_prev} = $self->{column};
1404     $self->{column}++;
1405     $self->{nc}
1406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1407     } else {
1408     $self->{set_nc}->($self);
1409     }
1410    
1411     redo A;
1412     } elsif ($self->{nc} == 0x003E) { # >
1413     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1414     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1415    
1416     $self->{last_stag_name} = $self->{ct}->{tag_name};
1417     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1418     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1419     if ($self->{ct}->{attributes}) {
1420    
1421     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1422     } else {
1423     ## NOTE: This state should never be reached.
1424    
1425     }
1426     } else {
1427     die "$0: $self->{ct}->{type}: Unknown token type";
1428     }
1429     $self->{state} = DATA_STATE;
1430    
1431     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1432     $self->{line_prev} = $self->{line};
1433     $self->{column_prev} = $self->{column};
1434     $self->{column}++;
1435     $self->{nc}
1436     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1437     } else {
1438     $self->{set_nc}->($self);
1439     }
1440    
1441    
1442     return ($self->{ct}); # start tag or end tag
1443    
1444     redo A;
1445     } elsif ($self->{nc} == -1) {
1446     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1447     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1448    
1449     $self->{last_stag_name} = $self->{ct}->{tag_name};
1450     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1451     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1452     if ($self->{ct}->{attributes}) {
1453    
1454     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1455     } else {
1456     ## NOTE: This state should never be reached.
1457    
1458     }
1459     } else {
1460     die "$0: $self->{ct}->{type}: Unknown token type";
1461     }
1462     $self->{state} = DATA_STATE;
1463     ## reconsume
1464    
1465     return ($self->{ct}); # start tag or end tag
1466    
1467     redo A;
1468     } else {
1469     if ($self->{nc} == 0x003D) { # =
1470    
1471     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1472     } else {
1473    
1474     }
1475     $self->{ca}->{value} .= chr ($self->{nc});
1476     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1477    
1478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1479     $self->{line_prev} = $self->{line};
1480     $self->{column_prev} = $self->{column};
1481     $self->{column}++;
1482     $self->{nc}
1483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1484     } else {
1485     $self->{set_nc}->($self);
1486     }
1487    
1488     redo A;
1489     }
1490     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1491     if ($self->{nc} == 0x0022) { # "
1492    
1493     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1494    
1495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1496     $self->{line_prev} = $self->{line};
1497     $self->{column_prev} = $self->{column};
1498     $self->{column}++;
1499     $self->{nc}
1500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1501     } else {
1502     $self->{set_nc}->($self);
1503     }
1504    
1505     redo A;
1506     } elsif ($self->{nc} == 0x0026) { # &
1507    
1508     ## NOTE: In the spec, the tokenizer is switched to the
1509     ## "entity in attribute value state". In this implementation, the
1510     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1511     ## implementation of the "consume a character reference" algorithm.
1512     $self->{prev_state} = $self->{state};
1513     $self->{entity_add} = 0x0022; # "
1514     $self->{state} = ENTITY_STATE;
1515    
1516     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1517     $self->{line_prev} = $self->{line};
1518     $self->{column_prev} = $self->{column};
1519     $self->{column}++;
1520     $self->{nc}
1521     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1522     } else {
1523     $self->{set_nc}->($self);
1524     }
1525    
1526     redo A;
1527     } elsif ($self->{nc} == -1) {
1528     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1529     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1530    
1531     $self->{last_stag_name} = $self->{ct}->{tag_name};
1532     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1533     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1534     if ($self->{ct}->{attributes}) {
1535    
1536     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1537     } else {
1538     ## NOTE: This state should never be reached.
1539    
1540     }
1541     } else {
1542     die "$0: $self->{ct}->{type}: Unknown token type";
1543     }
1544     $self->{state} = DATA_STATE;
1545     ## reconsume
1546    
1547     return ($self->{ct}); # start tag or end tag
1548    
1549     redo A;
1550     } else {
1551    
1552     $self->{ca}->{value} .= chr ($self->{nc});
1553     $self->{read_until}->($self->{ca}->{value},
1554     q["&],
1555     length $self->{ca}->{value});
1556    
1557     ## Stay in the state
1558    
1559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1560     $self->{line_prev} = $self->{line};
1561     $self->{column_prev} = $self->{column};
1562     $self->{column}++;
1563     $self->{nc}
1564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1565     } else {
1566     $self->{set_nc}->($self);
1567     }
1568    
1569     redo A;
1570     }
1571     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1572     if ($self->{nc} == 0x0027) { # '
1573    
1574     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1575    
1576     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1577     $self->{line_prev} = $self->{line};
1578     $self->{column_prev} = $self->{column};
1579     $self->{column}++;
1580     $self->{nc}
1581     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1582     } else {
1583     $self->{set_nc}->($self);
1584     }
1585    
1586     redo A;
1587     } elsif ($self->{nc} == 0x0026) { # &
1588    
1589     ## NOTE: In the spec, the tokenizer is switched to the
1590     ## "entity in attribute value state". In this implementation, the
1591     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1592     ## implementation of the "consume a character reference" algorithm.
1593     $self->{entity_add} = 0x0027; # '
1594     $self->{prev_state} = $self->{state};
1595     $self->{state} = ENTITY_STATE;
1596    
1597     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1598     $self->{line_prev} = $self->{line};
1599     $self->{column_prev} = $self->{column};
1600     $self->{column}++;
1601     $self->{nc}
1602     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1603     } else {
1604     $self->{set_nc}->($self);
1605     }
1606    
1607     redo A;
1608     } elsif ($self->{nc} == -1) {
1609     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1610     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1611    
1612     $self->{last_stag_name} = $self->{ct}->{tag_name};
1613     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1614     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1615     if ($self->{ct}->{attributes}) {
1616    
1617     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1618     } else {
1619     ## NOTE: This state should never be reached.
1620    
1621     }
1622     } else {
1623     die "$0: $self->{ct}->{type}: Unknown token type";
1624     }
1625     $self->{state} = DATA_STATE;
1626     ## reconsume
1627    
1628     return ($self->{ct}); # start tag or end tag
1629    
1630     redo A;
1631     } else {
1632    
1633     $self->{ca}->{value} .= chr ($self->{nc});
1634     $self->{read_until}->($self->{ca}->{value},
1635     q['&],
1636     length $self->{ca}->{value});
1637    
1638     ## Stay in the state
1639    
1640     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1641     $self->{line_prev} = $self->{line};
1642     $self->{column_prev} = $self->{column};
1643     $self->{column}++;
1644     $self->{nc}
1645     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1646     } else {
1647     $self->{set_nc}->($self);
1648     }
1649    
1650     redo A;
1651     }
1652     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1653     if ($is_space->{$self->{nc}}) {
1654    
1655     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1656    
1657     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1658     $self->{line_prev} = $self->{line};
1659     $self->{column_prev} = $self->{column};
1660     $self->{column}++;
1661     $self->{nc}
1662     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1663     } else {
1664     $self->{set_nc}->($self);
1665     }
1666    
1667     redo A;
1668     } elsif ($self->{nc} == 0x0026) { # &
1669    
1670     ## NOTE: In the spec, the tokenizer is switched to the
1671     ## "entity in attribute value state". In this implementation, the
1672     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1673     ## implementation of the "consume a character reference" algorithm.
1674     $self->{entity_add} = -1;
1675     $self->{prev_state} = $self->{state};
1676     $self->{state} = ENTITY_STATE;
1677    
1678     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1679     $self->{line_prev} = $self->{line};
1680     $self->{column_prev} = $self->{column};
1681     $self->{column}++;
1682     $self->{nc}
1683     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1684     } else {
1685     $self->{set_nc}->($self);
1686     }
1687    
1688     redo A;
1689     } elsif ($self->{nc} == 0x003E) { # >
1690     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1691    
1692     $self->{last_stag_name} = $self->{ct}->{tag_name};
1693     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1694     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1695     if ($self->{ct}->{attributes}) {
1696    
1697     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1698     } else {
1699     ## NOTE: This state should never be reached.
1700    
1701     }
1702     } else {
1703     die "$0: $self->{ct}->{type}: Unknown token type";
1704     }
1705     $self->{state} = DATA_STATE;
1706    
1707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1708     $self->{line_prev} = $self->{line};
1709     $self->{column_prev} = $self->{column};
1710     $self->{column}++;
1711     $self->{nc}
1712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1713     } else {
1714     $self->{set_nc}->($self);
1715     }
1716    
1717    
1718     return ($self->{ct}); # start tag or end tag
1719    
1720     redo A;
1721     } elsif ($self->{nc} == -1) {
1722     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1723     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1724    
1725     $self->{last_stag_name} = $self->{ct}->{tag_name};
1726     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1727     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1728     if ($self->{ct}->{attributes}) {
1729    
1730     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1731     } else {
1732     ## NOTE: This state should never be reached.
1733    
1734     }
1735     } else {
1736     die "$0: $self->{ct}->{type}: Unknown token type";
1737     }
1738     $self->{state} = DATA_STATE;
1739     ## reconsume
1740    
1741     return ($self->{ct}); # start tag or end tag
1742    
1743     redo A;
1744     } else {
1745     if ({
1746     0x0022 => 1, # "
1747     0x0027 => 1, # '
1748     0x003D => 1, # =
1749     }->{$self->{nc}}) {
1750     ## A quote or U+003D inside an unquoted value is reported, but the character is still appended to the value below.
1751     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1752     } else {
1753     ## Any other character is appended without an error.
1754     }
1755     $self->{ca}->{value} .= chr ($self->{nc});
1756     $self->{read_until}->($self->{ca}->{value},
1757     q["'=& >],
1758     length $self->{ca}->{value});
1759     ## NOTE(review): presumably read_until appends the following run of input, up to one of the listed characters, onto the value -- confirm against its definition. This state leaves on any $is_space character, yet the list names only U+0020; verify that read_until cannot run past the other space characters.
1760     ## Stay in the state
1761     ## Fetch the next input character: taken from char_buffer (advancing the column) when one is buffered, otherwise the set_nc callback is asked to supply it.
1762     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1763     $self->{line_prev} = $self->{line};
1764     $self->{column_prev} = $self->{column};
1765     $self->{column}++;
1766     $self->{nc}
1767     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1768     } else {
1769     $self->{set_nc}->($self);
1770     }
1771    
1772     redo A;
1773     }
1774     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1775     if ($is_space->{$self->{nc}}) {
1776    
1777     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1778    
1779     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1780     $self->{line_prev} = $self->{line};
1781     $self->{column_prev} = $self->{column};
1782     $self->{column}++;
1783     $self->{nc}
1784     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1785     } else {
1786     $self->{set_nc}->($self);
1787     }
1788    
1789     redo A;
1790     } elsif ($self->{nc} == 0x003E) { # >
1791     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1792    
1793     $self->{last_stag_name} = $self->{ct}->{tag_name};
1794     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1795     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1796     if ($self->{ct}->{attributes}) {
1797    
1798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1799     } else {
1800     ## NOTE: This state should never be reached.
1801    
1802     }
1803     } else {
1804     die "$0: $self->{ct}->{type}: Unknown token type";
1805     }
1806     $self->{state} = DATA_STATE;
1807    
1808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1809     $self->{line_prev} = $self->{line};
1810     $self->{column_prev} = $self->{column};
1811     $self->{column}++;
1812     $self->{nc}
1813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1814     } else {
1815     $self->{set_nc}->($self);
1816     }
1817    
1818    
1819     return ($self->{ct}); # start tag or end tag
1820    
1821     redo A;
1822     } elsif ($self->{nc} == 0x002F) { # /
1823    
1824     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1825    
1826     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1827     $self->{line_prev} = $self->{line};
1828     $self->{column_prev} = $self->{column};
1829     $self->{column}++;
1830     $self->{nc}
1831     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1832     } else {
1833     $self->{set_nc}->($self);
1834     }
1835    
1836     redo A;
1837     } elsif ($self->{nc} == -1) {
1838     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); # Input ended inside the tag: report it, then emit the tag token as it stands.
1839     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1840     ## Record the name of the start tag that is about to be emitted.
1841     $self->{last_stag_name} = $self->{ct}->{tag_name};
1842     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1843     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST (every other end tag emit site resets the content model; this one did not)
1844     if ($self->{ct}->{attributes}) {
1845     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1846     } else {
1847     ## NOTE: This state should never be reached.
1848    
1849     }
1850     } else {
1851     die "$0: $self->{ct}->{type}: Unknown token type";
1852     }
1853     $self->{state} = DATA_STATE;
1854     ## Reconsume.
1855     return ($self->{ct}); # start tag or end tag
1856     redo A;
1857     } else {
1858    
1859     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1860     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1861     ## reconsume
1862     redo A;
1863     }
1864     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1865     if ($self->{nc} == 0x003E) { # >
1866     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1867    
1868     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1869     ## TODO: Different type than slash in start tag
1870     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1871     if ($self->{ct}->{attributes}) {
1872    
1873     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1874     } else {
1875    
1876     }
1877     ## TODO: Test |<title></title/>|
1878     } else {
1879    
1880     $self->{self_closing} = 1;
1881     }
1882    
1883     $self->{state} = DATA_STATE;
1884    
1885     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1886     $self->{line_prev} = $self->{line};
1887     $self->{column_prev} = $self->{column};
1888     $self->{column}++;
1889     $self->{nc}
1890     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1891     } else {
1892     $self->{set_nc}->($self);
1893     }
1894    
1895    
1896     return ($self->{ct}); # start tag or end tag
1897    
1898     redo A;
1899     } elsif ($self->{nc} == -1) {
1900     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1901     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1902    
1903     $self->{last_stag_name} = $self->{ct}->{tag_name};
1904     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1905     if ($self->{ct}->{attributes}) {
1906    
1907     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1908     } else {
1909     ## NOTE: This state should never be reached.
1910    
1911     }
1912     } else {
1913     die "$0: $self->{ct}->{type}: Unknown token type";
1914     }
1915     $self->{state} = DATA_STATE;
1916     ## Reconsume.
1917     return ($self->{ct}); # start tag or end tag
1918     redo A;
1919     } else {
1920    
1921     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1922     ## TODO: This error type is wrong.
1923     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1924     ## Reconsume.
1925     redo A;
1926     }
1927     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1928     ## (only happens if in the PCDATA state)
1929    
1930     ## NOTE: Unlike the spec's "bogus comment state", this implementation
1931     ## consumes characters on a one-by-one basis.
1932    
1933     if ($self->{nc} == 0x003E) { # >
1934    
1935     $self->{state} = DATA_STATE;
1936    
1937     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1938     $self->{line_prev} = $self->{line};
1939     $self->{column_prev} = $self->{column};
1940     $self->{column}++;
1941     $self->{nc}
1942     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1943     } else {
1944     $self->{set_nc}->($self);
1945     }
1946    
1947    
1948     return ($self->{ct}); # comment
1949     redo A;
1950     } elsif ($self->{nc} == -1) {
1951    
1952     $self->{state} = DATA_STATE;
1953     ## reconsume
1954    
1955     return ($self->{ct}); # comment
1956     redo A;
1957     } else {
1958    
1959     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1960     $self->{read_until}->($self->{ct}->{data},
1961     q[>],
1962     length $self->{ct}->{data});
1963    
1964     ## Stay in the state.
1965    
1966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1967     $self->{line_prev} = $self->{line};
1968     $self->{column_prev} = $self->{column};
1969     $self->{column}++;
1970     $self->{nc}
1971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1972     } else {
1973     $self->{set_nc}->($self);
1974     }
1975    
1976     redo A;
1977     }
1978     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1979     ## (only happens if in the PCDATA state)
1980    
1981     if ($self->{nc} == 0x002D) { # -
1982    
1983     $self->{state} = MD_HYPHEN_STATE;
1984    
1985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1986     $self->{line_prev} = $self->{line};
1987     $self->{column_prev} = $self->{column};
1988     $self->{column}++;
1989     $self->{nc}
1990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1991     } else {
1992     $self->{set_nc}->($self);
1993     }
1994    
1995     redo A;
1996     } elsif ($self->{nc} == 0x0044 or # D
1997     $self->{nc} == 0x0064) { # d
1998     ## ASCII case-insensitive.
1999    
2000     $self->{state} = MD_DOCTYPE_STATE;
2001     $self->{s_kwd} = chr $self->{nc};
2002    
2003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2004     $self->{line_prev} = $self->{line};
2005     $self->{column_prev} = $self->{column};
2006     $self->{column}++;
2007     $self->{nc}
2008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2009     } else {
2010     $self->{set_nc}->($self);
2011     }
2012    
2013     redo A;
2014 wakaba 1.3 } elsif ((($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
2015     $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or
2016     $self->{is_xml}) and
2017 wakaba 1.1 $self->{nc} == 0x005B) { # [
2018    
2019     $self->{state} = MD_CDATA_STATE;
2020     $self->{s_kwd} = '[';
2021    
2022     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2023     $self->{line_prev} = $self->{line};
2024     $self->{column_prev} = $self->{column};
2025     $self->{column}++;
2026     $self->{nc}
2027     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2028     } else {
2029     $self->{set_nc}->($self);
2030     }
2031    
2032     redo A;
2033     } else {
2034    
2035     }
2036    
2037     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2038     line => $self->{line_prev},
2039     column => $self->{column_prev} - 1);
2040     ## Reconsume.
2041     $self->{state} = BOGUS_COMMENT_STATE;
2042     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2043     line => $self->{line_prev},
2044     column => $self->{column_prev} - 1,
2045     };
2046     redo A;
2047     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2048     if ($self->{nc} == 0x002D) { # -
2049    
2050     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2051     line => $self->{line_prev},
2052     column => $self->{column_prev} - 2,
2053     };
2054     $self->{state} = COMMENT_START_STATE;
2055    
2056     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2057     $self->{line_prev} = $self->{line};
2058     $self->{column_prev} = $self->{column};
2059     $self->{column}++;
2060     $self->{nc}
2061     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2062     } else {
2063     $self->{set_nc}->($self);
2064     }
2065    
2066     redo A;
2067     } else {
2068    
2069     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2070     line => $self->{line_prev},
2071     column => $self->{column_prev} - 2);
2072     $self->{state} = BOGUS_COMMENT_STATE;
2073     ## Reconsume.
2074     $self->{ct} = {type => COMMENT_TOKEN,
2075     data => '-',
2076     line => $self->{line_prev},
2077     column => $self->{column_prev} - 2,
2078     };
2079     redo A;
2080     }
2081     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2082     ## ASCII case-insensitive.
2083     if ($self->{nc} == [
2084     undef,
2085     0x004F, # O
2086     0x0043, # C
2087     0x0054, # T
2088     0x0059, # Y
2089     0x0050, # P
2090     ]->[length $self->{s_kwd}] or
2091     $self->{nc} == [
2092     undef,
2093     0x006F, # o
2094     0x0063, # c
2095     0x0074, # t
2096     0x0079, # y
2097     0x0070, # p
2098     ]->[length $self->{s_kwd}]) {
2099    
2100     ## Stay in the state.
2101     $self->{s_kwd} .= chr $self->{nc};
2102    
2103     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2104     $self->{line_prev} = $self->{line};
2105     $self->{column_prev} = $self->{column};
2106     $self->{column}++;
2107     $self->{nc}
2108     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2109     } else {
2110     $self->{set_nc}->($self);
2111     }
2112    
2113     redo A;
2114     } elsif ((length $self->{s_kwd}) == 6 and
2115     ($self->{nc} == 0x0045 or # E
2116     $self->{nc} == 0x0065)) { # e
2117    
2118     $self->{state} = DOCTYPE_STATE;
2119     $self->{ct} = {type => DOCTYPE_TOKEN,
2120     quirks => 1,
2121     line => $self->{line_prev},
2122     column => $self->{column_prev} - 7,
2123     };
2124    
2125     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2126     $self->{line_prev} = $self->{line};
2127     $self->{column_prev} = $self->{column};
2128     $self->{column}++;
2129     $self->{nc}
2130     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2131     } else {
2132     $self->{set_nc}->($self);
2133     }
2134    
2135     redo A;
2136     } else {
2137    
2138     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2139     line => $self->{line_prev},
2140     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2141     $self->{state} = BOGUS_COMMENT_STATE;
2142     ## Reconsume.
2143     $self->{ct} = {type => COMMENT_TOKEN,
2144     data => $self->{s_kwd},
2145     line => $self->{line_prev},
2146     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2147     };
2148     redo A;
2149     }
2150     } elsif ($self->{state} == MD_CDATA_STATE) {
2151     if ($self->{nc} == {
2152     '[' => 0x0043, # C
2153     '[C' => 0x0044, # D
2154     '[CD' => 0x0041, # A
2155     '[CDA' => 0x0054, # T
2156     '[CDAT' => 0x0041, # A
2157     }->{$self->{s_kwd}}) {
2158    
2159     ## Stay in the state.
2160     $self->{s_kwd} .= chr $self->{nc};
2161    
2162     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2163     $self->{line_prev} = $self->{line};
2164     $self->{column_prev} = $self->{column};
2165     $self->{column}++;
2166     $self->{nc}
2167     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2168     } else {
2169     $self->{set_nc}->($self);
2170     }
2171    
2172     redo A;
2173     } elsif ($self->{s_kwd} eq '[CDATA' and
2174     $self->{nc} == 0x005B) { # [
2175    
2176     $self->{ct} = {type => CHARACTER_TOKEN,
2177     data => '',
2178     line => $self->{line_prev},
2179     column => $self->{column_prev} - 7};
2180     $self->{state} = CDATA_SECTION_STATE;
2181    
2182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2183     $self->{line_prev} = $self->{line};
2184     $self->{column_prev} = $self->{column};
2185     $self->{column}++;
2186     $self->{nc}
2187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2188     } else {
2189     $self->{set_nc}->($self);
2190     }
2191    
2192     redo A;
2193     } else {
2194    
2195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2196     line => $self->{line_prev},
2197     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2198     $self->{state} = BOGUS_COMMENT_STATE;
2199     ## Reconsume.
2200     $self->{ct} = {type => COMMENT_TOKEN,
2201     data => $self->{s_kwd},
2202     line => $self->{line_prev},
2203     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2204     };
2205     redo A;
2206     }
2207     } elsif ($self->{state} == COMMENT_START_STATE) {
2208     if ($self->{nc} == 0x002D) { # -
2209    
2210     $self->{state} = COMMENT_START_DASH_STATE;
2211    
2212     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2213     $self->{line_prev} = $self->{line};
2214     $self->{column_prev} = $self->{column};
2215     $self->{column}++;
2216     $self->{nc}
2217     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2218     } else {
2219     $self->{set_nc}->($self);
2220     }
2221    
2222     redo A;
2223     } elsif ($self->{nc} == 0x003E) { # >
2224    
2225     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2226     $self->{state} = DATA_STATE;
2227    
2228     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2229     $self->{line_prev} = $self->{line};
2230     $self->{column_prev} = $self->{column};
2231     $self->{column}++;
2232     $self->{nc}
2233     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2234     } else {
2235     $self->{set_nc}->($self);
2236     }
2237    
2238    
2239     return ($self->{ct}); # comment
2240    
2241     redo A;
2242     } elsif ($self->{nc} == -1) {
2243    
2244     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2245     $self->{state} = DATA_STATE;
2246     ## reconsume
2247    
2248     return ($self->{ct}); # comment
2249    
2250     redo A;
2251     } else {
2252    
2253     $self->{ct}->{data} # comment
2254     .= chr ($self->{nc});
2255     $self->{state} = COMMENT_STATE;
2256    
2257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2258     $self->{line_prev} = $self->{line};
2259     $self->{column_prev} = $self->{column};
2260     $self->{column}++;
2261     $self->{nc}
2262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2263     } else {
2264     $self->{set_nc}->($self);
2265     }
2266    
2267     redo A;
2268     }
2269     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2270     if ($self->{nc} == 0x002D) { # -
2271    
2272     $self->{state} = COMMENT_END_STATE;
2273    
2274     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2275     $self->{line_prev} = $self->{line};
2276     $self->{column_prev} = $self->{column};
2277     $self->{column}++;
2278     $self->{nc}
2279     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2280     } else {
2281     $self->{set_nc}->($self);
2282     }
2283    
2284     redo A;
2285     } elsif ($self->{nc} == 0x003E) { # >
2286    
2287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2288     $self->{state} = DATA_STATE;
2289    
2290     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2291     $self->{line_prev} = $self->{line};
2292     $self->{column_prev} = $self->{column};
2293     $self->{column}++;
2294     $self->{nc}
2295     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2296     } else {
2297     $self->{set_nc}->($self);
2298     }
2299    
2300    
2301     return ($self->{ct}); # comment
2302    
2303     redo A;
2304     } elsif ($self->{nc} == -1) {
2305    
2306     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2307     $self->{state} = DATA_STATE;
2308     ## reconsume
2309    
2310     return ($self->{ct}); # comment
2311    
2312     redo A;
2313     } else {
2314    
2315     $self->{ct}->{data} # comment
2316     .= '-' . chr ($self->{nc});
2317     $self->{state} = COMMENT_STATE;
2318    
2319     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2320     $self->{line_prev} = $self->{line};
2321     $self->{column_prev} = $self->{column};
2322     $self->{column}++;
2323     $self->{nc}
2324     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2325     } else {
2326     $self->{set_nc}->($self);
2327     }
2328    
2329     redo A;
2330     }
2331     } elsif ($self->{state} == COMMENT_STATE) {
2332     if ($self->{nc} == 0x002D) { # -
2333    
2334     $self->{state} = COMMENT_END_DASH_STATE;
2335    
2336     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2337     $self->{line_prev} = $self->{line};
2338     $self->{column_prev} = $self->{column};
2339     $self->{column}++;
2340     $self->{nc}
2341     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2342     } else {
2343     $self->{set_nc}->($self);
2344     }
2345    
2346     redo A;
2347     } elsif ($self->{nc} == -1) {
2348    
2349     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2350     $self->{state} = DATA_STATE;
2351     ## reconsume
2352    
2353     return ($self->{ct}); # comment
2354    
2355     redo A;
2356     } else {
2357    
2358     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2359     $self->{read_until}->($self->{ct}->{data},
2360     q[-],
2361     length $self->{ct}->{data});
2362    
2363     ## Stay in the state
2364    
2365     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2366     $self->{line_prev} = $self->{line};
2367     $self->{column_prev} = $self->{column};
2368     $self->{column}++;
2369     $self->{nc}
2370     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2371     } else {
2372     $self->{set_nc}->($self);
2373     }
2374    
2375     redo A;
2376     }
2377     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2378     if ($self->{nc} == 0x002D) { # -
2379    
2380     $self->{state} = COMMENT_END_STATE;
2381    
2382     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2383     $self->{line_prev} = $self->{line};
2384     $self->{column_prev} = $self->{column};
2385     $self->{column}++;
2386     $self->{nc}
2387     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2388     } else {
2389     $self->{set_nc}->($self);
2390     }
2391    
2392     redo A;
2393     } elsif ($self->{nc} == -1) {
2394    
2395     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2396     $self->{state} = DATA_STATE;
2397     ## reconsume
2398    
2399     return ($self->{ct}); # comment
2400    
2401     redo A;
2402     } else {
2403    
2404     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2405     $self->{state} = COMMENT_STATE;
2406    
2407     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2408     $self->{line_prev} = $self->{line};
2409     $self->{column_prev} = $self->{column};
2410     $self->{column}++;
2411     $self->{nc}
2412     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2413     } else {
2414     $self->{set_nc}->($self);
2415     }
2416    
2417     redo A;
2418     }
2419     } elsif ($self->{state} == COMMENT_END_STATE) {
2420     if ($self->{nc} == 0x003E) { # >
2421    
2422     $self->{state} = DATA_STATE;
2423    
2424     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2425     $self->{line_prev} = $self->{line};
2426     $self->{column_prev} = $self->{column};
2427     $self->{column}++;
2428     $self->{nc}
2429     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2430     } else {
2431     $self->{set_nc}->($self);
2432     }
2433    
2434    
2435     return ($self->{ct}); # comment
2436    
2437     redo A;
2438     } elsif ($self->{nc} == 0x002D) { # -
2439    
2440     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2441     line => $self->{line_prev},
2442     column => $self->{column_prev});
2443     $self->{ct}->{data} .= '-'; # comment
2444     ## Stay in the state
2445    
2446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2447     $self->{line_prev} = $self->{line};
2448     $self->{column_prev} = $self->{column};
2449     $self->{column}++;
2450     $self->{nc}
2451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2452     } else {
2453     $self->{set_nc}->($self);
2454     }
2455    
2456     redo A;
2457     } elsif ($self->{nc} == -1) {
2458    
2459     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2460     $self->{state} = DATA_STATE;
2461     ## reconsume
2462    
2463     return ($self->{ct}); # comment
2464    
2465     redo A;
2466     } else {
2467    
2468     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2469     line => $self->{line_prev},
2470     column => $self->{column_prev});
2471     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2472     $self->{state} = COMMENT_STATE;
2473    
2474     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2475     $self->{line_prev} = $self->{line};
2476     $self->{column_prev} = $self->{column};
2477     $self->{column}++;
2478     $self->{nc}
2479     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2480     } else {
2481     $self->{set_nc}->($self);
2482     }
2483    
2484     redo A;
2485     }
2486     } elsif ($self->{state} == DOCTYPE_STATE) {
2487     if ($is_space->{$self->{nc}}) {
2488    
2489     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2490    
2491     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2492     $self->{line_prev} = $self->{line};
2493     $self->{column_prev} = $self->{column};
2494     $self->{column}++;
2495     $self->{nc}
2496     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2497     } else {
2498     $self->{set_nc}->($self);
2499     }
2500    
2501     redo A;
2502     } else {
2503    
2504     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2505     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2506     ## reconsume
2507     redo A;
2508     }
2509     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2510     if ($is_space->{$self->{nc}}) {
2511    
2512     ## Stay in the state
2513    
2514     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2515     $self->{line_prev} = $self->{line};
2516     $self->{column_prev} = $self->{column};
2517     $self->{column}++;
2518     $self->{nc}
2519     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2520     } else {
2521     $self->{set_nc}->($self);
2522     }
2523    
2524     redo A;
2525     } elsif ($self->{nc} == 0x003E) { # >
2526    
2527     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2528     $self->{state} = DATA_STATE;
2529    
2530     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2531     $self->{line_prev} = $self->{line};
2532     $self->{column_prev} = $self->{column};
2533     $self->{column}++;
2534     $self->{nc}
2535     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2536     } else {
2537     $self->{set_nc}->($self);
2538     }
2539    
2540    
2541     return ($self->{ct}); # DOCTYPE (quirks)
2542    
2543     redo A;
2544     } elsif ($self->{nc} == -1) {
2545    
2546     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2547     $self->{state} = DATA_STATE;
2548     ## reconsume
2549    
2550     return ($self->{ct}); # DOCTYPE (quirks)
2551    
2552     redo A;
2553     } else {
2554    
2555     $self->{ct}->{name} = chr $self->{nc};
2556     delete $self->{ct}->{quirks};
2557     $self->{state} = DOCTYPE_NAME_STATE;
2558    
2559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2560     $self->{line_prev} = $self->{line};
2561     $self->{column_prev} = $self->{column};
2562     $self->{column}++;
2563     $self->{nc}
2564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2565     } else {
2566     $self->{set_nc}->($self);
2567     }
2568    
2569     redo A;
2570     }
2571     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2572     ## ISSUE: Redundant "First," in the spec.
2573     if ($is_space->{$self->{nc}}) {
2574    
2575     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2576    
2577     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2578     $self->{line_prev} = $self->{line};
2579     $self->{column_prev} = $self->{column};
2580     $self->{column}++;
2581     $self->{nc}
2582     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2583     } else {
2584     $self->{set_nc}->($self);
2585     }
2586    
2587     redo A;
2588     } elsif ($self->{nc} == 0x003E) { # >
2589    
2590     $self->{state} = DATA_STATE;
2591    
2592     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2593     $self->{line_prev} = $self->{line};
2594     $self->{column_prev} = $self->{column};
2595     $self->{column}++;
2596     $self->{nc}
2597     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2598     } else {
2599     $self->{set_nc}->($self);
2600     }
2601    
2602    
2603     return ($self->{ct}); # DOCTYPE
2604    
2605     redo A;
2606     } elsif ($self->{nc} == -1) {
2607    
2608     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2609     $self->{state} = DATA_STATE;
2610     ## reconsume
2611    
2612     $self->{ct}->{quirks} = 1;
2613     return ($self->{ct}); # DOCTYPE
2614    
2615     redo A;
2616     } else {
2617    
2618     $self->{ct}->{name}
2619     .= chr ($self->{nc}); # DOCTYPE
2620     ## Stay in the state
2621    
2622     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2623     $self->{line_prev} = $self->{line};
2624     $self->{column_prev} = $self->{column};
2625     $self->{column}++;
2626     $self->{nc}
2627     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2628     } else {
2629     $self->{set_nc}->($self);
2630     }
2631    
2632     redo A;
2633     }
2634     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2635     if ($is_space->{$self->{nc}}) {
2636    
2637     ## Stay in the state
2638    
2639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2640     $self->{line_prev} = $self->{line};
2641     $self->{column_prev} = $self->{column};
2642     $self->{column}++;
2643     $self->{nc}
2644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2645     } else {
2646     $self->{set_nc}->($self);
2647     }
2648    
2649     redo A;
2650     } elsif ($self->{nc} == 0x003E) { # >
2651    
2652     $self->{state} = DATA_STATE;
2653    
2654     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2655     $self->{line_prev} = $self->{line};
2656     $self->{column_prev} = $self->{column};
2657     $self->{column}++;
2658     $self->{nc}
2659     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2660     } else {
2661     $self->{set_nc}->($self);
2662     }
2663    
2664    
2665     return ($self->{ct}); # DOCTYPE
2666    
2667     redo A;
2668     } elsif ($self->{nc} == -1) {
2669    
2670     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2671     $self->{state} = DATA_STATE;
2672     ## reconsume
2673    
2674     $self->{ct}->{quirks} = 1;
2675     return ($self->{ct}); # DOCTYPE
2676    
2677     redo A;
2678     } elsif ($self->{nc} == 0x0050 or # P
2679     $self->{nc} == 0x0070) { # p
2680     $self->{state} = PUBLIC_STATE;
2681     $self->{s_kwd} = chr $self->{nc};
2682    
2683     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2684     $self->{line_prev} = $self->{line};
2685     $self->{column_prev} = $self->{column};
2686     $self->{column}++;
2687     $self->{nc}
2688     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2689     } else {
2690     $self->{set_nc}->($self);
2691     }
2692    
2693     redo A;
2694     } elsif ($self->{nc} == 0x0053 or # S
2695     $self->{nc} == 0x0073) { # s
2696     $self->{state} = SYSTEM_STATE;
2697     $self->{s_kwd} = chr $self->{nc};
2698    
2699     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2700     $self->{line_prev} = $self->{line};
2701     $self->{column_prev} = $self->{column};
2702     $self->{column}++;
2703     $self->{nc}
2704     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2705     } else {
2706     $self->{set_nc}->($self);
2707     }
2708    
2709     redo A;
2710     } else {
2711    
2712     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2713     $self->{ct}->{quirks} = 1;
2714    
2715     $self->{state} = BOGUS_DOCTYPE_STATE;
2716    
2717     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2718     $self->{line_prev} = $self->{line};
2719     $self->{column_prev} = $self->{column};
2720     $self->{column}++;
2721     $self->{nc}
2722     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2723     } else {
2724     $self->{set_nc}->($self);
2725     }
2726    
2727     redo A;
2728     }
2729     } elsif ($self->{state} == PUBLIC_STATE) {
2730     ## ASCII case-insensitive
2731     if ($self->{nc} == [
2732     undef,
2733     0x0055, # U
2734     0x0042, # B
2735     0x004C, # L
2736     0x0049, # I
2737     ]->[length $self->{s_kwd}] or
2738     $self->{nc} == [
2739     undef,
2740     0x0075, # u
2741     0x0062, # b
2742     0x006C, # l
2743     0x0069, # i
2744     ]->[length $self->{s_kwd}]) {
2745    
2746     ## Stay in the state.
2747     $self->{s_kwd} .= chr $self->{nc};
2748    
2749     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2750     $self->{line_prev} = $self->{line};
2751     $self->{column_prev} = $self->{column};
2752     $self->{column}++;
2753     $self->{nc}
2754     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2755     } else {
2756     $self->{set_nc}->($self);
2757     }
2758    
2759     redo A;
2760     } elsif ((length $self->{s_kwd}) == 5 and
2761     ($self->{nc} == 0x0043 or # C
2762     $self->{nc} == 0x0063)) { # c
2763    
2764     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2765    
2766     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2767     $self->{line_prev} = $self->{line};
2768     $self->{column_prev} = $self->{column};
2769     $self->{column}++;
2770     $self->{nc}
2771     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2772     } else {
2773     $self->{set_nc}->($self);
2774     }
2775    
2776     redo A;
2777     } else {
2778    
2779     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2780     line => $self->{line_prev},
2781     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2782     $self->{ct}->{quirks} = 1;
2783    
2784     $self->{state} = BOGUS_DOCTYPE_STATE;
2785     ## Reconsume.
2786     redo A;
2787     }
2788     } elsif ($self->{state} == SYSTEM_STATE) {
2789     ## ASCII case-insensitive
2790     if ($self->{nc} == [
2791     undef,
2792     0x0059, # Y
2793     0x0053, # S
2794     0x0054, # T
2795     0x0045, # E
2796     ]->[length $self->{s_kwd}] or
2797     $self->{nc} == [
2798     undef,
2799     0x0079, # y
2800     0x0073, # s
2801     0x0074, # t
2802     0x0065, # e
2803     ]->[length $self->{s_kwd}]) {
2804    
2805     ## Stay in the state.
2806     $self->{s_kwd} .= chr $self->{nc};
2807    
2808     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2809     $self->{line_prev} = $self->{line};
2810     $self->{column_prev} = $self->{column};
2811     $self->{column}++;
2812     $self->{nc}
2813     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2814     } else {
2815     $self->{set_nc}->($self);
2816     }
2817    
2818     redo A;
2819     } elsif ((length $self->{s_kwd}) == 5 and
2820     ($self->{nc} == 0x004D or # M
2821     $self->{nc} == 0x006D)) { # m
2822    
2823     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2824    
2825     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2826     $self->{line_prev} = $self->{line};
2827     $self->{column_prev} = $self->{column};
2828     $self->{column}++;
2829     $self->{nc}
2830     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2831     } else {
2832     $self->{set_nc}->($self);
2833     }
2834    
2835     redo A;
2836     } else {
2837    
2838     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2839     line => $self->{line_prev},
2840     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2841     $self->{ct}->{quirks} = 1;
2842    
2843     $self->{state} = BOGUS_DOCTYPE_STATE;
2844     ## Reconsume.
2845     redo A;
2846     }
2847     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2848     if ($is_space->{$self->{nc}}) {
2849    
2850     ## Stay in the state
2851    
2852     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2853     $self->{line_prev} = $self->{line};
2854     $self->{column_prev} = $self->{column};
2855     $self->{column}++;
2856     $self->{nc}
2857     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2858     } else {
2859     $self->{set_nc}->($self);
2860     }
2861    
2862     redo A;
2863     } elsif ($self->{nc} eq 0x0022) { # "
2864    
2865     $self->{ct}->{pubid} = ''; # DOCTYPE
2866     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2867    
2868     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2869     $self->{line_prev} = $self->{line};
2870     $self->{column_prev} = $self->{column};
2871     $self->{column}++;
2872     $self->{nc}
2873     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2874     } else {
2875     $self->{set_nc}->($self);
2876     }
2877    
2878     redo A;
2879     } elsif ($self->{nc} eq 0x0027) { # '
2880    
2881     $self->{ct}->{pubid} = ''; # DOCTYPE
2882     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2883    
2884     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2885     $self->{line_prev} = $self->{line};
2886     $self->{column_prev} = $self->{column};
2887     $self->{column}++;
2888     $self->{nc}
2889     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2890     } else {
2891     $self->{set_nc}->($self);
2892     }
2893    
2894     redo A;
2895     } elsif ($self->{nc} eq 0x003E) { # >
2896    
2897     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2898    
2899     $self->{state} = DATA_STATE;
2900    
2901     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2902     $self->{line_prev} = $self->{line};
2903     $self->{column_prev} = $self->{column};
2904     $self->{column}++;
2905     $self->{nc}
2906     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2907     } else {
2908     $self->{set_nc}->($self);
2909     }
2910    
2911    
2912     $self->{ct}->{quirks} = 1;
2913     return ($self->{ct}); # DOCTYPE
2914    
2915     redo A;
2916     } elsif ($self->{nc} == -1) {
2917    
2918     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2919    
2920     $self->{state} = DATA_STATE;
2921     ## reconsume
2922    
2923     $self->{ct}->{quirks} = 1;
2924     return ($self->{ct}); # DOCTYPE
2925    
2926     redo A;
2927     } else {
2928    
2929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2930     $self->{ct}->{quirks} = 1;
2931    
2932     $self->{state} = BOGUS_DOCTYPE_STATE;
2933    
2934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2935     $self->{line_prev} = $self->{line};
2936     $self->{column_prev} = $self->{column};
2937     $self->{column}++;
2938     $self->{nc}
2939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2940     } else {
2941     $self->{set_nc}->($self);
2942     }
2943    
2944     redo A;
2945     }
2946     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2947     if ($self->{nc} == 0x0022) { # "
2948    
2949     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2950    
2951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2952     $self->{line_prev} = $self->{line};
2953     $self->{column_prev} = $self->{column};
2954     $self->{column}++;
2955     $self->{nc}
2956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2957     } else {
2958     $self->{set_nc}->($self);
2959     }
2960    
2961     redo A;
2962     } elsif ($self->{nc} == 0x003E) { # >
2963    
2964     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2965    
2966     $self->{state} = DATA_STATE;
2967    
2968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2969     $self->{line_prev} = $self->{line};
2970     $self->{column_prev} = $self->{column};
2971     $self->{column}++;
2972     $self->{nc}
2973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2974     } else {
2975     $self->{set_nc}->($self);
2976     }
2977    
2978    
2979     $self->{ct}->{quirks} = 1;
2980     return ($self->{ct}); # DOCTYPE
2981    
2982     redo A;
2983     } elsif ($self->{nc} == -1) {
2984    
2985     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2986    
2987     $self->{state} = DATA_STATE;
2988     ## reconsume
2989    
2990     $self->{ct}->{quirks} = 1;
2991     return ($self->{ct}); # DOCTYPE
2992    
2993     redo A;
2994     } else {
2995    
2996     $self->{ct}->{pubid} # DOCTYPE
2997     .= chr $self->{nc};
2998     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2999     length $self->{ct}->{pubid});
3000    
3001     ## Stay in the state
3002    
3003     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3004     $self->{line_prev} = $self->{line};
3005     $self->{column_prev} = $self->{column};
3006     $self->{column}++;
3007     $self->{nc}
3008     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3009     } else {
3010     $self->{set_nc}->($self);
3011     }
3012    
3013     redo A;
3014     }
3015     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
3016     if ($self->{nc} == 0x0027) { # '
3017    
3018     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
3019    
3020     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3021     $self->{line_prev} = $self->{line};
3022     $self->{column_prev} = $self->{column};
3023     $self->{column}++;
3024     $self->{nc}
3025     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3026     } else {
3027     $self->{set_nc}->($self);
3028     }
3029    
3030     redo A;
3031     } elsif ($self->{nc} == 0x003E) { # >
3032    
3033     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3034    
3035     $self->{state} = DATA_STATE;
3036    
3037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3038     $self->{line_prev} = $self->{line};
3039     $self->{column_prev} = $self->{column};
3040     $self->{column}++;
3041     $self->{nc}
3042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3043     } else {
3044     $self->{set_nc}->($self);
3045     }
3046    
3047    
3048     $self->{ct}->{quirks} = 1;
3049     return ($self->{ct}); # DOCTYPE
3050    
3051     redo A;
3052     } elsif ($self->{nc} == -1) {
3053    
3054     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3055    
3056     $self->{state} = DATA_STATE;
3057     ## reconsume
3058    
3059     $self->{ct}->{quirks} = 1;
3060     return ($self->{ct}); # DOCTYPE
3061    
3062     redo A;
3063     } else {
3064    
3065     $self->{ct}->{pubid} # DOCTYPE
3066     .= chr $self->{nc};
3067     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3068     length $self->{ct}->{pubid});
3069    
3070     ## Stay in the state
3071    
3072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3073     $self->{line_prev} = $self->{line};
3074     $self->{column_prev} = $self->{column};
3075     $self->{column}++;
3076     $self->{nc}
3077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3078     } else {
3079     $self->{set_nc}->($self);
3080     }
3081    
3082     redo A;
3083     }
3084     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3085     if ($is_space->{$self->{nc}}) {
3086    
3087     ## Stay in the state
3088    
3089     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3090     $self->{line_prev} = $self->{line};
3091     $self->{column_prev} = $self->{column};
3092     $self->{column}++;
3093     $self->{nc}
3094     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3095     } else {
3096     $self->{set_nc}->($self);
3097     }
3098    
3099     redo A;
3100     } elsif ($self->{nc} == 0x0022) { # "
3101    
3102     $self->{ct}->{sysid} = ''; # DOCTYPE
3103     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3104    
3105     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3106     $self->{line_prev} = $self->{line};
3107     $self->{column_prev} = $self->{column};
3108     $self->{column}++;
3109     $self->{nc}
3110     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3111     } else {
3112     $self->{set_nc}->($self);
3113     }
3114    
3115     redo A;
3116     } elsif ($self->{nc} == 0x0027) { # '
3117    
3118     $self->{ct}->{sysid} = ''; # DOCTYPE
3119     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3120    
3121     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3122     $self->{line_prev} = $self->{line};
3123     $self->{column_prev} = $self->{column};
3124     $self->{column}++;
3125     $self->{nc}
3126     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3127     } else {
3128     $self->{set_nc}->($self);
3129     }
3130    
3131     redo A;
3132     } elsif ($self->{nc} == 0x003E) { # >
3133    
3134     $self->{state} = DATA_STATE;
3135    
3136     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3137     $self->{line_prev} = $self->{line};
3138     $self->{column_prev} = $self->{column};
3139     $self->{column}++;
3140     $self->{nc}
3141     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3142     } else {
3143     $self->{set_nc}->($self);
3144     }
3145    
3146    
3147     return ($self->{ct}); # DOCTYPE
3148    
3149     redo A;
3150     } elsif ($self->{nc} == -1) {
3151    
3152     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3153    
3154     $self->{state} = DATA_STATE;
3155     ## reconsume
3156    
3157     $self->{ct}->{quirks} = 1;
3158     return ($self->{ct}); # DOCTYPE
3159    
3160     redo A;
3161     } else {
3162    
3163     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3164     $self->{ct}->{quirks} = 1;
3165    
3166     $self->{state} = BOGUS_DOCTYPE_STATE;
3167    
3168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3169     $self->{line_prev} = $self->{line};
3170     $self->{column_prev} = $self->{column};
3171     $self->{column}++;
3172     $self->{nc}
3173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3174     } else {
3175     $self->{set_nc}->($self);
3176     }
3177    
3178     redo A;
3179     }
3180     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3181     if ($is_space->{$self->{nc}}) {
3182    
3183     ## Stay in the state
3184    
3185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3186     $self->{line_prev} = $self->{line};
3187     $self->{column_prev} = $self->{column};
3188     $self->{column}++;
3189     $self->{nc}
3190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3191     } else {
3192     $self->{set_nc}->($self);
3193     }
3194    
3195     redo A;
3196     } elsif ($self->{nc} == 0x0022) { # "
3197    
3198     $self->{ct}->{sysid} = ''; # DOCTYPE
3199     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3200    
3201     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3202     $self->{line_prev} = $self->{line};
3203     $self->{column_prev} = $self->{column};
3204     $self->{column}++;
3205     $self->{nc}
3206     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3207     } else {
3208     $self->{set_nc}->($self);
3209     }
3210    
3211     redo A;
3212     } elsif ($self->{nc} == 0x0027) { # '
3213    
3214     $self->{ct}->{sysid} = ''; # DOCTYPE
3215     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3216    
3217     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3218     $self->{line_prev} = $self->{line};
3219     $self->{column_prev} = $self->{column};
3220     $self->{column}++;
3221     $self->{nc}
3222     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3223     } else {
3224     $self->{set_nc}->($self);
3225     }
3226    
3227     redo A;
3228     } elsif ($self->{nc} == 0x003E) { # >
3229    
3230     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3231     $self->{state} = DATA_STATE;
3232    
3233     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3234     $self->{line_prev} = $self->{line};
3235     $self->{column_prev} = $self->{column};
3236     $self->{column}++;
3237     $self->{nc}
3238     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3239     } else {
3240     $self->{set_nc}->($self);
3241     }
3242    
3243    
3244     $self->{ct}->{quirks} = 1;
3245     return ($self->{ct}); # DOCTYPE
3246    
3247     redo A;
3248     } elsif ($self->{nc} == -1) {
3249    
3250     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3251    
3252     $self->{state} = DATA_STATE;
3253     ## reconsume
3254    
3255     $self->{ct}->{quirks} = 1;
3256     return ($self->{ct}); # DOCTYPE
3257    
3258     redo A;
3259     } else {
3260    
3261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3262     $self->{ct}->{quirks} = 1;
3263    
3264     $self->{state} = BOGUS_DOCTYPE_STATE;
3265    
3266     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3267     $self->{line_prev} = $self->{line};
3268     $self->{column_prev} = $self->{column};
3269     $self->{column}++;
3270     $self->{nc}
3271     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3272     } else {
3273     $self->{set_nc}->($self);
3274     }
3275    
3276     redo A;
3277     }
3278     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3279     if ($self->{nc} == 0x0022) { # "
3280    
3281     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3282    
3283     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3284     $self->{line_prev} = $self->{line};
3285     $self->{column_prev} = $self->{column};
3286     $self->{column}++;
3287     $self->{nc}
3288     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3289     } else {
3290     $self->{set_nc}->($self);
3291     }
3292    
3293     redo A;
3294     } elsif ($self->{nc} == 0x003E) { # >
3295    
3296     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3297    
3298     $self->{state} = DATA_STATE;
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310    
3311     $self->{ct}->{quirks} = 1;
3312     return ($self->{ct}); # DOCTYPE
3313    
3314     redo A;
3315     } elsif ($self->{nc} == -1) {
3316    
3317     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3318    
3319     $self->{state} = DATA_STATE;
3320     ## reconsume
3321    
3322     $self->{ct}->{quirks} = 1;
3323     return ($self->{ct}); # DOCTYPE
3324    
3325     redo A;
3326     } else {
3327    
3328     $self->{ct}->{sysid} # DOCTYPE
3329     .= chr $self->{nc};
3330     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3331     length $self->{ct}->{sysid});
3332    
3333     ## Stay in the state
3334    
3335     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3336     $self->{line_prev} = $self->{line};
3337     $self->{column_prev} = $self->{column};
3338     $self->{column}++;
3339     $self->{nc}
3340     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3341     } else {
3342     $self->{set_nc}->($self);
3343     }
3344    
3345     redo A;
3346     }
3347     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3348     if ($self->{nc} == 0x0027) { # '
3349    
3350     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3351    
3352     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3353     $self->{line_prev} = $self->{line};
3354     $self->{column_prev} = $self->{column};
3355     $self->{column}++;
3356     $self->{nc}
3357     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3358     } else {
3359     $self->{set_nc}->($self);
3360     }
3361    
3362     redo A;
3363     } elsif ($self->{nc} == 0x003E) { # >
3364    
3365     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3366    
3367     $self->{state} = DATA_STATE;
3368    
3369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370     $self->{line_prev} = $self->{line};
3371     $self->{column_prev} = $self->{column};
3372     $self->{column}++;
3373     $self->{nc}
3374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375     } else {
3376     $self->{set_nc}->($self);
3377     }
3378    
3379    
3380     $self->{ct}->{quirks} = 1;
3381     return ($self->{ct}); # DOCTYPE
3382    
3383     redo A;
3384     } elsif ($self->{nc} == -1) {
3385    
3386     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3387    
3388     $self->{state} = DATA_STATE;
3389     ## reconsume
3390    
3391     $self->{ct}->{quirks} = 1;
3392     return ($self->{ct}); # DOCTYPE
3393    
3394     redo A;
3395     } else {
3396    
3397     $self->{ct}->{sysid} # DOCTYPE
3398     .= chr $self->{nc};
3399     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3400     length $self->{ct}->{sysid});
3401    
3402     ## Stay in the state
3403    
3404     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3405     $self->{line_prev} = $self->{line};
3406     $self->{column_prev} = $self->{column};
3407     $self->{column}++;
3408     $self->{nc}
3409     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3410     } else {
3411     $self->{set_nc}->($self);
3412     }
3413    
3414     redo A;
3415     }
3416     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3417     if ($is_space->{$self->{nc}}) {
3418    
3419     ## Stay in the state
3420    
3421     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3422     $self->{line_prev} = $self->{line};
3423     $self->{column_prev} = $self->{column};
3424     $self->{column}++;
3425     $self->{nc}
3426     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3427     } else {
3428     $self->{set_nc}->($self);
3429     }
3430    
3431     redo A;
3432     } elsif ($self->{nc} == 0x003E) { # >
3433    
3434     $self->{state} = DATA_STATE;
3435    
3436     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3437     $self->{line_prev} = $self->{line};
3438     $self->{column_prev} = $self->{column};
3439     $self->{column}++;
3440     $self->{nc}
3441     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3442     } else {
3443     $self->{set_nc}->($self);
3444     }
3445    
3446    
3447     return ($self->{ct}); # DOCTYPE
3448    
3449     redo A;
3450     } elsif ($self->{nc} == -1) {
3451    
3452     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3453     $self->{state} = DATA_STATE;
3454     ## reconsume
3455    
3456     $self->{ct}->{quirks} = 1;
3457     return ($self->{ct}); # DOCTYPE
3458    
3459     redo A;
3460     } else {
3461    
3462     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3463     #$self->{ct}->{quirks} = 1;
3464    
3465     $self->{state} = BOGUS_DOCTYPE_STATE;
3466    
3467     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3468     $self->{line_prev} = $self->{line};
3469     $self->{column_prev} = $self->{column};
3470     $self->{column}++;
3471     $self->{nc}
3472     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3473     } else {
3474     $self->{set_nc}->($self);
3475     }
3476    
3477     redo A;
3478     }
3479     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3480     if ($self->{nc} == 0x003E) { # >
3481    
3482     $self->{state} = DATA_STATE;
3483    
3484     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3485     $self->{line_prev} = $self->{line};
3486     $self->{column_prev} = $self->{column};
3487     $self->{column}++;
3488     $self->{nc}
3489     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3490     } else {
3491     $self->{set_nc}->($self);
3492     }
3493    
3494    
3495     return ($self->{ct}); # DOCTYPE
3496    
3497     redo A;
3498     } elsif ($self->{nc} == -1) {
3499    
3500     $self->{state} = DATA_STATE;
3501     ## reconsume
3502    
3503     return ($self->{ct}); # DOCTYPE
3504    
3505     redo A;
3506     } else {
3507    
3508     my $s = '';
3509     $self->{read_until}->($s, q[>], 0);
3510    
3511     ## Stay in the state
3512    
3513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3514     $self->{line_prev} = $self->{line};
3515     $self->{column_prev} = $self->{column};
3516     $self->{column}++;
3517     $self->{nc}
3518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3519     } else {
3520     $self->{set_nc}->($self);
3521     }
3522    
3523     redo A;
3524     }
3525     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3526     ## NOTE: "CDATA section state" in the spec is jointly implemented
3527     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3528     ## and |CDATA_SECTION_MSE2_STATE|.
3529    
3530     if ($self->{nc} == 0x005D) { # ]
3531    
3532     $self->{state} = CDATA_SECTION_MSE1_STATE;
3533    
3534     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3535     $self->{line_prev} = $self->{line};
3536     $self->{column_prev} = $self->{column};
3537     $self->{column}++;
3538     $self->{nc}
3539     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3540     } else {
3541     $self->{set_nc}->($self);
3542     }
3543    
3544     redo A;
3545     } elsif ($self->{nc} == -1) {
3546     $self->{state} = DATA_STATE;
3547    
3548     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3549     $self->{line_prev} = $self->{line};
3550     $self->{column_prev} = $self->{column};
3551     $self->{column}++;
3552     $self->{nc}
3553     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3554     } else {
3555     $self->{set_nc}->($self);
3556     }
3557    
3558     if (length $self->{ct}->{data}) { # character
3559    
3560     return ($self->{ct}); # character
3561     } else {
3562    
3563     ## No token to emit. $self->{ct} is discarded.
3564     }
3565     redo A;
3566     } else {
3567    
3568     $self->{ct}->{data} .= chr $self->{nc};
3569     $self->{read_until}->($self->{ct}->{data},
3570     q<]>,
3571     length $self->{ct}->{data});
3572    
3573     ## Stay in the state.
3574    
3575     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3576     $self->{line_prev} = $self->{line};
3577     $self->{column_prev} = $self->{column};
3578     $self->{column}++;
3579     $self->{nc}
3580     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3581     } else {
3582     $self->{set_nc}->($self);
3583     }
3584    
3585     redo A;
3586     }
3587    
3588     ## ISSUE: "text tokens" in spec.
3589     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3590     if ($self->{nc} == 0x005D) { # ]
3591    
3592     $self->{state} = CDATA_SECTION_MSE2_STATE;
3593    
3594     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3595     $self->{line_prev} = $self->{line};
3596     $self->{column_prev} = $self->{column};
3597     $self->{column}++;
3598     $self->{nc}
3599     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3600     } else {
3601     $self->{set_nc}->($self);
3602     }
3603    
3604     redo A;
3605     } else {
3606    
3607     $self->{ct}->{data} .= ']';
3608     $self->{state} = CDATA_SECTION_STATE;
3609     ## Reconsume.
3610     redo A;
3611     }
3612     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3613     if ($self->{nc} == 0x003E) { # >
3614     $self->{state} = DATA_STATE;
3615    
3616     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3617     $self->{line_prev} = $self->{line};
3618     $self->{column_prev} = $self->{column};
3619     $self->{column}++;
3620     $self->{nc}
3621     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3622     } else {
3623     $self->{set_nc}->($self);
3624     }
3625    
3626     if (length $self->{ct}->{data}) { # character
3627    
3628     return ($self->{ct}); # character
3629     } else {
3630    
3631     ## No token to emit. $self->{ct} is discarded.
3632     }
3633     redo A;
3634     } elsif ($self->{nc} == 0x005D) { # ]
3635     # character
3636     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3637     ## Stay in the state.
3638    
3639     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3640     $self->{line_prev} = $self->{line};
3641     $self->{column_prev} = $self->{column};
3642     $self->{column}++;
3643     $self->{nc}
3644     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3645     } else {
3646     $self->{set_nc}->($self);
3647     }
3648    
3649     redo A;
3650     } else {
3651    
3652     $self->{ct}->{data} .= ']]'; # character
3653     $self->{state} = CDATA_SECTION_STATE;
3654     ## Reconsume.
3655     redo A;
3656     }
3657     } elsif ($self->{state} == ENTITY_STATE) {
3658     if ($is_space->{$self->{nc}} or
3659     {
3660     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3661     $self->{entity_add} => 1,
3662     }->{$self->{nc}}) {
3663    
3664     ## Don't consume
3665     ## No error
3666     ## Return nothing.
3667     #
3668     } elsif ($self->{nc} == 0x0023) { # #
3669    
3670     $self->{state} = ENTITY_HASH_STATE;
3671     $self->{s_kwd} = '#';
3672    
3673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3674     $self->{line_prev} = $self->{line};
3675     $self->{column_prev} = $self->{column};
3676     $self->{column}++;
3677     $self->{nc}
3678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3679     } else {
3680     $self->{set_nc}->($self);
3681     }
3682    
3683     redo A;
3684     } elsif ((0x0041 <= $self->{nc} and
3685     $self->{nc} <= 0x005A) or # A..Z
3686     (0x0061 <= $self->{nc} and
3687     $self->{nc} <= 0x007A)) { # a..z
3688    
3689     require Whatpm::_NamedEntityList;
3690     $self->{state} = ENTITY_NAME_STATE;
3691     $self->{s_kwd} = chr $self->{nc};
3692     $self->{entity__value} = $self->{s_kwd};
3693     $self->{entity__match} = 0;
3694    
3695     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3696     $self->{line_prev} = $self->{line};
3697     $self->{column_prev} = $self->{column};
3698     $self->{column}++;
3699     $self->{nc}
3700     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3701     } else {
3702     $self->{set_nc}->($self);
3703     }
3704    
3705     redo A;
3706     } else {
3707    
3708     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3709     ## Return nothing.
3710     #
3711     }
3712    
3713     ## NOTE: No character is consumed by the "consume a character
3714     ## reference" algorithm. In other words, there is an "&" character
3715     ## that does not introduce a character reference, which would be
3716     ## appended to the parent element or the attribute value in later
3717     ## processing by the tokenizer.
3718    
3719     if ($self->{prev_state} == DATA_STATE) {
3720    
3721     $self->{state} = $self->{prev_state};
3722     ## Reconsume.
3723     return ({type => CHARACTER_TOKEN, data => '&',
3724     line => $self->{line_prev},
3725     column => $self->{column_prev},
3726     });
3727     redo A;
3728     } else {
3729    
3730     $self->{ca}->{value} .= '&';
3731     $self->{state} = $self->{prev_state};
3732     ## Reconsume.
3733     redo A;
3734     }
3735     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3736     if ($self->{nc} == 0x0078 or # x
3737     $self->{nc} == 0x0058) { # X
3738    
3739     $self->{state} = HEXREF_X_STATE;
3740     $self->{s_kwd} .= chr $self->{nc};
3741    
3742     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3743     $self->{line_prev} = $self->{line};
3744     $self->{column_prev} = $self->{column};
3745     $self->{column}++;
3746     $self->{nc}
3747     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3748     } else {
3749     $self->{set_nc}->($self);
3750     }
3751    
3752     redo A;
3753     } elsif (0x0030 <= $self->{nc} and
3754     $self->{nc} <= 0x0039) { # 0..9
3755    
3756     $self->{state} = NCR_NUM_STATE;
3757     $self->{s_kwd} = $self->{nc} - 0x0030;
3758    
3759     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3760     $self->{line_prev} = $self->{line};
3761     $self->{column_prev} = $self->{column};
3762     $self->{column}++;
3763     $self->{nc}
3764     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3765     } else {
3766     $self->{set_nc}->($self);
3767     }
3768    
3769     redo A;
3770     } else {
3771     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3772     line => $self->{line_prev},
3773     column => $self->{column_prev} - 1);
3774    
3775     ## NOTE: According to the spec algorithm, nothing is returned,
3776     ## and then "&#" is appended to the parent element or the attribute
3777     ## value in the later processing.
3778    
3779     if ($self->{prev_state} == DATA_STATE) {
3780    
3781     $self->{state} = $self->{prev_state};
3782     ## Reconsume.
3783     return ({type => CHARACTER_TOKEN,
3784     data => '&#',
3785     line => $self->{line_prev},
3786     column => $self->{column_prev} - 1,
3787     });
3788     redo A;
3789     } else {
3790    
3791     $self->{ca}->{value} .= '&#';
3792     $self->{state} = $self->{prev_state};
3793     ## Reconsume.
3794     redo A;
3795     }
3796     }
3797     } elsif ($self->{state} == NCR_NUM_STATE) {
3798     if (0x0030 <= $self->{nc} and
3799     $self->{nc} <= 0x0039) { # 0..9
3800    
3801     $self->{s_kwd} *= 10;
3802     $self->{s_kwd} += $self->{nc} - 0x0030;
3803    
3804     ## Stay in the state.
3805    
3806     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3807     $self->{line_prev} = $self->{line};
3808     $self->{column_prev} = $self->{column};
3809     $self->{column}++;
3810     $self->{nc}
3811     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3812     } else {
3813     $self->{set_nc}->($self);
3814     }
3815    
3816     redo A;
3817     } elsif ($self->{nc} == 0x003B) { # ;
3818    
3819    
3820     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3821     $self->{line_prev} = $self->{line};
3822     $self->{column_prev} = $self->{column};
3823     $self->{column}++;
3824     $self->{nc}
3825     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3826     } else {
3827     $self->{set_nc}->($self);
3828     }
3829    
3830     #
3831     } else {
3832    
3833     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3834     ## Reconsume.
3835     #
3836     }
3837    
3838     my $code = $self->{s_kwd};
3839     my $l = $self->{line_prev};
3840     my $c = $self->{column_prev};
3841     if ($charref_map->{$code}) {
3842    
3843     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3844     text => (sprintf 'U+%04X', $code),
3845     line => $l, column => $c);
3846     $code = $charref_map->{$code};
3847     } elsif ($code > 0x10FFFF) {
3848    
3849     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3850     text => (sprintf 'U-%08X', $code),
3851     line => $l, column => $c);
3852     $code = 0xFFFD;
3853     }
3854    
3855     if ($self->{prev_state} == DATA_STATE) {
3856    
3857     $self->{state} = $self->{prev_state};
3858     ## Reconsume.
3859     return ({type => CHARACTER_TOKEN, data => chr $code,
3860     line => $l, column => $c,
3861     });
3862     redo A;
3863     } else {
3864    
3865     $self->{ca}->{value} .= chr $code;
3866     $self->{ca}->{has_reference} = 1;
3867     $self->{state} = $self->{prev_state};
3868     ## Reconsume.
3869     redo A;
3870     }
3871     } elsif ($self->{state} == HEXREF_X_STATE) {
3872     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3873     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3874     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3875     # 0..9, A..F, a..f
3876    
3877     $self->{state} = HEXREF_HEX_STATE;
3878     $self->{s_kwd} = 0;
3879     ## Reconsume.
3880     redo A;
3881     } else {
3882     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3883     line => $self->{line_prev},
3884     column => $self->{column_prev} - 2);
3885    
3886     ## NOTE: According to the spec algorithm, nothing is returned,
3887     ## and then "&#" followed by "X" or "x" is appended to the parent
3888     ## element or the attribute value in the later processing.
3889    
3890     if ($self->{prev_state} == DATA_STATE) {
3891    
3892     $self->{state} = $self->{prev_state};
3893     ## Reconsume.
3894     return ({type => CHARACTER_TOKEN,
3895     data => '&' . $self->{s_kwd},
3896     line => $self->{line_prev},
3897     column => $self->{column_prev} - length $self->{s_kwd},
3898     });
3899     redo A;
3900     } else {
3901    
3902     $self->{ca}->{value} .= '&' . $self->{s_kwd};
3903     $self->{state} = $self->{prev_state};
3904     ## Reconsume.
3905     redo A;
3906     }
3907     }
3908     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3909     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3910     # 0..9
3911    
3912     $self->{s_kwd} *= 0x10;
3913     $self->{s_kwd} += $self->{nc} - 0x0030;
3914     ## Stay in the state.
3915    
3916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3917     $self->{line_prev} = $self->{line};
3918     $self->{column_prev} = $self->{column};
3919     $self->{column}++;
3920     $self->{nc}
3921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3922     } else {
3923     $self->{set_nc}->($self);
3924     }
3925    
3926     redo A;
3927     } elsif (0x0061 <= $self->{nc} and
3928     $self->{nc} <= 0x0066) { # a..f
3929    
3930     $self->{s_kwd} *= 0x10;
3931     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3932     ## Stay in the state.
3933    
3934     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3935     $self->{line_prev} = $self->{line};
3936     $self->{column_prev} = $self->{column};
3937     $self->{column}++;
3938     $self->{nc}
3939     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3940     } else {
3941     $self->{set_nc}->($self);
3942     }
3943    
3944     redo A;
3945     } elsif (0x0041 <= $self->{nc} and
3946     $self->{nc} <= 0x0046) { # A..F
3947    
3948     $self->{s_kwd} *= 0x10;
3949     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3950     ## Stay in the state.
3951    
3952     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3953     $self->{line_prev} = $self->{line};
3954     $self->{column_prev} = $self->{column};
3955     $self->{column}++;
3956     $self->{nc}
3957     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3958     } else {
3959     $self->{set_nc}->($self);
3960     }
3961    
3962     redo A;
3963     } elsif ($self->{nc} == 0x003B) { # ;
3964    
3965    
3966     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3967     $self->{line_prev} = $self->{line};
3968     $self->{column_prev} = $self->{column};
3969     $self->{column}++;
3970     $self->{nc}
3971     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3972     } else {
3973     $self->{set_nc}->($self);
3974     }
3975    
3976     #
3977     } else {
3978    
3979     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3980     line => $self->{line},
3981     column => $self->{column});
3982     ## Reconsume.
3983     #
3984     }
3985    
3986     my $code = $self->{s_kwd};
3987     my $l = $self->{line_prev};
3988     my $c = $self->{column_prev};
3989     if ($charref_map->{$code}) {
3990    
3991     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3992     text => (sprintf 'U+%04X', $code),
3993     line => $l, column => $c);
3994     $code = $charref_map->{$code};
3995     } elsif ($code > 0x10FFFF) {
3996    
3997     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3998     text => (sprintf 'U-%08X', $code),
3999     line => $l, column => $c);
4000     $code = 0xFFFD;
4001     }
4002    
4003     if ($self->{prev_state} == DATA_STATE) {
4004    
4005     $self->{state} = $self->{prev_state};
4006     ## Reconsume.
4007     return ({type => CHARACTER_TOKEN, data => chr $code,
4008     line => $l, column => $c,
4009     });
4010     redo A;
4011     } else {
4012    
4013     $self->{ca}->{value} .= chr $code;
4014     $self->{ca}->{has_reference} = 1;
4015     $self->{state} = $self->{prev_state};
4016     ## Reconsume.
4017     redo A;
4018     }
4019     } elsif ($self->{state} == ENTITY_NAME_STATE) {
4020     if (length $self->{s_kwd} < 30 and
4021     ## NOTE: Some number greater than the maximum length of entity name
4022     ((0x0041 <= $self->{nc} and # A
4023     $self->{nc} <= 0x005A) or # Z
4024     (0x0061 <= $self->{nc} and # a
4025     $self->{nc} <= 0x007A) or # z
4026     (0x0030 <= $self->{nc} and # 0
4027     $self->{nc} <= 0x0039) or # 9
4028     $self->{nc} == 0x003B)) { # ;
4029     our $EntityChar;
4030     $self->{s_kwd} .= chr $self->{nc};
4031     if (defined $EntityChar->{$self->{s_kwd}}) {
4032     if ($self->{nc} == 0x003B) { # ;
4033    
4034     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4035     $self->{entity__match} = 1;
4036    
4037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4038     $self->{line_prev} = $self->{line};
4039     $self->{column_prev} = $self->{column};
4040     $self->{column}++;
4041     $self->{nc}
4042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4043     } else {
4044     $self->{set_nc}->($self);
4045     }
4046    
4047     #
4048     } else {
4049    
4050     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4051     $self->{entity__match} = -1;
4052     ## Stay in the state.
4053    
4054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4055     $self->{line_prev} = $self->{line};
4056     $self->{column_prev} = $self->{column};
4057     $self->{column}++;
4058     $self->{nc}
4059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4060     } else {
4061     $self->{set_nc}->($self);
4062     }
4063    
4064     redo A;
4065     }
4066     } else {
4067    
4068     $self->{entity__value} .= chr $self->{nc};
4069     $self->{entity__match} *= 2;
4070     ## Stay in the state.
4071    
4072     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4073     $self->{line_prev} = $self->{line};
4074     $self->{column_prev} = $self->{column};
4075     $self->{column}++;
4076     $self->{nc}
4077     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4078     } else {
4079     $self->{set_nc}->($self);
4080     }
4081    
4082     redo A;
4083     }
4084     }
4085    
4086     my $data;
4087     my $has_ref;
4088     if ($self->{entity__match} > 0) {
4089    
4090     $data = $self->{entity__value};
4091     $has_ref = 1;
4092     #
4093     } elsif ($self->{entity__match} < 0) {
4094     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4095     if ($self->{prev_state} != DATA_STATE and # in attribute
4096     $self->{entity__match} < -1) {
4097    
4098     $data = '&' . $self->{s_kwd};
4099     #
4100     } else {
4101    
4102     $data = $self->{entity__value};
4103     $has_ref = 1;
4104     #
4105     }
4106     } else {
4107    
4108     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4109     line => $self->{line_prev},
4110     column => $self->{column_prev} - length $self->{s_kwd});
4111     $data = '&' . $self->{s_kwd};
4112     #
4113     }
4114    
4115     ## NOTE: In these cases, according to the spec algorithm, when a
4116     ## character reference is found, it is consumed and a character token
4117     ## is returned; otherwise, nothing is consumed and nothing is returned.
4118     ## In this implementation, everything that has been examined by the
4119     ## tokenizer is appended at this stage, as a string, to the parent
4120     ## element or the attribute value: the literal text when there is no
4121     ## character reference, or the entity-replaced text otherwise.  This is
4122     ## done since any characters that would not be consumed are appended in
4123     ## the data state or in an appropriate attribute value state anyway.
4124    
4125     if ($self->{prev_state} == DATA_STATE) {
4126    
4127     $self->{state} = $self->{prev_state};
4128     ## Reconsume.
4129     return ({type => CHARACTER_TOKEN,
4130     data => $data,
4131     line => $self->{line_prev},
4132     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4133     });
4134     redo A;
4135     } else {
4136    
4137     $self->{ca}->{value} .= $data;
4138     $self->{ca}->{has_reference} = 1 if $has_ref;
4139     $self->{state} = $self->{prev_state};
4140     ## Reconsume.
4141     redo A;
4142     }
4143     } else {
4144     die "$0: $self->{state}: Unknown state";
4145     }
4146     } # A
4147    
4148     die "$0: _get_next_token: unexpected case";
4149     } # _get_next_token
4150    
4151     1;
4152 wakaba 1.3 ## $Date: 2008/10/14 04:32:49 $

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24