/[suikacvs]/markup/html/whatpm/Whatpm/HTML/Tokenizer.pm
Suika

Contents of /markup/html/whatpm/Whatpm/HTML/Tokenizer.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (hide annotations) (download)
Tue Oct 14 02:27:58 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
++ whatpm/Whatpm/ChangeLog	14 Oct 2008 02:26:16 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Makefile: New rule to make HTML/Tokenizer.pm is added.

	* HTML.pm.src: Tokenizer part moved to another file.

++ whatpm/Whatpm/HTML/ChangeLog	14 Oct 2008 02:25:46 -0000
2008-10-14  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New file.

1 wakaba 1.1 package Whatpm::HTML::Tokenizer;
2     use strict;
3     our $VERSION=do{my @r=(q$Revision: 1.207 $=~/\d+/g);sprintf "%d."."%02d" x $#r,@r};
4    
5     package Whatpm::HTML;
6    
7     ## Content model flags
8    
9     sub CM_ENTITY () { 0b001 } # & markup in data
10     sub CM_LIMITED_MARKUP () { 0b010 } # < markup in data (limited)
11     sub CM_FULL_MARKUP () { 0b100 } # < markup in data (any)
12    
13     sub PLAINTEXT_CONTENT_MODEL () { 0 }
14     sub CDATA_CONTENT_MODEL () { CM_LIMITED_MARKUP }
15     sub RCDATA_CONTENT_MODEL () { CM_ENTITY | CM_LIMITED_MARKUP }
16     sub PCDATA_CONTENT_MODEL () { CM_ENTITY | CM_FULL_MARKUP }
17    
18     ## Tokenizer states
19    
20     sub DATA_STATE () { 0 }
21     #sub ENTITY_DATA_STATE () { 1 }
22     sub TAG_OPEN_STATE () { 2 }
23     sub CLOSE_TAG_OPEN_STATE () { 3 }
24     sub TAG_NAME_STATE () { 4 }
25     sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
26     sub ATTRIBUTE_NAME_STATE () { 6 }
27     sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
28     sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
29     sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
30     sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
31     sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
32     #sub ENTITY_IN_ATTRIBUTE_VALUE_STATE () { 12 }
33     sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
34     sub COMMENT_START_STATE () { 14 }
35     sub COMMENT_START_DASH_STATE () { 15 }
36     sub COMMENT_STATE () { 16 }
37     sub COMMENT_END_STATE () { 17 }
38     sub COMMENT_END_DASH_STATE () { 18 }
39     sub BOGUS_COMMENT_STATE () { 19 }
40     sub DOCTYPE_STATE () { 20 }
41     sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
42     sub DOCTYPE_NAME_STATE () { 22 }
43     sub AFTER_DOCTYPE_NAME_STATE () { 23 }
44     sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
45     sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
46     sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
47     sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
48     sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
49     sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
50     sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
51     sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
52     sub BOGUS_DOCTYPE_STATE () { 32 }
53     sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
54     sub SELF_CLOSING_START_TAG_STATE () { 34 }
55     sub CDATA_SECTION_STATE () { 35 }
56     sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
57     sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
58     sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
59     sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
60     sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
61     sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
62     sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
63     sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
64     ## NOTE: "Entity data state", "entity in attribute value state", and
65     ## "consume a character reference" algorithm are jointly implemented
66     ## using the following six states:
67     sub ENTITY_STATE () { 44 }
68     sub ENTITY_HASH_STATE () { 45 }
69     sub NCR_NUM_STATE () { 46 }
70     sub HEXREF_X_STATE () { 47 }
71     sub HEXREF_HEX_STATE () { 48 }
72     sub ENTITY_NAME_STATE () { 49 }
73     sub PCDATA_STATE () { 50 } # "data state" in the spec
74    
75     ## Token types
76    
77     sub DOCTYPE_TOKEN () { 1 }
78     sub COMMENT_TOKEN () { 2 }
79     sub START_TAG_TOKEN () { 3 }
80     sub END_TAG_TOKEN () { 4 }
81     sub END_OF_FILE_TOKEN () { 5 }
82     sub CHARACTER_TOKEN () { 6 }
83    
84     ## Tree constructor state constants (see Whatpm::HTML for the full
85     ## list and descriptions)
86    
87     sub IN_FOREIGN_CONTENT_IM () { 0b100000000000 }
88     sub FOREIGN_EL () { 0b1_00000000000 }
89    
90     ## Character reference mappings
91    
92     my $charref_map = {
93     0x0D => 0x000A,
94     0x80 => 0x20AC,
95     0x81 => 0xFFFD,
96     0x82 => 0x201A,
97     0x83 => 0x0192,
98     0x84 => 0x201E,
99     0x85 => 0x2026,
100     0x86 => 0x2020,
101     0x87 => 0x2021,
102     0x88 => 0x02C6,
103     0x89 => 0x2030,
104     0x8A => 0x0160,
105     0x8B => 0x2039,
106     0x8C => 0x0152,
107     0x8D => 0xFFFD,
108     0x8E => 0x017D,
109     0x8F => 0xFFFD,
110     0x90 => 0xFFFD,
111     0x91 => 0x2018,
112     0x92 => 0x2019,
113     0x93 => 0x201C,
114     0x94 => 0x201D,
115     0x95 => 0x2022,
116     0x96 => 0x2013,
117     0x97 => 0x2014,
118     0x98 => 0x02DC,
119     0x99 => 0x2122,
120     0x9A => 0x0161,
121     0x9B => 0x203A,
122     0x9C => 0x0153,
123     0x9D => 0xFFFD,
124     0x9E => 0x017E,
125     0x9F => 0x0178,
126     }; # $charref_map
127     $charref_map->{$_} = 0xFFFD
128     for 0x0000..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
129     0xD800..0xDFFF, 0xFDD0..0xFDDF, ## ISSUE: 0xFDEF
130     0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
131     0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
132     0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
133     0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
134     0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;
135    
136     ## Implementations MUST act as if state machine in the spec
137    
138     sub _initialize_tokenizer ($) {
139     my $self = shift;
140    
141     ## NOTE: Fields set by |new| constructor:
142     #$self->{level}
143     #$self->{set_nc}
144     #$self->{parse_error}
145    
146     $self->{state} = DATA_STATE; # MUST
147     #$self->{s_kwd}; # state keyword - initialized when used
148     #$self->{entity__value}; # initialized when used
149     #$self->{entity__match}; # initialized when used
150     $self->{content_model} = PCDATA_CONTENT_MODEL; # be
151     undef $self->{ct}; # current token
152     undef $self->{ca}; # current attribute
153     undef $self->{last_stag_name}; # last emitted start tag name
154     #$self->{prev_state}; # initialized when used
155     delete $self->{self_closing};
156     $self->{char_buffer} = '';
157     $self->{char_buffer_pos} = 0;
158     $self->{nc} = -1; # next input character
159     #$self->{next_nc}
160    
161     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
162     $self->{line_prev} = $self->{line};
163     $self->{column_prev} = $self->{column};
164     $self->{column}++;
165     $self->{nc}
166     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
167     } else {
168     $self->{set_nc}->($self);
169     }
170    
171     $self->{token} = [];
172     # $self->{escape}
173     } # _initialize_tokenizer
174    
175     ## A token has:
176     ## ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
177     ## CHARACTER_TOKEN, or END_OF_FILE_TOKEN
178     ## ->{name} (DOCTYPE_TOKEN)
179     ## ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
180     ## ->{pubid} (DOCTYPE_TOKEN)
181     ## ->{sysid} (DOCTYPE_TOKEN)
182     ## ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
183     ## ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
184     ## ->{name}
185     ## ->{value}
186     ## ->{has_reference} == 1 or 0
187     ## ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN)
188     ## NOTE: The "self-closing flag" is hold as |$self->{self_closing}|.
189     ## |->{self_closing}| is used to save the value of |$self->{self_closing}|
190     ## while the token is pushed back to the stack.
191    
192     ## Emitted token MUST immediately be handled by the tree construction state.
193    
194     ## Before each step, UA MAY check to see if either one of the scripts in
195     ## "list of scripts that will execute as soon as possible" or the first
196     ## script in the "list of scripts that will execute asynchronously",
197     ## has completed loading. If one has, then it MUST be executed
198     ## and removed from the list.
199    
200     ## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
201     ## (This requirement was dropped from HTML5 spec, unfortunately.)
202    
203     my $is_space = {
204     0x0009 => 1, # CHARACTER TABULATION (HT)
205     0x000A => 1, # LINE FEED (LF)
206     #0x000B => 0, # LINE TABULATION (VT)
207     0x000C => 1, # FORM FEED (FF)
208     #0x000D => 1, # CARRIAGE RETURN (CR)
209     0x0020 => 1, # SPACE (SP)
210     };
211    
212     sub _get_next_token ($) {
213     my $self = shift;
214    
215     if ($self->{self_closing}) {
216     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
217     ## NOTE: The |self_closing| flag is only set by start tag token.
218     ## In addition, when a start tag token is emitted, it is always set to
219     ## |ct|.
220     delete $self->{self_closing};
221     }
222    
223     if (@{$self->{token}}) {
224     $self->{self_closing} = $self->{token}->[0]->{self_closing};
225     return shift @{$self->{token}};
226     }
227    
228     A: {
229     if ($self->{state} == PCDATA_STATE) {
230     ## NOTE: Same as |DATA_STATE|, but only for |PCDATA| content model.
231    
232     if ($self->{nc} == 0x0026) { # &
233    
234     ## NOTE: In the spec, the tokenizer is switched to the
235     ## "entity data state". In this implementation, the tokenizer
236     ## is switched to the |ENTITY_STATE|, which is an implementation
237     ## of the "consume a character reference" algorithm.
238     $self->{entity_add} = -1;
239     $self->{prev_state} = DATA_STATE;
240     $self->{state} = ENTITY_STATE;
241    
242     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
243     $self->{line_prev} = $self->{line};
244     $self->{column_prev} = $self->{column};
245     $self->{column}++;
246     $self->{nc}
247     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
248     } else {
249     $self->{set_nc}->($self);
250     }
251    
252     redo A;
253     } elsif ($self->{nc} == 0x003C) { # <
254    
255     $self->{state} = TAG_OPEN_STATE;
256    
257     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
258     $self->{line_prev} = $self->{line};
259     $self->{column_prev} = $self->{column};
260     $self->{column}++;
261     $self->{nc}
262     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
263     } else {
264     $self->{set_nc}->($self);
265     }
266    
267     redo A;
268     } elsif ($self->{nc} == -1) {
269    
270     return ({type => END_OF_FILE_TOKEN,
271     line => $self->{line}, column => $self->{column}});
272     last A; ## TODO: ok?
273     } else {
274    
275     #
276     }
277    
278     # Anything else
279     my $token = {type => CHARACTER_TOKEN,
280     data => chr $self->{nc},
281     line => $self->{line}, column => $self->{column},
282     };
283     $self->{read_until}->($token->{data}, q[<&], length $token->{data});
284    
285     ## Stay in the state.
286    
287     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
288     $self->{line_prev} = $self->{line};
289     $self->{column_prev} = $self->{column};
290     $self->{column}++;
291     $self->{nc}
292     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
293     } else {
294     $self->{set_nc}->($self);
295     }
296    
297     return ($token);
298     redo A;
299     } elsif ($self->{state} == DATA_STATE) {
300     $self->{s_kwd} = '' unless defined $self->{s_kwd};
301     if ($self->{nc} == 0x0026) { # &
302     $self->{s_kwd} = '';
303     if ($self->{content_model} & CM_ENTITY and # PCDATA | RCDATA
304     not $self->{escape}) {
305    
306     ## NOTE: In the spec, the tokenizer is switched to the
307     ## "entity data state". In this implementation, the tokenizer
308     ## is switched to the |ENTITY_STATE|, which is an implementation
309     ## of the "consume a character reference" algorithm.
310     $self->{entity_add} = -1;
311     $self->{prev_state} = DATA_STATE;
312     $self->{state} = ENTITY_STATE;
313    
314     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
315     $self->{line_prev} = $self->{line};
316     $self->{column_prev} = $self->{column};
317     $self->{column}++;
318     $self->{nc}
319     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
320     } else {
321     $self->{set_nc}->($self);
322     }
323    
324     redo A;
325     } else {
326    
327     #
328     }
329     } elsif ($self->{nc} == 0x002D) { # -
330     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
331     $self->{s_kwd} .= '-';
332    
333     if ($self->{s_kwd} eq '<!--') {
334    
335     $self->{escape} = 1; # unless $self->{escape};
336     $self->{s_kwd} = '--';
337     #
338     } elsif ($self->{s_kwd} eq '---') {
339    
340     $self->{s_kwd} = '--';
341     #
342     } else {
343    
344     #
345     }
346     }
347    
348     #
349     } elsif ($self->{nc} == 0x0021) { # !
350     if (length $self->{s_kwd}) {
351    
352     $self->{s_kwd} .= '!';
353     #
354     } else {
355    
356     #$self->{s_kwd} = '';
357     #
358     }
359     #
360     } elsif ($self->{nc} == 0x003C) { # <
361     if ($self->{content_model} & CM_FULL_MARKUP or # PCDATA
362     (($self->{content_model} & CM_LIMITED_MARKUP) and # CDATA | RCDATA
363     not $self->{escape})) {
364    
365     $self->{state} = TAG_OPEN_STATE;
366    
367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
368     $self->{line_prev} = $self->{line};
369     $self->{column_prev} = $self->{column};
370     $self->{column}++;
371     $self->{nc}
372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
373     } else {
374     $self->{set_nc}->($self);
375     }
376    
377     redo A;
378     } else {
379    
380     $self->{s_kwd} = '';
381     #
382     }
383     } elsif ($self->{nc} == 0x003E) { # >
384     if ($self->{escape} and
385     ($self->{content_model} & CM_LIMITED_MARKUP)) { # RCDATA | CDATA
386     if ($self->{s_kwd} eq '--') {
387    
388     delete $self->{escape};
389     } else {
390    
391     }
392     } else {
393    
394     }
395    
396     $self->{s_kwd} = '';
397     #
398     } elsif ($self->{nc} == -1) {
399    
400     $self->{s_kwd} = '';
401     return ({type => END_OF_FILE_TOKEN,
402     line => $self->{line}, column => $self->{column}});
403     last A; ## TODO: ok?
404     } else {
405    
406     $self->{s_kwd} = '';
407     #
408     }
409    
410     # Anything else
411     my $token = {type => CHARACTER_TOKEN,
412     data => chr $self->{nc},
413     line => $self->{line}, column => $self->{column},
414     };
415     if ($self->{read_until}->($token->{data}, q[-!<>&],
416     length $token->{data})) {
417     $self->{s_kwd} = '';
418     }
419    
420     ## Stay in the data state.
421     if ($self->{content_model} == PCDATA_CONTENT_MODEL) {
422    
423     $self->{state} = PCDATA_STATE;
424     } else {
425    
426     ## Stay in the state.
427     }
428    
429     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
430     $self->{line_prev} = $self->{line};
431     $self->{column_prev} = $self->{column};
432     $self->{column}++;
433     $self->{nc}
434     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
435     } else {
436     $self->{set_nc}->($self);
437     }
438    
439     return ($token);
440     redo A;
441     } elsif ($self->{state} == TAG_OPEN_STATE) {
442     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
443     if ($self->{nc} == 0x002F) { # /
444    
445    
446     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
447     $self->{line_prev} = $self->{line};
448     $self->{column_prev} = $self->{column};
449     $self->{column}++;
450     $self->{nc}
451     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
452     } else {
453     $self->{set_nc}->($self);
454     }
455    
456     $self->{state} = CLOSE_TAG_OPEN_STATE;
457     redo A;
458     } elsif ($self->{nc} == 0x0021) { # !
459    
460     $self->{s_kwd} = '<' unless $self->{escape};
461     #
462     } else {
463    
464     #
465     }
466    
467     ## reconsume
468     $self->{state} = DATA_STATE;
469     return ({type => CHARACTER_TOKEN, data => '<',
470     line => $self->{line_prev},
471     column => $self->{column_prev},
472     });
473     redo A;
474     } elsif ($self->{content_model} & CM_FULL_MARKUP) { # PCDATA
475     if ($self->{nc} == 0x0021) { # !
476    
477     $self->{state} = MARKUP_DECLARATION_OPEN_STATE;
478    
479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
480     $self->{line_prev} = $self->{line};
481     $self->{column_prev} = $self->{column};
482     $self->{column}++;
483     $self->{nc}
484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
485     } else {
486     $self->{set_nc}->($self);
487     }
488    
489     redo A;
490     } elsif ($self->{nc} == 0x002F) { # /
491    
492     $self->{state} = CLOSE_TAG_OPEN_STATE;
493    
494     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
495     $self->{line_prev} = $self->{line};
496     $self->{column_prev} = $self->{column};
497     $self->{column}++;
498     $self->{nc}
499     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
500     } else {
501     $self->{set_nc}->($self);
502     }
503    
504     redo A;
505     } elsif (0x0041 <= $self->{nc} and
506     $self->{nc} <= 0x005A) { # A..Z
507    
508     $self->{ct}
509     = {type => START_TAG_TOKEN,
510     tag_name => chr ($self->{nc} + 0x0020),
511     line => $self->{line_prev},
512     column => $self->{column_prev}};
513     $self->{state} = TAG_NAME_STATE;
514    
515     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
516     $self->{line_prev} = $self->{line};
517     $self->{column_prev} = $self->{column};
518     $self->{column}++;
519     $self->{nc}
520     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
521     } else {
522     $self->{set_nc}->($self);
523     }
524    
525     redo A;
526     } elsif (0x0061 <= $self->{nc} and
527     $self->{nc} <= 0x007A) { # a..z
528    
529     $self->{ct} = {type => START_TAG_TOKEN,
530     tag_name => chr ($self->{nc}),
531     line => $self->{line_prev},
532     column => $self->{column_prev}};
533     $self->{state} = TAG_NAME_STATE;
534    
535     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
536     $self->{line_prev} = $self->{line};
537     $self->{column_prev} = $self->{column};
538     $self->{column}++;
539     $self->{nc}
540     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
541     } else {
542     $self->{set_nc}->($self);
543     }
544    
545     redo A;
546     } elsif ($self->{nc} == 0x003E) { # >
547    
548     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty start tag',
549     line => $self->{line_prev},
550     column => $self->{column_prev});
551     $self->{state} = DATA_STATE;
552    
553     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
554     $self->{line_prev} = $self->{line};
555     $self->{column_prev} = $self->{column};
556     $self->{column}++;
557     $self->{nc}
558     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
559     } else {
560     $self->{set_nc}->($self);
561     }
562    
563    
564     return ({type => CHARACTER_TOKEN, data => '<>',
565     line => $self->{line_prev},
566     column => $self->{column_prev},
567     });
568    
569     redo A;
570     } elsif ($self->{nc} == 0x003F) { # ?
571    
572     $self->{parse_error}->(level => $self->{level}->{must}, type => 'pio',
573     line => $self->{line_prev},
574     column => $self->{column_prev});
575     $self->{state} = BOGUS_COMMENT_STATE;
576     $self->{ct} = {type => COMMENT_TOKEN, data => '',
577     line => $self->{line_prev},
578     column => $self->{column_prev},
579     };
580     ## $self->{nc} is intentionally left as is
581     redo A;
582     } else {
583    
584     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago',
585     line => $self->{line_prev},
586     column => $self->{column_prev});
587     $self->{state} = DATA_STATE;
588     ## reconsume
589    
590     return ({type => CHARACTER_TOKEN, data => '<',
591     line => $self->{line_prev},
592     column => $self->{column_prev},
593     });
594    
595     redo A;
596     }
597     } else {
598     die "$0: $self->{content_model} in tag open";
599     }
600     } elsif ($self->{state} == CLOSE_TAG_OPEN_STATE) {
601     ## NOTE: The "close tag open state" in the spec is implemented as
602     ## |CLOSE_TAG_OPEN_STATE| and |CDATA_RCDATA_CLOSE_TAG_STATE|.
603    
604     my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</"
605     if ($self->{content_model} & CM_LIMITED_MARKUP) { # RCDATA | CDATA
606     if (defined $self->{last_stag_name}) {
607     $self->{state} = CDATA_RCDATA_CLOSE_TAG_STATE;
608     $self->{s_kwd} = '';
609     ## Reconsume.
610     redo A;
611     } else {
612     ## No start tag token has ever been emitted
613     ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>.
614    
615     $self->{state} = DATA_STATE;
616     ## Reconsume.
617     return ({type => CHARACTER_TOKEN, data => '</',
618     line => $l, column => $c,
619     });
620     redo A;
621     }
622     }
623    
624     if (0x0041 <= $self->{nc} and
625     $self->{nc} <= 0x005A) { # A..Z
626    
627     $self->{ct}
628     = {type => END_TAG_TOKEN,
629     tag_name => chr ($self->{nc} + 0x0020),
630     line => $l, column => $c};
631     $self->{state} = TAG_NAME_STATE;
632    
633     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
634     $self->{line_prev} = $self->{line};
635     $self->{column_prev} = $self->{column};
636     $self->{column}++;
637     $self->{nc}
638     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
639     } else {
640     $self->{set_nc}->($self);
641     }
642    
643     redo A;
644     } elsif (0x0061 <= $self->{nc} and
645     $self->{nc} <= 0x007A) { # a..z
646    
647     $self->{ct} = {type => END_TAG_TOKEN,
648     tag_name => chr ($self->{nc}),
649     line => $l, column => $c};
650     $self->{state} = TAG_NAME_STATE;
651    
652     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
653     $self->{line_prev} = $self->{line};
654     $self->{column_prev} = $self->{column};
655     $self->{column}++;
656     $self->{nc}
657     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
658     } else {
659     $self->{set_nc}->($self);
660     }
661    
662     redo A;
663     } elsif ($self->{nc} == 0x003E) { # >
664    
665     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty end tag',
666     line => $self->{line_prev}, ## "<" in "</>"
667     column => $self->{column_prev} - 1);
668     $self->{state} = DATA_STATE;
669    
670     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
671     $self->{line_prev} = $self->{line};
672     $self->{column_prev} = $self->{column};
673     $self->{column}++;
674     $self->{nc}
675     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
676     } else {
677     $self->{set_nc}->($self);
678     }
679    
680     redo A;
681     } elsif ($self->{nc} == -1) {
682    
683     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare etago');
684     $self->{state} = DATA_STATE;
685     # reconsume
686    
687     return ({type => CHARACTER_TOKEN, data => '</',
688     line => $l, column => $c,
689     });
690    
691     redo A;
692     } else {
693    
694     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus end tag');
695     $self->{state} = BOGUS_COMMENT_STATE;
696     $self->{ct} = {type => COMMENT_TOKEN, data => '',
697     line => $self->{line_prev}, # "<" of "</"
698     column => $self->{column_prev} - 1,
699     };
700     ## NOTE: $self->{nc} is intentionally left as is.
701     ## Although the "anything else" case of the spec not explicitly
702     ## states that the next input character is to be reconsumed,
703     ## it will be included to the |data| of the comment token
704     ## generated from the bogus end tag, as defined in the
705     ## "bogus comment state" entry.
706     redo A;
707     }
708     } elsif ($self->{state} == CDATA_RCDATA_CLOSE_TAG_STATE) {
709     my $ch = substr $self->{last_stag_name}, length $self->{s_kwd}, 1;
710     if (length $ch) {
711     my $CH = $ch;
712     $ch =~ tr/a-z/A-Z/;
713     my $nch = chr $self->{nc};
714     if ($nch eq $ch or $nch eq $CH) {
715    
716     ## Stay in the state.
717     $self->{s_kwd} .= $nch;
718    
719     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
720     $self->{line_prev} = $self->{line};
721     $self->{column_prev} = $self->{column};
722     $self->{column}++;
723     $self->{nc}
724     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
725     } else {
726     $self->{set_nc}->($self);
727     }
728    
729     redo A;
730     } else {
731    
732     $self->{state} = DATA_STATE;
733     ## Reconsume.
734     return ({type => CHARACTER_TOKEN,
735     data => '</' . $self->{s_kwd},
736     line => $self->{line_prev},
737     column => $self->{column_prev} - 1 - length $self->{s_kwd},
738     });
739     redo A;
740     }
741     } else { # after "<{tag-name}"
742     unless ($is_space->{$self->{nc}} or
743     {
744     0x003E => 1, # >
745     0x002F => 1, # /
746     -1 => 1, # EOF
747     }->{$self->{nc}}) {
748    
749     ## Reconsume.
750     $self->{state} = DATA_STATE;
751     return ({type => CHARACTER_TOKEN,
752     data => '</' . $self->{s_kwd},
753     line => $self->{line_prev},
754     column => $self->{column_prev} - 1 - length $self->{s_kwd},
755     });
756     redo A;
757     } else {
758    
759     $self->{ct}
760     = {type => END_TAG_TOKEN,
761     tag_name => $self->{last_stag_name},
762     line => $self->{line_prev},
763     column => $self->{column_prev} - 1 - length $self->{s_kwd}};
764     $self->{state} = TAG_NAME_STATE;
765     ## Reconsume.
766     redo A;
767     }
768     }
769     } elsif ($self->{state} == TAG_NAME_STATE) {
770     if ($is_space->{$self->{nc}}) {
771    
772     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
773    
774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
775     $self->{line_prev} = $self->{line};
776     $self->{column_prev} = $self->{column};
777     $self->{column}++;
778     $self->{nc}
779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
780     } else {
781     $self->{set_nc}->($self);
782     }
783    
784     redo A;
785     } elsif ($self->{nc} == 0x003E) { # >
786     if ($self->{ct}->{type} == START_TAG_TOKEN) {
787    
788     $self->{last_stag_name} = $self->{ct}->{tag_name};
789     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
790     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
791     #if ($self->{ct}->{attributes}) {
792     # ## NOTE: This should never be reached.
793     # !!! cp (36);
794     # !!! parse-error (type => 'end tag attribute');
795     #} else {
796    
797     #}
798     } else {
799     die "$0: $self->{ct}->{type}: Unknown token type";
800     }
801     $self->{state} = DATA_STATE;
802    
803     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
804     $self->{line_prev} = $self->{line};
805     $self->{column_prev} = $self->{column};
806     $self->{column}++;
807     $self->{nc}
808     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
809     } else {
810     $self->{set_nc}->($self);
811     }
812    
813    
814     return ($self->{ct}); # start tag or end tag
815    
816     redo A;
817     } elsif (0x0041 <= $self->{nc} and
818     $self->{nc} <= 0x005A) { # A..Z
819    
820     $self->{ct}->{tag_name} .= chr ($self->{nc} + 0x0020);
821     # start tag or end tag
822     ## Stay in this state
823    
824     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
825     $self->{line_prev} = $self->{line};
826     $self->{column_prev} = $self->{column};
827     $self->{column}++;
828     $self->{nc}
829     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
830     } else {
831     $self->{set_nc}->($self);
832     }
833    
834     redo A;
835     } elsif ($self->{nc} == -1) {
836     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
837     if ($self->{ct}->{type} == START_TAG_TOKEN) {
838    
839     $self->{last_stag_name} = $self->{ct}->{tag_name};
840     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
841     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
842     #if ($self->{ct}->{attributes}) {
843     # ## NOTE: This state should never be reached.
844     # !!! cp (40);
845     # !!! parse-error (type => 'end tag attribute');
846     #} else {
847    
848     #}
849     } else {
850     die "$0: $self->{ct}->{type}: Unknown token type";
851     }
852     $self->{state} = DATA_STATE;
853     # reconsume
854    
855     return ($self->{ct}); # start tag or end tag
856    
857     redo A;
858     } elsif ($self->{nc} == 0x002F) { # /
859    
860     $self->{state} = SELF_CLOSING_START_TAG_STATE;
861    
862     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
863     $self->{line_prev} = $self->{line};
864     $self->{column_prev} = $self->{column};
865     $self->{column}++;
866     $self->{nc}
867     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
868     } else {
869     $self->{set_nc}->($self);
870     }
871    
872     redo A;
873     } else {
874    
875     $self->{ct}->{tag_name} .= chr $self->{nc};
876     # start tag or end tag
877     ## Stay in the state
878    
879     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
880     $self->{line_prev} = $self->{line};
881     $self->{column_prev} = $self->{column};
882     $self->{column}++;
883     $self->{nc}
884     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
885     } else {
886     $self->{set_nc}->($self);
887     }
888    
889     redo A;
890     }
891     } elsif ($self->{state} == BEFORE_ATTRIBUTE_NAME_STATE) {
892     if ($is_space->{$self->{nc}}) {
893    
894     ## Stay in the state
895    
896     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
897     $self->{line_prev} = $self->{line};
898     $self->{column_prev} = $self->{column};
899     $self->{column}++;
900     $self->{nc}
901     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
902     } else {
903     $self->{set_nc}->($self);
904     }
905    
906     redo A;
907     } elsif ($self->{nc} == 0x003E) { # >
908     if ($self->{ct}->{type} == START_TAG_TOKEN) {
909    
910     $self->{last_stag_name} = $self->{ct}->{tag_name};
911     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
912     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
913     if ($self->{ct}->{attributes}) {
914    
915     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
916     } else {
917    
918     }
919     } else {
920     die "$0: $self->{ct}->{type}: Unknown token type";
921     }
922     $self->{state} = DATA_STATE;
923    
924     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
925     $self->{line_prev} = $self->{line};
926     $self->{column_prev} = $self->{column};
927     $self->{column}++;
928     $self->{nc}
929     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
930     } else {
931     $self->{set_nc}->($self);
932     }
933    
934    
935     return ($self->{ct}); # start tag or end tag
936    
937     redo A;
938     } elsif (0x0041 <= $self->{nc} and
939     $self->{nc} <= 0x005A) { # A..Z
940    
941     $self->{ca}
942     = {name => chr ($self->{nc} + 0x0020),
943     value => '',
944     line => $self->{line}, column => $self->{column}};
945     $self->{state} = ATTRIBUTE_NAME_STATE;
946    
947     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
948     $self->{line_prev} = $self->{line};
949     $self->{column_prev} = $self->{column};
950     $self->{column}++;
951     $self->{nc}
952     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
953     } else {
954     $self->{set_nc}->($self);
955     }
956    
957     redo A;
958     } elsif ($self->{nc} == 0x002F) { # /
959    
960     $self->{state} = SELF_CLOSING_START_TAG_STATE;
961    
962     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
963     $self->{line_prev} = $self->{line};
964     $self->{column_prev} = $self->{column};
965     $self->{column}++;
966     $self->{nc}
967     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
968     } else {
969     $self->{set_nc}->($self);
970     }
971    
972     redo A;
973     } elsif ($self->{nc} == -1) {
974     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
975     if ($self->{ct}->{type} == START_TAG_TOKEN) {
976    
977     $self->{last_stag_name} = $self->{ct}->{tag_name};
978     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
979     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
980     if ($self->{ct}->{attributes}) {
981    
982     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
983     } else {
984    
985     }
986     } else {
987     die "$0: $self->{ct}->{type}: Unknown token type";
988     }
989     $self->{state} = DATA_STATE;
990     # reconsume
991    
992     return ($self->{ct}); # start tag or end tag
993    
994     redo A;
995     } else {
996     if ({
997     0x0022 => 1, # "
998     0x0027 => 1, # '
999     0x003D => 1, # =
1000     }->{$self->{nc}}) {
1001    
1002     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1003     } else {
1004    
1005     }
1006     $self->{ca}
1007     = {name => chr ($self->{nc}),
1008     value => '',
1009     line => $self->{line}, column => $self->{column}};
1010     $self->{state} = ATTRIBUTE_NAME_STATE;
1011    
1012     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1013     $self->{line_prev} = $self->{line};
1014     $self->{column_prev} = $self->{column};
1015     $self->{column}++;
1016     $self->{nc}
1017     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1018     } else {
1019     $self->{set_nc}->($self);
1020     }
1021    
1022     redo A;
1023     }
1024     } elsif ($self->{state} == ATTRIBUTE_NAME_STATE) {
1025     my $before_leave = sub {
1026     if (exists $self->{ct}->{attributes} # start tag or end tag
1027     ->{$self->{ca}->{name}}) { # MUST
1028    
1029     $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column});
1030     ## Discard $self->{ca} # MUST
1031     } else {
1032    
1033     $self->{ct}->{attributes}->{$self->{ca}->{name}}
1034     = $self->{ca};
1035     }
1036     }; # $before_leave
1037    
1038     if ($is_space->{$self->{nc}}) {
1039    
1040     $before_leave->();
1041     $self->{state} = AFTER_ATTRIBUTE_NAME_STATE;
1042    
1043     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1044     $self->{line_prev} = $self->{line};
1045     $self->{column_prev} = $self->{column};
1046     $self->{column}++;
1047     $self->{nc}
1048     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1049     } else {
1050     $self->{set_nc}->($self);
1051     }
1052    
1053     redo A;
1054     } elsif ($self->{nc} == 0x003D) { # =
1055    
1056     $before_leave->();
1057     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1058    
1059     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1060     $self->{line_prev} = $self->{line};
1061     $self->{column_prev} = $self->{column};
1062     $self->{column}++;
1063     $self->{nc}
1064     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1065     } else {
1066     $self->{set_nc}->($self);
1067     }
1068    
1069     redo A;
1070     } elsif ($self->{nc} == 0x003E) { # >
1071     $before_leave->();
1072     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1073    
1074     $self->{last_stag_name} = $self->{ct}->{tag_name};
1075     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1076    
1077     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1078     if ($self->{ct}->{attributes}) {
1079     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1080     }
1081     } else {
1082     die "$0: $self->{ct}->{type}: Unknown token type";
1083     }
1084     $self->{state} = DATA_STATE;
1085    
1086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1087     $self->{line_prev} = $self->{line};
1088     $self->{column_prev} = $self->{column};
1089     $self->{column}++;
1090     $self->{nc}
1091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1092     } else {
1093     $self->{set_nc}->($self);
1094     }
1095    
1096    
1097     return ($self->{ct}); # start tag or end tag
1098    
1099     redo A;
1100     } elsif (0x0041 <= $self->{nc} and
1101     $self->{nc} <= 0x005A) { # A..Z
1102    
1103     $self->{ca}->{name} .= chr ($self->{nc} + 0x0020);
1104     ## Stay in the state
1105    
1106     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1107     $self->{line_prev} = $self->{line};
1108     $self->{column_prev} = $self->{column};
1109     $self->{column}++;
1110     $self->{nc}
1111     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1112     } else {
1113     $self->{set_nc}->($self);
1114     }
1115    
1116     redo A;
1117     } elsif ($self->{nc} == 0x002F) { # /
1118    
1119     $before_leave->();
1120     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1121    
1122     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1123     $self->{line_prev} = $self->{line};
1124     $self->{column_prev} = $self->{column};
1125     $self->{column}++;
1126     $self->{nc}
1127     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1128     } else {
1129     $self->{set_nc}->($self);
1130     }
1131    
1132     redo A;
1133     } elsif ($self->{nc} == -1) {
1134     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1135     $before_leave->();
1136     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1137    
1138     $self->{last_stag_name} = $self->{ct}->{tag_name};
1139     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1140     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1141     if ($self->{ct}->{attributes}) {
1142    
1143     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1144     } else {
1145     ## NOTE: This state should never be reached.
1146    
1147     }
1148     } else {
1149     die "$0: $self->{ct}->{type}: Unknown token type";
1150     }
1151     $self->{state} = DATA_STATE;
1152     # reconsume
1153    
1154     return ($self->{ct}); # start tag or end tag
1155    
1156     redo A;
1157     } else {
1158     if ($self->{nc} == 0x0022 or # "
1159     $self->{nc} == 0x0027) { # '
1160    
1161     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1162     } else {
1163    
1164     }
1165     $self->{ca}->{name} .= chr ($self->{nc});
1166     ## Stay in the state
1167    
1168     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1169     $self->{line_prev} = $self->{line};
1170     $self->{column_prev} = $self->{column};
1171     $self->{column}++;
1172     $self->{nc}
1173     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1174     } else {
1175     $self->{set_nc}->($self);
1176     }
1177    
1178     redo A;
1179     }
1180     } elsif ($self->{state} == AFTER_ATTRIBUTE_NAME_STATE) {
1181     if ($is_space->{$self->{nc}}) {
1182    
1183     ## Stay in the state
1184    
1185     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1186     $self->{line_prev} = $self->{line};
1187     $self->{column_prev} = $self->{column};
1188     $self->{column}++;
1189     $self->{nc}
1190     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1191     } else {
1192     $self->{set_nc}->($self);
1193     }
1194    
1195     redo A;
1196     } elsif ($self->{nc} == 0x003D) { # =
1197    
1198     $self->{state} = BEFORE_ATTRIBUTE_VALUE_STATE;
1199    
1200     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1201     $self->{line_prev} = $self->{line};
1202     $self->{column_prev} = $self->{column};
1203     $self->{column}++;
1204     $self->{nc}
1205     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1206     } else {
1207     $self->{set_nc}->($self);
1208     }
1209    
1210     redo A;
1211     } elsif ($self->{nc} == 0x003E) { # >
1212     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1213    
1214     $self->{last_stag_name} = $self->{ct}->{tag_name};
1215     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1216     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1217     if ($self->{ct}->{attributes}) {
1218    
1219     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1220     } else {
1221     ## NOTE: This state should never be reached.
1222    
1223     }
1224     } else {
1225     die "$0: $self->{ct}->{type}: Unknown token type";
1226     }
1227     $self->{state} = DATA_STATE;
1228    
1229     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1230     $self->{line_prev} = $self->{line};
1231     $self->{column_prev} = $self->{column};
1232     $self->{column}++;
1233     $self->{nc}
1234     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1235     } else {
1236     $self->{set_nc}->($self);
1237     }
1238    
1239    
1240     return ($self->{ct}); # start tag or end tag
1241    
1242     redo A;
1243     } elsif (0x0041 <= $self->{nc} and
1244     $self->{nc} <= 0x005A) { # A..Z
1245    
1246     $self->{ca}
1247     = {name => chr ($self->{nc} + 0x0020),
1248     value => '',
1249     line => $self->{line}, column => $self->{column}};
1250     $self->{state} = ATTRIBUTE_NAME_STATE;
1251    
1252     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1253     $self->{line_prev} = $self->{line};
1254     $self->{column_prev} = $self->{column};
1255     $self->{column}++;
1256     $self->{nc}
1257     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1258     } else {
1259     $self->{set_nc}->($self);
1260     }
1261    
1262     redo A;
1263     } elsif ($self->{nc} == 0x002F) { # /
1264    
1265     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1266    
1267     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1268     $self->{line_prev} = $self->{line};
1269     $self->{column_prev} = $self->{column};
1270     $self->{column}++;
1271     $self->{nc}
1272     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1273     } else {
1274     $self->{set_nc}->($self);
1275     }
1276    
1277     redo A;
1278     } elsif ($self->{nc} == -1) {
1279     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1280     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1281    
1282     $self->{last_stag_name} = $self->{ct}->{tag_name};
1283     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1284     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1285     if ($self->{ct}->{attributes}) {
1286    
1287     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1288     } else {
1289     ## NOTE: This state should never be reached.
1290    
1291     }
1292     } else {
1293     die "$0: $self->{ct}->{type}: Unknown token type";
1294     }
1295     $self->{state} = DATA_STATE;
1296     # reconsume
1297    
1298     return ($self->{ct}); # start tag or end tag
1299    
1300     redo A;
1301     } else {
1302     if ($self->{nc} == 0x0022 or # "
1303     $self->{nc} == 0x0027) { # '
1304    
1305     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute name');
1306     } else {
1307    
1308     }
1309     $self->{ca}
1310     = {name => chr ($self->{nc}),
1311     value => '',
1312     line => $self->{line}, column => $self->{column}};
1313     $self->{state} = ATTRIBUTE_NAME_STATE;
1314    
1315     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1316     $self->{line_prev} = $self->{line};
1317     $self->{column_prev} = $self->{column};
1318     $self->{column}++;
1319     $self->{nc}
1320     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1321     } else {
1322     $self->{set_nc}->($self);
1323     }
1324    
1325     redo A;
1326     }
1327     } elsif ($self->{state} == BEFORE_ATTRIBUTE_VALUE_STATE) {
1328     if ($is_space->{$self->{nc}}) {
1329    
1330     ## Stay in the state
1331    
1332     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1333     $self->{line_prev} = $self->{line};
1334     $self->{column_prev} = $self->{column};
1335     $self->{column}++;
1336     $self->{nc}
1337     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1338     } else {
1339     $self->{set_nc}->($self);
1340     }
1341    
1342     redo A;
1343     } elsif ($self->{nc} == 0x0022) { # "
1344    
1345     $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
1346    
1347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1348     $self->{line_prev} = $self->{line};
1349     $self->{column_prev} = $self->{column};
1350     $self->{column}++;
1351     $self->{nc}
1352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1353     } else {
1354     $self->{set_nc}->($self);
1355     }
1356    
1357     redo A;
1358     } elsif ($self->{nc} == 0x0026) { # &
1359    
1360     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1361     ## reconsume
1362     redo A;
1363     } elsif ($self->{nc} == 0x0027) { # '
1364    
1365     $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
1366    
1367     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1368     $self->{line_prev} = $self->{line};
1369     $self->{column_prev} = $self->{column};
1370     $self->{column}++;
1371     $self->{nc}
1372     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1373     } else {
1374     $self->{set_nc}->($self);
1375     }
1376    
1377     redo A;
1378     } elsif ($self->{nc} == 0x003E) { # >
1379     $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty unquoted attribute value');
1380     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1381    
1382     $self->{last_stag_name} = $self->{ct}->{tag_name};
1383     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1384     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1385     if ($self->{ct}->{attributes}) {
1386    
1387     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1388     } else {
1389     ## NOTE: This state should never be reached.
1390    
1391     }
1392     } else {
1393     die "$0: $self->{ct}->{type}: Unknown token type";
1394     }
1395     $self->{state} = DATA_STATE;
1396    
1397     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1398     $self->{line_prev} = $self->{line};
1399     $self->{column_prev} = $self->{column};
1400     $self->{column}++;
1401     $self->{nc}
1402     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1403     } else {
1404     $self->{set_nc}->($self);
1405     }
1406    
1407    
1408     return ($self->{ct}); # start tag or end tag
1409    
1410     redo A;
1411     } elsif ($self->{nc} == -1) {
1412     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1413     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1414    
1415     $self->{last_stag_name} = $self->{ct}->{tag_name};
1416     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1417     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1418     if ($self->{ct}->{attributes}) {
1419    
1420     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1421     } else {
1422     ## NOTE: This state should never be reached.
1423    
1424     }
1425     } else {
1426     die "$0: $self->{ct}->{type}: Unknown token type";
1427     }
1428     $self->{state} = DATA_STATE;
1429     ## reconsume
1430    
1431     return ($self->{ct}); # start tag or end tag
1432    
1433     redo A;
1434     } else {
1435     if ($self->{nc} == 0x003D) { # =
1436    
1437     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1438     } else {
1439    
1440     }
1441     $self->{ca}->{value} .= chr ($self->{nc});
1442     $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE;
1443    
1444     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1445     $self->{line_prev} = $self->{line};
1446     $self->{column_prev} = $self->{column};
1447     $self->{column}++;
1448     $self->{nc}
1449     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1450     } else {
1451     $self->{set_nc}->($self);
1452     }
1453    
1454     redo A;
1455     }
1456     } elsif ($self->{state} == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) {
1457     if ($self->{nc} == 0x0022) { # "
1458    
1459     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1460    
1461     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1462     $self->{line_prev} = $self->{line};
1463     $self->{column_prev} = $self->{column};
1464     $self->{column}++;
1465     $self->{nc}
1466     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1467     } else {
1468     $self->{set_nc}->($self);
1469     }
1470    
1471     redo A;
1472     } elsif ($self->{nc} == 0x0026) { # &
1473    
1474     ## NOTE: In the spec, the tokenizer is switched to the
1475     ## "entity in attribute value state". In this implementation, the
1476     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1477     ## implementation of the "consume a character reference" algorithm.
1478     $self->{prev_state} = $self->{state};
1479     $self->{entity_add} = 0x0022; # "
1480     $self->{state} = ENTITY_STATE;
1481    
1482     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1483     $self->{line_prev} = $self->{line};
1484     $self->{column_prev} = $self->{column};
1485     $self->{column}++;
1486     $self->{nc}
1487     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1488     } else {
1489     $self->{set_nc}->($self);
1490     }
1491    
1492     redo A;
1493     } elsif ($self->{nc} == -1) {
1494     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1495     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1496    
1497     $self->{last_stag_name} = $self->{ct}->{tag_name};
1498     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1499     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1500     if ($self->{ct}->{attributes}) {
1501    
1502     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1503     } else {
1504     ## NOTE: This state should never be reached.
1505    
1506     }
1507     } else {
1508     die "$0: $self->{ct}->{type}: Unknown token type";
1509     }
1510     $self->{state} = DATA_STATE;
1511     ## reconsume
1512    
1513     return ($self->{ct}); # start tag or end tag
1514    
1515     redo A;
1516     } else {
1517    
1518     $self->{ca}->{value} .= chr ($self->{nc});
1519     $self->{read_until}->($self->{ca}->{value},
1520     q["&],
1521     length $self->{ca}->{value});
1522    
1523     ## Stay in the state
1524    
1525     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1526     $self->{line_prev} = $self->{line};
1527     $self->{column_prev} = $self->{column};
1528     $self->{column}++;
1529     $self->{nc}
1530     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1531     } else {
1532     $self->{set_nc}->($self);
1533     }
1534    
1535     redo A;
1536     }
1537     } elsif ($self->{state} == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) {
1538     if ($self->{nc} == 0x0027) { # '
1539    
1540     $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
1541    
1542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1543     $self->{line_prev} = $self->{line};
1544     $self->{column_prev} = $self->{column};
1545     $self->{column}++;
1546     $self->{nc}
1547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1548     } else {
1549     $self->{set_nc}->($self);
1550     }
1551    
1552     redo A;
1553     } elsif ($self->{nc} == 0x0026) { # &
1554    
1555     ## NOTE: In the spec, the tokenizer is switched to the
1556     ## "entity in attribute value state". In this implementation, the
1557     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1558     ## implementation of the "consume a character reference" algorithm.
1559     $self->{entity_add} = 0x0027; # '
1560     $self->{prev_state} = $self->{state};
1561     $self->{state} = ENTITY_STATE;
1562    
1563     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1564     $self->{line_prev} = $self->{line};
1565     $self->{column_prev} = $self->{column};
1566     $self->{column}++;
1567     $self->{nc}
1568     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1569     } else {
1570     $self->{set_nc}->($self);
1571     }
1572    
1573     redo A;
1574     } elsif ($self->{nc} == -1) {
1575     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value');
1576     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1577    
1578     $self->{last_stag_name} = $self->{ct}->{tag_name};
1579     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1580     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1581     if ($self->{ct}->{attributes}) {
1582    
1583     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1584     } else {
1585     ## NOTE: This state should never be reached.
1586    
1587     }
1588     } else {
1589     die "$0: $self->{ct}->{type}: Unknown token type";
1590     }
1591     $self->{state} = DATA_STATE;
1592     ## reconsume
1593    
1594     return ($self->{ct}); # start tag or end tag
1595    
1596     redo A;
1597     } else {
1598    
1599     $self->{ca}->{value} .= chr ($self->{nc});
1600     $self->{read_until}->($self->{ca}->{value},
1601     q['&],
1602     length $self->{ca}->{value});
1603    
1604     ## Stay in the state
1605    
1606     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1607     $self->{line_prev} = $self->{line};
1608     $self->{column_prev} = $self->{column};
1609     $self->{column}++;
1610     $self->{nc}
1611     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1612     } else {
1613     $self->{set_nc}->($self);
1614     }
1615    
1616     redo A;
1617     }
1618     } elsif ($self->{state} == ATTRIBUTE_VALUE_UNQUOTED_STATE) {
1619     if ($is_space->{$self->{nc}}) {
1620    
1621     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1622    
1623     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1624     $self->{line_prev} = $self->{line};
1625     $self->{column_prev} = $self->{column};
1626     $self->{column}++;
1627     $self->{nc}
1628     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1629     } else {
1630     $self->{set_nc}->($self);
1631     }
1632    
1633     redo A;
1634     } elsif ($self->{nc} == 0x0026) { # &
1635    
1636     ## NOTE: In the spec, the tokenizer is switched to the
1637     ## "entity in attribute value state". In this implementation, the
1638     ## tokenizer is switched to the |ENTITY_STATE|, which is an
1639     ## implementation of the "consume a character reference" algorithm.
1640     $self->{entity_add} = -1;
1641     $self->{prev_state} = $self->{state};
1642     $self->{state} = ENTITY_STATE;
1643    
1644     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1645     $self->{line_prev} = $self->{line};
1646     $self->{column_prev} = $self->{column};
1647     $self->{column}++;
1648     $self->{nc}
1649     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1650     } else {
1651     $self->{set_nc}->($self);
1652     }
1653    
1654     redo A;
1655     } elsif ($self->{nc} == 0x003E) { # >
1656     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1657    
1658     $self->{last_stag_name} = $self->{ct}->{tag_name};
1659     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1660     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1661     if ($self->{ct}->{attributes}) {
1662    
1663     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1664     } else {
1665     ## NOTE: This state should never be reached.
1666    
1667     }
1668     } else {
1669     die "$0: $self->{ct}->{type}: Unknown token type";
1670     }
1671     $self->{state} = DATA_STATE;
1672    
1673     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1674     $self->{line_prev} = $self->{line};
1675     $self->{column_prev} = $self->{column};
1676     $self->{column}++;
1677     $self->{nc}
1678     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1679     } else {
1680     $self->{set_nc}->($self);
1681     }
1682    
1683    
1684     return ($self->{ct}); # start tag or end tag
1685    
1686     redo A;
1687     } elsif ($self->{nc} == -1) {
1688     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1689     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1690    
1691     $self->{last_stag_name} = $self->{ct}->{tag_name};
1692     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1693     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1694     if ($self->{ct}->{attributes}) {
1695    
1696     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1697     } else {
1698     ## NOTE: This state should never be reached.
1699    
1700     }
1701     } else {
1702     die "$0: $self->{ct}->{type}: Unknown token type";
1703     }
1704     $self->{state} = DATA_STATE;
1705     ## reconsume
1706    
1707     return ($self->{ct}); # start tag or end tag
1708    
1709     redo A;
1710     } else {
1711     if ({
1712     0x0022 => 1, # "
1713     0x0027 => 1, # '
1714     0x003D => 1, # =
1715     }->{$self->{nc}}) {
1716    
1717     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value');
1718     } else {
1719    
1720     }
1721     $self->{ca}->{value} .= chr ($self->{nc});
1722     $self->{read_until}->($self->{ca}->{value},
1723     q["'=& >],
1724     length $self->{ca}->{value});
1725    
1726     ## Stay in the state
1727    
1728     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1729     $self->{line_prev} = $self->{line};
1730     $self->{column_prev} = $self->{column};
1731     $self->{column}++;
1732     $self->{nc}
1733     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1734     } else {
1735     $self->{set_nc}->($self);
1736     }
1737    
1738     redo A;
1739     }
1740     } elsif ($self->{state} == AFTER_ATTRIBUTE_VALUE_QUOTED_STATE) {
1741     if ($is_space->{$self->{nc}}) {
1742    
1743     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1744    
1745     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1746     $self->{line_prev} = $self->{line};
1747     $self->{column_prev} = $self->{column};
1748     $self->{column}++;
1749     $self->{nc}
1750     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1751     } else {
1752     $self->{set_nc}->($self);
1753     }
1754    
1755     redo A;
1756     } elsif ($self->{nc} == 0x003E) { # >
1757     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1758    
1759     $self->{last_stag_name} = $self->{ct}->{tag_name};
1760     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1761     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1762     if ($self->{ct}->{attributes}) {
1763    
1764     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1765     } else {
1766     ## NOTE: This state should never be reached.
1767    
1768     }
1769     } else {
1770     die "$0: $self->{ct}->{type}: Unknown token type";
1771     }
1772     $self->{state} = DATA_STATE;
1773    
1774     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1775     $self->{line_prev} = $self->{line};
1776     $self->{column_prev} = $self->{column};
1777     $self->{column}++;
1778     $self->{nc}
1779     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1780     } else {
1781     $self->{set_nc}->($self);
1782     }
1783    
1784    
1785     return ($self->{ct}); # start tag or end tag
1786    
1787     redo A;
1788     } elsif ($self->{nc} == 0x002F) { # /
1789    
1790     $self->{state} = SELF_CLOSING_START_TAG_STATE;
1791    
1792     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1793     $self->{line_prev} = $self->{line};
1794     $self->{column_prev} = $self->{column};
1795     $self->{column}++;
1796     $self->{nc}
1797     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1798     } else {
1799     $self->{set_nc}->($self);
1800     }
1801    
1802     redo A;
1803     } elsif ($self->{nc} == -1) {
1804     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1805     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1806    
1807     $self->{last_stag_name} = $self->{ct}->{tag_name};
1808     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1809     if ($self->{ct}->{attributes}) {
1810    
1811     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1812     } else {
1813     ## NOTE: This state should never be reached.
1814    
1815     }
1816     } else {
1817     die "$0: $self->{ct}->{type}: Unknown token type";
1818     }
1819     $self->{state} = DATA_STATE;
1820     ## Reconsume.
1821     return ($self->{ct}); # start tag or end tag
1822     redo A;
1823     } else {
1824    
1825     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space between attributes');
1826     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1827     ## reconsume
1828     redo A;
1829     }
1830     } elsif ($self->{state} == SELF_CLOSING_START_TAG_STATE) {
1831     if ($self->{nc} == 0x003E) { # >
1832     if ($self->{ct}->{type} == END_TAG_TOKEN) {
1833    
1834     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct});
1835     ## TODO: Different type than slash in start tag
1836     $self->{content_model} = PCDATA_CONTENT_MODEL; # MUST
1837     if ($self->{ct}->{attributes}) {
1838    
1839     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1840     } else {
1841    
1842     }
1843     ## TODO: Test |<title></title/>|
1844     } else {
1845    
1846     $self->{self_closing} = 1;
1847     }
1848    
1849     $self->{state} = DATA_STATE;
1850    
1851     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1852     $self->{line_prev} = $self->{line};
1853     $self->{column_prev} = $self->{column};
1854     $self->{column}++;
1855     $self->{nc}
1856     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1857     } else {
1858     $self->{set_nc}->($self);
1859     }
1860    
1861    
1862     return ($self->{ct}); # start tag or end tag
1863    
1864     redo A;
1865     } elsif ($self->{nc} == -1) {
1866     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag');
1867     if ($self->{ct}->{type} == START_TAG_TOKEN) {
1868    
1869     $self->{last_stag_name} = $self->{ct}->{tag_name};
1870     } elsif ($self->{ct}->{type} == END_TAG_TOKEN) {
1871     if ($self->{ct}->{attributes}) {
1872    
1873     $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute');
1874     } else {
1875     ## NOTE: This state should never be reached.
1876    
1877     }
1878     } else {
1879     die "$0: $self->{ct}->{type}: Unknown token type";
1880     }
1881     $self->{state} = DATA_STATE;
1882     ## Reconsume.
1883     return ($self->{ct}); # start tag or end tag
1884     redo A;
1885     } else {
1886    
1887     $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc');
1888     ## TODO: This error type is wrong.
1889     $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE;
1890     ## Reconsume.
1891     redo A;
1892     }
1893     } elsif ($self->{state} == BOGUS_COMMENT_STATE) {
1894     ## (only happen if PCDATA state)
1895    
1896     ## NOTE: Unlike spec's "bogus comment state", this implementation
1897     ## consumes characters one-by-one basis.
1898    
1899     if ($self->{nc} == 0x003E) { # >
1900    
1901     $self->{state} = DATA_STATE;
1902    
1903     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1904     $self->{line_prev} = $self->{line};
1905     $self->{column_prev} = $self->{column};
1906     $self->{column}++;
1907     $self->{nc}
1908     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1909     } else {
1910     $self->{set_nc}->($self);
1911     }
1912    
1913    
1914     return ($self->{ct}); # comment
1915     redo A;
1916     } elsif ($self->{nc} == -1) {
1917    
1918     $self->{state} = DATA_STATE;
1919     ## reconsume
1920    
1921     return ($self->{ct}); # comment
1922     redo A;
1923     } else {
1924    
1925     $self->{ct}->{data} .= chr ($self->{nc}); # comment
1926     $self->{read_until}->($self->{ct}->{data},
1927     q[>],
1928     length $self->{ct}->{data});
1929    
1930     ## Stay in the state.
1931    
1932     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1933     $self->{line_prev} = $self->{line};
1934     $self->{column_prev} = $self->{column};
1935     $self->{column}++;
1936     $self->{nc}
1937     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1938     } else {
1939     $self->{set_nc}->($self);
1940     }
1941    
1942     redo A;
1943     }
1944     } elsif ($self->{state} == MARKUP_DECLARATION_OPEN_STATE) {
1945     ## (only happen if PCDATA state)
1946    
1947     if ($self->{nc} == 0x002D) { # -
1948    
1949     $self->{state} = MD_HYPHEN_STATE;
1950    
1951     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1952     $self->{line_prev} = $self->{line};
1953     $self->{column_prev} = $self->{column};
1954     $self->{column}++;
1955     $self->{nc}
1956     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1957     } else {
1958     $self->{set_nc}->($self);
1959     }
1960    
1961     redo A;
1962     } elsif ($self->{nc} == 0x0044 or # D
1963     $self->{nc} == 0x0064) { # d
1964     ## ASCII case-insensitive.
1965    
1966     $self->{state} = MD_DOCTYPE_STATE;
1967     $self->{s_kwd} = chr $self->{nc};
1968    
1969     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1970     $self->{line_prev} = $self->{line};
1971     $self->{column_prev} = $self->{column};
1972     $self->{column}++;
1973     $self->{nc}
1974     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1975     } else {
1976     $self->{set_nc}->($self);
1977     }
1978    
1979     redo A;
1980     } elsif ($self->{insertion_mode} & IN_FOREIGN_CONTENT_IM and
1981     $self->{open_elements}->[-1]->[1] & FOREIGN_EL and
1982     $self->{nc} == 0x005B) { # [
1983    
1984     $self->{state} = MD_CDATA_STATE;
1985     $self->{s_kwd} = '[';
1986    
1987     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
1988     $self->{line_prev} = $self->{line};
1989     $self->{column_prev} = $self->{column};
1990     $self->{column}++;
1991     $self->{nc}
1992     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
1993     } else {
1994     $self->{set_nc}->($self);
1995     }
1996    
1997     redo A;
1998     } else {
1999    
2000     }
2001    
2002     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2003     line => $self->{line_prev},
2004     column => $self->{column_prev} - 1);
2005     ## Reconsume.
2006     $self->{state} = BOGUS_COMMENT_STATE;
2007     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2008     line => $self->{line_prev},
2009     column => $self->{column_prev} - 1,
2010     };
2011     redo A;
2012     } elsif ($self->{state} == MD_HYPHEN_STATE) {
2013     if ($self->{nc} == 0x002D) { # -
2014    
2015     $self->{ct} = {type => COMMENT_TOKEN, data => '',
2016     line => $self->{line_prev},
2017     column => $self->{column_prev} - 2,
2018     };
2019     $self->{state} = COMMENT_START_STATE;
2020    
2021     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2022     $self->{line_prev} = $self->{line};
2023     $self->{column_prev} = $self->{column};
2024     $self->{column}++;
2025     $self->{nc}
2026     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2027     } else {
2028     $self->{set_nc}->($self);
2029     }
2030    
2031     redo A;
2032     } else {
2033    
2034     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2035     line => $self->{line_prev},
2036     column => $self->{column_prev} - 2);
2037     $self->{state} = BOGUS_COMMENT_STATE;
2038     ## Reconsume.
2039     $self->{ct} = {type => COMMENT_TOKEN,
2040     data => '-',
2041     line => $self->{line_prev},
2042     column => $self->{column_prev} - 2,
2043     };
2044     redo A;
2045     }
2046     } elsif ($self->{state} == MD_DOCTYPE_STATE) {
2047     ## ASCII case-insensitive.
2048     if ($self->{nc} == [
2049     undef,
2050     0x004F, # O
2051     0x0043, # C
2052     0x0054, # T
2053     0x0059, # Y
2054     0x0050, # P
2055     ]->[length $self->{s_kwd}] or
2056     $self->{nc} == [
2057     undef,
2058     0x006F, # o
2059     0x0063, # c
2060     0x0074, # t
2061     0x0079, # y
2062     0x0070, # p
2063     ]->[length $self->{s_kwd}]) {
2064    
2065     ## Stay in the state.
2066     $self->{s_kwd} .= chr $self->{nc};
2067    
2068     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2069     $self->{line_prev} = $self->{line};
2070     $self->{column_prev} = $self->{column};
2071     $self->{column}++;
2072     $self->{nc}
2073     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2074     } else {
2075     $self->{set_nc}->($self);
2076     }
2077    
2078     redo A;
2079     } elsif ((length $self->{s_kwd}) == 6 and
2080     ($self->{nc} == 0x0045 or # E
2081     $self->{nc} == 0x0065)) { # e
2082    
2083     $self->{state} = DOCTYPE_STATE;
2084     $self->{ct} = {type => DOCTYPE_TOKEN,
2085     quirks => 1,
2086     line => $self->{line_prev},
2087     column => $self->{column_prev} - 7,
2088     };
2089    
2090     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2091     $self->{line_prev} = $self->{line};
2092     $self->{column_prev} = $self->{column};
2093     $self->{column}++;
2094     $self->{nc}
2095     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2096     } else {
2097     $self->{set_nc}->($self);
2098     }
2099    
2100     redo A;
2101     } else {
2102    
2103     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2104     line => $self->{line_prev},
2105     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2106     $self->{state} = BOGUS_COMMENT_STATE;
2107     ## Reconsume.
2108     $self->{ct} = {type => COMMENT_TOKEN,
2109     data => $self->{s_kwd},
2110     line => $self->{line_prev},
2111     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2112     };
2113     redo A;
2114     }
2115     } elsif ($self->{state} == MD_CDATA_STATE) {
2116     if ($self->{nc} == {
2117     '[' => 0x0043, # C
2118     '[C' => 0x0044, # D
2119     '[CD' => 0x0041, # A
2120     '[CDA' => 0x0054, # T
2121     '[CDAT' => 0x0041, # A
2122     }->{$self->{s_kwd}}) {
2123    
2124     ## Stay in the state.
2125     $self->{s_kwd} .= chr $self->{nc};
2126    
2127     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2128     $self->{line_prev} = $self->{line};
2129     $self->{column_prev} = $self->{column};
2130     $self->{column}++;
2131     $self->{nc}
2132     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2133     } else {
2134     $self->{set_nc}->($self);
2135     }
2136    
2137     redo A;
2138     } elsif ($self->{s_kwd} eq '[CDATA' and
2139     $self->{nc} == 0x005B) { # [
2140    
2141     $self->{ct} = {type => CHARACTER_TOKEN,
2142     data => '',
2143     line => $self->{line_prev},
2144     column => $self->{column_prev} - 7};
2145     $self->{state} = CDATA_SECTION_STATE;
2146    
2147     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2148     $self->{line_prev} = $self->{line};
2149     $self->{column_prev} = $self->{column};
2150     $self->{column}++;
2151     $self->{nc}
2152     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2153     } else {
2154     $self->{set_nc}->($self);
2155     }
2156    
2157     redo A;
2158     } else {
2159    
2160     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment',
2161     line => $self->{line_prev},
2162     column => $self->{column_prev} - 1 - length $self->{s_kwd});
2163     $self->{state} = BOGUS_COMMENT_STATE;
2164     ## Reconsume.
2165     $self->{ct} = {type => COMMENT_TOKEN,
2166     data => $self->{s_kwd},
2167     line => $self->{line_prev},
2168     column => $self->{column_prev} - 1 - length $self->{s_kwd},
2169     };
2170     redo A;
2171     }
2172     } elsif ($self->{state} == COMMENT_START_STATE) {
2173     if ($self->{nc} == 0x002D) { # -
2174    
2175     $self->{state} = COMMENT_START_DASH_STATE;
2176    
2177     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2178     $self->{line_prev} = $self->{line};
2179     $self->{column_prev} = $self->{column};
2180     $self->{column}++;
2181     $self->{nc}
2182     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2183     } else {
2184     $self->{set_nc}->($self);
2185     }
2186    
2187     redo A;
2188     } elsif ($self->{nc} == 0x003E) { # >
2189    
2190     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2191     $self->{state} = DATA_STATE;
2192    
2193     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2194     $self->{line_prev} = $self->{line};
2195     $self->{column_prev} = $self->{column};
2196     $self->{column}++;
2197     $self->{nc}
2198     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2199     } else {
2200     $self->{set_nc}->($self);
2201     }
2202    
2203    
2204     return ($self->{ct}); # comment
2205    
2206     redo A;
2207     } elsif ($self->{nc} == -1) {
2208    
2209     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2210     $self->{state} = DATA_STATE;
2211     ## reconsume
2212    
2213     return ($self->{ct}); # comment
2214    
2215     redo A;
2216     } else {
2217    
2218     $self->{ct}->{data} # comment
2219     .= chr ($self->{nc});
2220     $self->{state} = COMMENT_STATE;
2221    
2222     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2223     $self->{line_prev} = $self->{line};
2224     $self->{column_prev} = $self->{column};
2225     $self->{column}++;
2226     $self->{nc}
2227     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2228     } else {
2229     $self->{set_nc}->($self);
2230     }
2231    
2232     redo A;
2233     }
2234     } elsif ($self->{state} == COMMENT_START_DASH_STATE) {
2235     if ($self->{nc} == 0x002D) { # -
2236    
2237     $self->{state} = COMMENT_END_STATE;
2238    
2239     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2240     $self->{line_prev} = $self->{line};
2241     $self->{column_prev} = $self->{column};
2242     $self->{column}++;
2243     $self->{nc}
2244     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2245     } else {
2246     $self->{set_nc}->($self);
2247     }
2248    
2249     redo A;
2250     } elsif ($self->{nc} == 0x003E) { # >
2251    
2252     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment');
2253     $self->{state} = DATA_STATE;
2254    
2255     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2256     $self->{line_prev} = $self->{line};
2257     $self->{column_prev} = $self->{column};
2258     $self->{column}++;
2259     $self->{nc}
2260     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2261     } else {
2262     $self->{set_nc}->($self);
2263     }
2264    
2265    
2266     return ($self->{ct}); # comment
2267    
2268     redo A;
2269     } elsif ($self->{nc} == -1) {
2270    
2271     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2272     $self->{state} = DATA_STATE;
2273     ## reconsume
2274    
2275     return ($self->{ct}); # comment
2276    
2277     redo A;
2278     } else {
2279    
2280     $self->{ct}->{data} # comment
2281     .= '-' . chr ($self->{nc});
2282     $self->{state} = COMMENT_STATE;
2283    
2284     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2285     $self->{line_prev} = $self->{line};
2286     $self->{column_prev} = $self->{column};
2287     $self->{column}++;
2288     $self->{nc}
2289     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2290     } else {
2291     $self->{set_nc}->($self);
2292     }
2293    
2294     redo A;
2295     }
2296     } elsif ($self->{state} == COMMENT_STATE) {
2297     if ($self->{nc} == 0x002D) { # -
2298    
2299     $self->{state} = COMMENT_END_DASH_STATE;
2300    
2301     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2302     $self->{line_prev} = $self->{line};
2303     $self->{column_prev} = $self->{column};
2304     $self->{column}++;
2305     $self->{nc}
2306     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2307     } else {
2308     $self->{set_nc}->($self);
2309     }
2310    
2311     redo A;
2312     } elsif ($self->{nc} == -1) {
2313    
2314     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2315     $self->{state} = DATA_STATE;
2316     ## reconsume
2317    
2318     return ($self->{ct}); # comment
2319    
2320     redo A;
2321     } else {
2322    
2323     $self->{ct}->{data} .= chr ($self->{nc}); # comment
2324     $self->{read_until}->($self->{ct}->{data},
2325     q[-],
2326     length $self->{ct}->{data});
2327    
2328     ## Stay in the state
2329    
2330     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2331     $self->{line_prev} = $self->{line};
2332     $self->{column_prev} = $self->{column};
2333     $self->{column}++;
2334     $self->{nc}
2335     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2336     } else {
2337     $self->{set_nc}->($self);
2338     }
2339    
2340     redo A;
2341     }
2342     } elsif ($self->{state} == COMMENT_END_DASH_STATE) {
2343     if ($self->{nc} == 0x002D) { # -
2344    
2345     $self->{state} = COMMENT_END_STATE;
2346    
2347     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2348     $self->{line_prev} = $self->{line};
2349     $self->{column_prev} = $self->{column};
2350     $self->{column}++;
2351     $self->{nc}
2352     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2353     } else {
2354     $self->{set_nc}->($self);
2355     }
2356    
2357     redo A;
2358     } elsif ($self->{nc} == -1) {
2359    
2360     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2361     $self->{state} = DATA_STATE;
2362     ## reconsume
2363    
2364     return ($self->{ct}); # comment
2365    
2366     redo A;
2367     } else {
2368    
2369     $self->{ct}->{data} .= '-' . chr ($self->{nc}); # comment
2370     $self->{state} = COMMENT_STATE;
2371    
2372     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2373     $self->{line_prev} = $self->{line};
2374     $self->{column_prev} = $self->{column};
2375     $self->{column}++;
2376     $self->{nc}
2377     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2378     } else {
2379     $self->{set_nc}->($self);
2380     }
2381    
2382     redo A;
2383     }
2384     } elsif ($self->{state} == COMMENT_END_STATE) {
2385     if ($self->{nc} == 0x003E) { # >
2386    
2387     $self->{state} = DATA_STATE;
2388    
2389     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2390     $self->{line_prev} = $self->{line};
2391     $self->{column_prev} = $self->{column};
2392     $self->{column}++;
2393     $self->{nc}
2394     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2395     } else {
2396     $self->{set_nc}->($self);
2397     }
2398    
2399    
2400     return ($self->{ct}); # comment
2401    
2402     redo A;
2403     } elsif ($self->{nc} == 0x002D) { # -
2404    
2405     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2406     line => $self->{line_prev},
2407     column => $self->{column_prev});
2408     $self->{ct}->{data} .= '-'; # comment
2409     ## Stay in the state
2410    
2411     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2412     $self->{line_prev} = $self->{line};
2413     $self->{column_prev} = $self->{column};
2414     $self->{column}++;
2415     $self->{nc}
2416     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2417     } else {
2418     $self->{set_nc}->($self);
2419     }
2420    
2421     redo A;
2422     } elsif ($self->{nc} == -1) {
2423    
2424     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment');
2425     $self->{state} = DATA_STATE;
2426     ## reconsume
2427    
2428     return ($self->{ct}); # comment
2429    
2430     redo A;
2431     } else {
2432    
2433     $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment',
2434     line => $self->{line_prev},
2435     column => $self->{column_prev});
2436     $self->{ct}->{data} .= '--' . chr ($self->{nc}); # comment
2437     $self->{state} = COMMENT_STATE;
2438    
2439     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2440     $self->{line_prev} = $self->{line};
2441     $self->{column_prev} = $self->{column};
2442     $self->{column}++;
2443     $self->{nc}
2444     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2445     } else {
2446     $self->{set_nc}->($self);
2447     }
2448    
2449     redo A;
2450     }
2451     } elsif ($self->{state} == DOCTYPE_STATE) {
2452     if ($is_space->{$self->{nc}}) {
2453    
2454     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2455    
2456     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2457     $self->{line_prev} = $self->{line};
2458     $self->{column_prev} = $self->{column};
2459     $self->{column}++;
2460     $self->{nc}
2461     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2462     } else {
2463     $self->{set_nc}->($self);
2464     }
2465    
2466     redo A;
2467     } else {
2468    
2469     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name');
2470     $self->{state} = BEFORE_DOCTYPE_NAME_STATE;
2471     ## reconsume
2472     redo A;
2473     }
2474     } elsif ($self->{state} == BEFORE_DOCTYPE_NAME_STATE) {
2475     if ($is_space->{$self->{nc}}) {
2476    
2477     ## Stay in the state
2478    
2479     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2480     $self->{line_prev} = $self->{line};
2481     $self->{column_prev} = $self->{column};
2482     $self->{column}++;
2483     $self->{nc}
2484     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2485     } else {
2486     $self->{set_nc}->($self);
2487     }
2488    
2489     redo A;
2490     } elsif ($self->{nc} == 0x003E) { # >
2491    
2492     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2493     $self->{state} = DATA_STATE;
2494    
2495     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2496     $self->{line_prev} = $self->{line};
2497     $self->{column_prev} = $self->{column};
2498     $self->{column}++;
2499     $self->{nc}
2500     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2501     } else {
2502     $self->{set_nc}->($self);
2503     }
2504    
2505    
2506     return ($self->{ct}); # DOCTYPE (quirks)
2507    
2508     redo A;
2509     } elsif ($self->{nc} == -1) {
2510    
2511     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name');
2512     $self->{state} = DATA_STATE;
2513     ## reconsume
2514    
2515     return ($self->{ct}); # DOCTYPE (quirks)
2516    
2517     redo A;
2518     } else {
2519    
2520     $self->{ct}->{name} = chr $self->{nc};
2521     delete $self->{ct}->{quirks};
2522     $self->{state} = DOCTYPE_NAME_STATE;
2523    
2524     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2525     $self->{line_prev} = $self->{line};
2526     $self->{column_prev} = $self->{column};
2527     $self->{column}++;
2528     $self->{nc}
2529     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2530     } else {
2531     $self->{set_nc}->($self);
2532     }
2533    
2534     redo A;
2535     }
2536     } elsif ($self->{state} == DOCTYPE_NAME_STATE) {
2537     ## ISSUE: Redundant "First," in the spec.
2538     if ($is_space->{$self->{nc}}) {
2539    
2540     $self->{state} = AFTER_DOCTYPE_NAME_STATE;
2541    
2542     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2543     $self->{line_prev} = $self->{line};
2544     $self->{column_prev} = $self->{column};
2545     $self->{column}++;
2546     $self->{nc}
2547     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2548     } else {
2549     $self->{set_nc}->($self);
2550     }
2551    
2552     redo A;
2553     } elsif ($self->{nc} == 0x003E) { # >
2554    
2555     $self->{state} = DATA_STATE;
2556    
2557     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2558     $self->{line_prev} = $self->{line};
2559     $self->{column_prev} = $self->{column};
2560     $self->{column}++;
2561     $self->{nc}
2562     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2563     } else {
2564     $self->{set_nc}->($self);
2565     }
2566    
2567    
2568     return ($self->{ct}); # DOCTYPE
2569    
2570     redo A;
2571     } elsif ($self->{nc} == -1) {
2572    
2573     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2574     $self->{state} = DATA_STATE;
2575     ## reconsume
2576    
2577     $self->{ct}->{quirks} = 1;
2578     return ($self->{ct}); # DOCTYPE
2579    
2580     redo A;
2581     } else {
2582    
2583     $self->{ct}->{name}
2584     .= chr ($self->{nc}); # DOCTYPE
2585     ## Stay in the state
2586    
2587     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2588     $self->{line_prev} = $self->{line};
2589     $self->{column_prev} = $self->{column};
2590     $self->{column}++;
2591     $self->{nc}
2592     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2593     } else {
2594     $self->{set_nc}->($self);
2595     }
2596    
2597     redo A;
2598     }
2599     } elsif ($self->{state} == AFTER_DOCTYPE_NAME_STATE) {
2600     if ($is_space->{$self->{nc}}) {
2601    
2602     ## Stay in the state
2603    
2604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2605     $self->{line_prev} = $self->{line};
2606     $self->{column_prev} = $self->{column};
2607     $self->{column}++;
2608     $self->{nc}
2609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2610     } else {
2611     $self->{set_nc}->($self);
2612     }
2613    
2614     redo A;
2615     } elsif ($self->{nc} == 0x003E) { # >
2616    
2617     $self->{state} = DATA_STATE;
2618    
2619     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2620     $self->{line_prev} = $self->{line};
2621     $self->{column_prev} = $self->{column};
2622     $self->{column}++;
2623     $self->{nc}
2624     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2625     } else {
2626     $self->{set_nc}->($self);
2627     }
2628    
2629    
2630     return ($self->{ct}); # DOCTYPE
2631    
2632     redo A;
2633     } elsif ($self->{nc} == -1) {
2634    
2635     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2636     $self->{state} = DATA_STATE;
2637     ## reconsume
2638    
2639     $self->{ct}->{quirks} = 1;
2640     return ($self->{ct}); # DOCTYPE
2641    
2642     redo A;
2643     } elsif ($self->{nc} == 0x0050 or # P
2644     $self->{nc} == 0x0070) { # p
2645     $self->{state} = PUBLIC_STATE;
2646     $self->{s_kwd} = chr $self->{nc};
2647    
2648     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2649     $self->{line_prev} = $self->{line};
2650     $self->{column_prev} = $self->{column};
2651     $self->{column}++;
2652     $self->{nc}
2653     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2654     } else {
2655     $self->{set_nc}->($self);
2656     }
2657    
2658     redo A;
2659     } elsif ($self->{nc} == 0x0053 or # S
2660     $self->{nc} == 0x0073) { # s
2661     $self->{state} = SYSTEM_STATE;
2662     $self->{s_kwd} = chr $self->{nc};
2663    
2664     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2665     $self->{line_prev} = $self->{line};
2666     $self->{column_prev} = $self->{column};
2667     $self->{column}++;
2668     $self->{nc}
2669     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2670     } else {
2671     $self->{set_nc}->($self);
2672     }
2673    
2674     redo A;
2675     } else {
2676    
2677     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name');
2678     $self->{ct}->{quirks} = 1;
2679    
2680     $self->{state} = BOGUS_DOCTYPE_STATE;
2681    
2682     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2683     $self->{line_prev} = $self->{line};
2684     $self->{column_prev} = $self->{column};
2685     $self->{column}++;
2686     $self->{nc}
2687     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2688     } else {
2689     $self->{set_nc}->($self);
2690     }
2691    
2692     redo A;
2693     }
2694     } elsif ($self->{state} == PUBLIC_STATE) {
2695     ## ASCII case-insensitive
2696     if ($self->{nc} == [
2697     undef,
2698     0x0055, # U
2699     0x0042, # B
2700     0x004C, # L
2701     0x0049, # I
2702     ]->[length $self->{s_kwd}] or
2703     $self->{nc} == [
2704     undef,
2705     0x0075, # u
2706     0x0062, # b
2707     0x006C, # l
2708     0x0069, # i
2709     ]->[length $self->{s_kwd}]) {
2710    
2711     ## Stay in the state.
2712     $self->{s_kwd} .= chr $self->{nc};
2713    
2714     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2715     $self->{line_prev} = $self->{line};
2716     $self->{column_prev} = $self->{column};
2717     $self->{column}++;
2718     $self->{nc}
2719     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2720     } else {
2721     $self->{set_nc}->($self);
2722     }
2723    
2724     redo A;
2725     } elsif ((length $self->{s_kwd}) == 5 and
2726     ($self->{nc} == 0x0043 or # C
2727     $self->{nc} == 0x0063)) { # c
2728    
2729     $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2730    
2731     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2732     $self->{line_prev} = $self->{line};
2733     $self->{column_prev} = $self->{column};
2734     $self->{column}++;
2735     $self->{nc}
2736     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2737     } else {
2738     $self->{set_nc}->($self);
2739     }
2740    
2741     redo A;
2742     } else {
2743    
2744     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2745     line => $self->{line_prev},
2746     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2747     $self->{ct}->{quirks} = 1;
2748    
2749     $self->{state} = BOGUS_DOCTYPE_STATE;
2750     ## Reconsume.
2751     redo A;
2752     }
2753     } elsif ($self->{state} == SYSTEM_STATE) {
2754     ## ASCII case-insensitive
2755     if ($self->{nc} == [
2756     undef,
2757     0x0059, # Y
2758     0x0053, # S
2759     0x0054, # T
2760     0x0045, # E
2761     ]->[length $self->{s_kwd}] or
2762     $self->{nc} == [
2763     undef,
2764     0x0079, # y
2765     0x0073, # s
2766     0x0074, # t
2767     0x0065, # e
2768     ]->[length $self->{s_kwd}]) {
2769    
2770     ## Stay in the state.
2771     $self->{s_kwd} .= chr $self->{nc};
2772    
2773     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2774     $self->{line_prev} = $self->{line};
2775     $self->{column_prev} = $self->{column};
2776     $self->{column}++;
2777     $self->{nc}
2778     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2779     } else {
2780     $self->{set_nc}->($self);
2781     }
2782    
2783     redo A;
2784     } elsif ((length $self->{s_kwd}) == 5 and
2785     ($self->{nc} == 0x004D or # M
2786     $self->{nc} == 0x006D)) { # m
2787    
2788     $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
2789    
2790     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2791     $self->{line_prev} = $self->{line};
2792     $self->{column_prev} = $self->{column};
2793     $self->{column}++;
2794     $self->{nc}
2795     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2796     } else {
2797     $self->{set_nc}->($self);
2798     }
2799    
2800     redo A;
2801     } else {
2802    
2803     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name',
2804     line => $self->{line_prev},
2805     column => $self->{column_prev} + 1 - length $self->{s_kwd});
2806     $self->{ct}->{quirks} = 1;
2807    
2808     $self->{state} = BOGUS_DOCTYPE_STATE;
2809     ## Reconsume.
2810     redo A;
2811     }
2812     } elsif ($self->{state} == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
2813     if ($is_space->{$self->{nc}}) {
2814    
2815     ## Stay in the state
2816    
2817     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2818     $self->{line_prev} = $self->{line};
2819     $self->{column_prev} = $self->{column};
2820     $self->{column}++;
2821     $self->{nc}
2822     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2823     } else {
2824     $self->{set_nc}->($self);
2825     }
2826    
2827     redo A;
2828     } elsif ($self->{nc} eq 0x0022) { # "
2829    
2830     $self->{ct}->{pubid} = ''; # DOCTYPE
2831     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
2832    
2833     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2834     $self->{line_prev} = $self->{line};
2835     $self->{column_prev} = $self->{column};
2836     $self->{column}++;
2837     $self->{nc}
2838     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2839     } else {
2840     $self->{set_nc}->($self);
2841     }
2842    
2843     redo A;
2844     } elsif ($self->{nc} eq 0x0027) { # '
2845    
2846     $self->{ct}->{pubid} = ''; # DOCTYPE
2847     $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
2848    
2849     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2850     $self->{line_prev} = $self->{line};
2851     $self->{column_prev} = $self->{column};
2852     $self->{column}++;
2853     $self->{nc}
2854     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2855     } else {
2856     $self->{set_nc}->($self);
2857     }
2858    
2859     redo A;
2860     } elsif ($self->{nc} eq 0x003E) { # >
2861    
2862     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal');
2863    
2864     $self->{state} = DATA_STATE;
2865    
2866     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2867     $self->{line_prev} = $self->{line};
2868     $self->{column_prev} = $self->{column};
2869     $self->{column}++;
2870     $self->{nc}
2871     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2872     } else {
2873     $self->{set_nc}->($self);
2874     }
2875    
2876    
2877     $self->{ct}->{quirks} = 1;
2878     return ($self->{ct}); # DOCTYPE
2879    
2880     redo A;
2881     } elsif ($self->{nc} == -1) {
2882    
2883     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
2884    
2885     $self->{state} = DATA_STATE;
2886     ## reconsume
2887    
2888     $self->{ct}->{quirks} = 1;
2889     return ($self->{ct}); # DOCTYPE
2890    
2891     redo A;
2892     } else {
2893    
2894     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC');
2895     $self->{ct}->{quirks} = 1;
2896    
2897     $self->{state} = BOGUS_DOCTYPE_STATE;
2898    
2899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2900     $self->{line_prev} = $self->{line};
2901     $self->{column_prev} = $self->{column};
2902     $self->{column}++;
2903     $self->{nc}
2904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2905     } else {
2906     $self->{set_nc}->($self);
2907     }
2908    
2909     redo A;
2910     }
2911     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) {
2912     if ($self->{nc} == 0x0022) { # "
2913    
2914     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2915    
2916     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2917     $self->{line_prev} = $self->{line};
2918     $self->{column_prev} = $self->{column};
2919     $self->{column}++;
2920     $self->{nc}
2921     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2922     } else {
2923     $self->{set_nc}->($self);
2924     }
2925    
2926     redo A;
2927     } elsif ($self->{nc} == 0x003E) { # >
2928    
2929     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2930    
2931     $self->{state} = DATA_STATE;
2932    
2933     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2934     $self->{line_prev} = $self->{line};
2935     $self->{column_prev} = $self->{column};
2936     $self->{column}++;
2937     $self->{nc}
2938     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2939     } else {
2940     $self->{set_nc}->($self);
2941     }
2942    
2943    
2944     $self->{ct}->{quirks} = 1;
2945     return ($self->{ct}); # DOCTYPE
2946    
2947     redo A;
2948     } elsif ($self->{nc} == -1) {
2949    
2950     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2951    
2952     $self->{state} = DATA_STATE;
2953     ## reconsume
2954    
2955     $self->{ct}->{quirks} = 1;
2956     return ($self->{ct}); # DOCTYPE
2957    
2958     redo A;
2959     } else {
2960    
2961     $self->{ct}->{pubid} # DOCTYPE
2962     .= chr $self->{nc};
2963     $self->{read_until}->($self->{ct}->{pubid}, q[">],
2964     length $self->{ct}->{pubid});
2965    
2966     ## Stay in the state
2967    
2968     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2969     $self->{line_prev} = $self->{line};
2970     $self->{column_prev} = $self->{column};
2971     $self->{column}++;
2972     $self->{nc}
2973     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2974     } else {
2975     $self->{set_nc}->($self);
2976     }
2977    
2978     redo A;
2979     }
2980     } elsif ($self->{state} == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) {
2981     if ($self->{nc} == 0x0027) { # '
2982    
2983     $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
2984    
2985     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
2986     $self->{line_prev} = $self->{line};
2987     $self->{column_prev} = $self->{column};
2988     $self->{column}++;
2989     $self->{nc}
2990     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
2991     } else {
2992     $self->{set_nc}->($self);
2993     }
2994    
2995     redo A;
2996     } elsif ($self->{nc} == 0x003E) { # >
2997    
2998     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
2999    
3000     $self->{state} = DATA_STATE;
3001    
3002     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3003     $self->{line_prev} = $self->{line};
3004     $self->{column_prev} = $self->{column};
3005     $self->{column}++;
3006     $self->{nc}
3007     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3008     } else {
3009     $self->{set_nc}->($self);
3010     }
3011    
3012    
3013     $self->{ct}->{quirks} = 1;
3014     return ($self->{ct}); # DOCTYPE
3015    
3016     redo A;
3017     } elsif ($self->{nc} == -1) {
3018    
3019     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal');
3020    
3021     $self->{state} = DATA_STATE;
3022     ## reconsume
3023    
3024     $self->{ct}->{quirks} = 1;
3025     return ($self->{ct}); # DOCTYPE
3026    
3027     redo A;
3028     } else {
3029    
3030     $self->{ct}->{pubid} # DOCTYPE
3031     .= chr $self->{nc};
3032     $self->{read_until}->($self->{ct}->{pubid}, q['>],
3033     length $self->{ct}->{pubid});
3034    
3035     ## Stay in the state
3036    
3037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3038     $self->{line_prev} = $self->{line};
3039     $self->{column_prev} = $self->{column};
3040     $self->{column}++;
3041     $self->{nc}
3042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3043     } else {
3044     $self->{set_nc}->($self);
3045     }
3046    
3047     redo A;
3048     }
3049     } elsif ($self->{state} == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) {
3050     if ($is_space->{$self->{nc}}) {
3051    
3052     ## Stay in the state
3053    
3054     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3055     $self->{line_prev} = $self->{line};
3056     $self->{column_prev} = $self->{column};
3057     $self->{column}++;
3058     $self->{nc}
3059     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3060     } else {
3061     $self->{set_nc}->($self);
3062     }
3063    
3064     redo A;
3065     } elsif ($self->{nc} == 0x0022) { # "
3066    
3067     $self->{ct}->{sysid} = ''; # DOCTYPE
3068     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3069    
3070     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3071     $self->{line_prev} = $self->{line};
3072     $self->{column_prev} = $self->{column};
3073     $self->{column}++;
3074     $self->{nc}
3075     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3076     } else {
3077     $self->{set_nc}->($self);
3078     }
3079    
3080     redo A;
3081     } elsif ($self->{nc} == 0x0027) { # '
3082    
3083     $self->{ct}->{sysid} = ''; # DOCTYPE
3084     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3085    
3086     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3087     $self->{line_prev} = $self->{line};
3088     $self->{column_prev} = $self->{column};
3089     $self->{column}++;
3090     $self->{nc}
3091     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3092     } else {
3093     $self->{set_nc}->($self);
3094     }
3095    
3096     redo A;
3097     } elsif ($self->{nc} == 0x003E) { # >
3098    
3099     $self->{state} = DATA_STATE;
3100    
3101     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3102     $self->{line_prev} = $self->{line};
3103     $self->{column_prev} = $self->{column};
3104     $self->{column}++;
3105     $self->{nc}
3106     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3107     } else {
3108     $self->{set_nc}->($self);
3109     }
3110    
3111    
3112     return ($self->{ct}); # DOCTYPE
3113    
3114     redo A;
3115     } elsif ($self->{nc} == -1) {
3116    
3117     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3118    
3119     $self->{state} = DATA_STATE;
3120     ## reconsume
3121    
3122     $self->{ct}->{quirks} = 1;
3123     return ($self->{ct}); # DOCTYPE
3124    
3125     redo A;
3126     } else {
3127    
3128     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal');
3129     $self->{ct}->{quirks} = 1;
3130    
3131     $self->{state} = BOGUS_DOCTYPE_STATE;
3132    
3133     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3134     $self->{line_prev} = $self->{line};
3135     $self->{column_prev} = $self->{column};
3136     $self->{column}++;
3137     $self->{nc}
3138     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3139     } else {
3140     $self->{set_nc}->($self);
3141     }
3142    
3143     redo A;
3144     }
3145     } elsif ($self->{state} == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3146     if ($is_space->{$self->{nc}}) {
3147    
3148     ## Stay in the state
3149    
3150     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3151     $self->{line_prev} = $self->{line};
3152     $self->{column_prev} = $self->{column};
3153     $self->{column}++;
3154     $self->{nc}
3155     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3156     } else {
3157     $self->{set_nc}->($self);
3158     }
3159    
3160     redo A;
3161     } elsif ($self->{nc} == 0x0022) { # "
3162    
3163     $self->{ct}->{sysid} = ''; # DOCTYPE
3164     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
3165    
3166     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3167     $self->{line_prev} = $self->{line};
3168     $self->{column_prev} = $self->{column};
3169     $self->{column}++;
3170     $self->{nc}
3171     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3172     } else {
3173     $self->{set_nc}->($self);
3174     }
3175    
3176     redo A;
3177     } elsif ($self->{nc} == 0x0027) { # '
3178    
3179     $self->{ct}->{sysid} = ''; # DOCTYPE
3180     $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
3181    
3182     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3183     $self->{line_prev} = $self->{line};
3184     $self->{column_prev} = $self->{column};
3185     $self->{column}++;
3186     $self->{nc}
3187     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3188     } else {
3189     $self->{set_nc}->($self);
3190     }
3191    
3192     redo A;
3193     } elsif ($self->{nc} == 0x003E) { # >
3194    
3195     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal');
3196     $self->{state} = DATA_STATE;
3197    
3198     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3199     $self->{line_prev} = $self->{line};
3200     $self->{column_prev} = $self->{column};
3201     $self->{column}++;
3202     $self->{nc}
3203     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3204     } else {
3205     $self->{set_nc}->($self);
3206     }
3207    
3208    
3209     $self->{ct}->{quirks} = 1;
3210     return ($self->{ct}); # DOCTYPE
3211    
3212     redo A;
3213     } elsif ($self->{nc} == -1) {
3214    
3215     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3216    
3217     $self->{state} = DATA_STATE;
3218     ## reconsume
3219    
3220     $self->{ct}->{quirks} = 1;
3221     return ($self->{ct}); # DOCTYPE
3222    
3223     redo A;
3224     } else {
3225    
3226     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM');
3227     $self->{ct}->{quirks} = 1;
3228    
3229     $self->{state} = BOGUS_DOCTYPE_STATE;
3230    
3231     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3232     $self->{line_prev} = $self->{line};
3233     $self->{column_prev} = $self->{column};
3234     $self->{column}++;
3235     $self->{nc}
3236     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3237     } else {
3238     $self->{set_nc}->($self);
3239     }
3240    
3241     redo A;
3242     }
3243     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) {
3244     if ($self->{nc} == 0x0022) { # "
3245    
3246     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3247    
3248     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3249     $self->{line_prev} = $self->{line};
3250     $self->{column_prev} = $self->{column};
3251     $self->{column}++;
3252     $self->{nc}
3253     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3254     } else {
3255     $self->{set_nc}->($self);
3256     }
3257    
3258     redo A;
3259     } elsif ($self->{nc} == 0x003E) { # >
3260    
3261     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3262    
3263     $self->{state} = DATA_STATE;
3264    
3265     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3266     $self->{line_prev} = $self->{line};
3267     $self->{column_prev} = $self->{column};
3268     $self->{column}++;
3269     $self->{nc}
3270     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3271     } else {
3272     $self->{set_nc}->($self);
3273     }
3274    
3275    
3276     $self->{ct}->{quirks} = 1;
3277     return ($self->{ct}); # DOCTYPE
3278    
3279     redo A;
3280     } elsif ($self->{nc} == -1) {
3281    
3282     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3283    
3284     $self->{state} = DATA_STATE;
3285     ## reconsume
3286    
3287     $self->{ct}->{quirks} = 1;
3288     return ($self->{ct}); # DOCTYPE
3289    
3290     redo A;
3291     } else {
3292    
3293     $self->{ct}->{sysid} # DOCTYPE
3294     .= chr $self->{nc};
3295     $self->{read_until}->($self->{ct}->{sysid}, q[">],
3296     length $self->{ct}->{sysid});
3297    
3298     ## Stay in the state
3299    
3300     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3301     $self->{line_prev} = $self->{line};
3302     $self->{column_prev} = $self->{column};
3303     $self->{column}++;
3304     $self->{nc}
3305     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3306     } else {
3307     $self->{set_nc}->($self);
3308     }
3309    
3310     redo A;
3311     }
3312     } elsif ($self->{state} == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) {
3313     if ($self->{nc} == 0x0027) { # '
3314    
3315     $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
3316    
3317     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3318     $self->{line_prev} = $self->{line};
3319     $self->{column_prev} = $self->{column};
3320     $self->{column}++;
3321     $self->{nc}
3322     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3323     } else {
3324     $self->{set_nc}->($self);
3325     }
3326    
3327     redo A;
3328     } elsif ($self->{nc} == 0x003E) { # >
3329    
3330     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3331    
3332     $self->{state} = DATA_STATE;
3333    
3334     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3335     $self->{line_prev} = $self->{line};
3336     $self->{column_prev} = $self->{column};
3337     $self->{column}++;
3338     $self->{nc}
3339     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3340     } else {
3341     $self->{set_nc}->($self);
3342     }
3343    
3344    
3345     $self->{ct}->{quirks} = 1;
3346     return ($self->{ct}); # DOCTYPE
3347    
3348     redo A;
3349     } elsif ($self->{nc} == -1) {
3350    
3351     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal');
3352    
3353     $self->{state} = DATA_STATE;
3354     ## reconsume
3355    
3356     $self->{ct}->{quirks} = 1;
3357     return ($self->{ct}); # DOCTYPE
3358    
3359     redo A;
3360     } else {
3361    
3362     $self->{ct}->{sysid} # DOCTYPE
3363     .= chr $self->{nc};
3364     $self->{read_until}->($self->{ct}->{sysid}, q['>],
3365     length $self->{ct}->{sysid});
3366    
3367     ## Stay in the state
3368    
3369     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3370     $self->{line_prev} = $self->{line};
3371     $self->{column_prev} = $self->{column};
3372     $self->{column}++;
3373     $self->{nc}
3374     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3375     } else {
3376     $self->{set_nc}->($self);
3377     }
3378    
3379     redo A;
3380     }
3381     } elsif ($self->{state} == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) {
3382     if ($is_space->{$self->{nc}}) {
3383    
3384     ## Stay in the state
3385    
3386     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3387     $self->{line_prev} = $self->{line};
3388     $self->{column_prev} = $self->{column};
3389     $self->{column}++;
3390     $self->{nc}
3391     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3392     } else {
3393     $self->{set_nc}->($self);
3394     }
3395    
3396     redo A;
3397     } elsif ($self->{nc} == 0x003E) { # >
3398    
3399     $self->{state} = DATA_STATE;
3400    
3401     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3402     $self->{line_prev} = $self->{line};
3403     $self->{column_prev} = $self->{column};
3404     $self->{column}++;
3405     $self->{nc}
3406     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3407     } else {
3408     $self->{set_nc}->($self);
3409     }
3410    
3411    
3412     return ($self->{ct}); # DOCTYPE
3413    
3414     redo A;
3415     } elsif ($self->{nc} == -1) {
3416    
3417     $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE');
3418     $self->{state} = DATA_STATE;
3419     ## reconsume
3420    
3421     $self->{ct}->{quirks} = 1;
3422     return ($self->{ct}); # DOCTYPE
3423    
3424     redo A;
3425     } else {
3426    
3427     $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal');
3428     #$self->{ct}->{quirks} = 1;
3429    
3430     $self->{state} = BOGUS_DOCTYPE_STATE;
3431    
3432     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3433     $self->{line_prev} = $self->{line};
3434     $self->{column_prev} = $self->{column};
3435     $self->{column}++;
3436     $self->{nc}
3437     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3438     } else {
3439     $self->{set_nc}->($self);
3440     }
3441    
3442     redo A;
3443     }
3444     } elsif ($self->{state} == BOGUS_DOCTYPE_STATE) {
3445     if ($self->{nc} == 0x003E) { # >
3446    
3447     $self->{state} = DATA_STATE;
3448    
3449     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3450     $self->{line_prev} = $self->{line};
3451     $self->{column_prev} = $self->{column};
3452     $self->{column}++;
3453     $self->{nc}
3454     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3455     } else {
3456     $self->{set_nc}->($self);
3457     }
3458    
3459    
3460     return ($self->{ct}); # DOCTYPE
3461    
3462     redo A;
3463     } elsif ($self->{nc} == -1) {
3464    
3465     $self->{state} = DATA_STATE;
3466     ## reconsume
3467    
3468     return ($self->{ct}); # DOCTYPE
3469    
3470     redo A;
3471     } else {
3472    
3473     my $s = '';
3474     $self->{read_until}->($s, q[>], 0);
3475    
3476     ## Stay in the state
3477    
3478     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3479     $self->{line_prev} = $self->{line};
3480     $self->{column_prev} = $self->{column};
3481     $self->{column}++;
3482     $self->{nc}
3483     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3484     } else {
3485     $self->{set_nc}->($self);
3486     }
3487    
3488     redo A;
3489     }
3490     } elsif ($self->{state} == CDATA_SECTION_STATE) {
3491     ## NOTE: "CDATA section state" in the state is jointly implemented
3492     ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|,
3493     ## and |CDATA_SECTION_MSE2_STATE|.
3494    
3495     if ($self->{nc} == 0x005D) { # ]
3496    
3497     $self->{state} = CDATA_SECTION_MSE1_STATE;
3498    
3499     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3500     $self->{line_prev} = $self->{line};
3501     $self->{column_prev} = $self->{column};
3502     $self->{column}++;
3503     $self->{nc}
3504     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3505     } else {
3506     $self->{set_nc}->($self);
3507     }
3508    
3509     redo A;
3510     } elsif ($self->{nc} == -1) {
3511     $self->{state} = DATA_STATE;
3512    
3513     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3514     $self->{line_prev} = $self->{line};
3515     $self->{column_prev} = $self->{column};
3516     $self->{column}++;
3517     $self->{nc}
3518     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3519     } else {
3520     $self->{set_nc}->($self);
3521     }
3522    
3523     if (length $self->{ct}->{data}) { # character
3524    
3525     return ($self->{ct}); # character
3526     } else {
3527    
3528     ## No token to emit. $self->{ct} is discarded.
3529     }
3530     redo A;
3531     } else {
3532    
3533     $self->{ct}->{data} .= chr $self->{nc};
3534     $self->{read_until}->($self->{ct}->{data},
3535     q<]>,
3536     length $self->{ct}->{data});
3537    
3538     ## Stay in the state.
3539    
3540     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3541     $self->{line_prev} = $self->{line};
3542     $self->{column_prev} = $self->{column};
3543     $self->{column}++;
3544     $self->{nc}
3545     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3546     } else {
3547     $self->{set_nc}->($self);
3548     }
3549    
3550     redo A;
3551     }
3552    
3553     ## ISSUE: "text tokens" in spec.
3554     } elsif ($self->{state} == CDATA_SECTION_MSE1_STATE) {
3555     if ($self->{nc} == 0x005D) { # ]
3556    
3557     $self->{state} = CDATA_SECTION_MSE2_STATE;
3558    
3559     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3560     $self->{line_prev} = $self->{line};
3561     $self->{column_prev} = $self->{column};
3562     $self->{column}++;
3563     $self->{nc}
3564     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3565     } else {
3566     $self->{set_nc}->($self);
3567     }
3568    
3569     redo A;
3570     } else {
3571    
3572     $self->{ct}->{data} .= ']';
3573     $self->{state} = CDATA_SECTION_STATE;
3574     ## Reconsume.
3575     redo A;
3576     }
3577     } elsif ($self->{state} == CDATA_SECTION_MSE2_STATE) {
3578     if ($self->{nc} == 0x003E) { # >
3579     $self->{state} = DATA_STATE;
3580    
3581     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3582     $self->{line_prev} = $self->{line};
3583     $self->{column_prev} = $self->{column};
3584     $self->{column}++;
3585     $self->{nc}
3586     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3587     } else {
3588     $self->{set_nc}->($self);
3589     }
3590    
3591     if (length $self->{ct}->{data}) { # character
3592    
3593     return ($self->{ct}); # character
3594     } else {
3595    
3596     ## No token to emit. $self->{ct} is discarded.
3597     }
3598     redo A;
3599     } elsif ($self->{nc} == 0x005D) { # ]
3600     # character
3601     $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]".
3602     ## Stay in the state.
3603    
3604     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3605     $self->{line_prev} = $self->{line};
3606     $self->{column_prev} = $self->{column};
3607     $self->{column}++;
3608     $self->{nc}
3609     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3610     } else {
3611     $self->{set_nc}->($self);
3612     }
3613    
3614     redo A;
3615     } else {
3616    
3617     $self->{ct}->{data} .= ']]'; # character
3618     $self->{state} = CDATA_SECTION_STATE;
3619     ## Reconsume.
3620     redo A;
3621     }
3622     } elsif ($self->{state} == ENTITY_STATE) {
3623     if ($is_space->{$self->{nc}} or
3624     {
3625     0x003C => 1, 0x0026 => 1, -1 => 1, # <, &
3626     $self->{entity_add} => 1,
3627     }->{$self->{nc}}) {
3628    
3629     ## Don't consume
3630     ## No error
3631     ## Return nothing.
3632     #
3633     } elsif ($self->{nc} == 0x0023) { # #
3634    
3635     $self->{state} = ENTITY_HASH_STATE;
3636     $self->{s_kwd} = '#';
3637    
3638     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3639     $self->{line_prev} = $self->{line};
3640     $self->{column_prev} = $self->{column};
3641     $self->{column}++;
3642     $self->{nc}
3643     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3644     } else {
3645     $self->{set_nc}->($self);
3646     }
3647    
3648     redo A;
3649     } elsif ((0x0041 <= $self->{nc} and
3650     $self->{nc} <= 0x005A) or # A..Z
3651     (0x0061 <= $self->{nc} and
3652     $self->{nc} <= 0x007A)) { # a..z
3653    
3654     require Whatpm::_NamedEntityList;
3655     $self->{state} = ENTITY_NAME_STATE;
3656     $self->{s_kwd} = chr $self->{nc};
3657     $self->{entity__value} = $self->{s_kwd};
3658     $self->{entity__match} = 0;
3659    
3660     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3661     $self->{line_prev} = $self->{line};
3662     $self->{column_prev} = $self->{column};
3663     $self->{column}++;
3664     $self->{nc}
3665     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3666     } else {
3667     $self->{set_nc}->($self);
3668     }
3669    
3670     redo A;
3671     } else {
3672    
3673     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero');
3674     ## Return nothing.
3675     #
3676     }
3677    
3678     ## NOTE: No character is consumed by the "consume a character
3679     ## reference" algorithm. In other word, there is an "&" character
3680     ## that does not introduce a character reference, which would be
3681     ## appended to the parent element or the attribute value in later
3682     ## process of the tokenizer.
3683    
3684     if ($self->{prev_state} == DATA_STATE) {
3685    
3686     $self->{state} = $self->{prev_state};
3687     ## Reconsume.
3688     return ({type => CHARACTER_TOKEN, data => '&',
3689     line => $self->{line_prev},
3690     column => $self->{column_prev},
3691     });
3692     redo A;
3693     } else {
3694    
3695     $self->{ca}->{value} .= '&';
3696     $self->{state} = $self->{prev_state};
3697     ## Reconsume.
3698     redo A;
3699     }
3700     } elsif ($self->{state} == ENTITY_HASH_STATE) {
3701     if ($self->{nc} == 0x0078 or # x
3702     $self->{nc} == 0x0058) { # X
3703    
3704     $self->{state} = HEXREF_X_STATE;
3705     $self->{s_kwd} .= chr $self->{nc};
3706    
3707     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3708     $self->{line_prev} = $self->{line};
3709     $self->{column_prev} = $self->{column};
3710     $self->{column}++;
3711     $self->{nc}
3712     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3713     } else {
3714     $self->{set_nc}->($self);
3715     }
3716    
3717     redo A;
3718     } elsif (0x0030 <= $self->{nc} and
3719     $self->{nc} <= 0x0039) { # 0..9
3720    
3721     $self->{state} = NCR_NUM_STATE;
3722     $self->{s_kwd} = $self->{nc} - 0x0030;
3723    
3724     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3725     $self->{line_prev} = $self->{line};
3726     $self->{column_prev} = $self->{column};
3727     $self->{column}++;
3728     $self->{nc}
3729     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3730     } else {
3731     $self->{set_nc}->($self);
3732     }
3733    
3734     redo A;
3735     } else {
3736     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero',
3737     line => $self->{line_prev},
3738     column => $self->{column_prev} - 1);
3739    
3740     ## NOTE: According to the spec algorithm, nothing is returned,
3741     ## and then "&#" is appended to the parent element or the attribute
3742     ## value in the later processing.
3743    
3744     if ($self->{prev_state} == DATA_STATE) {
3745    
3746     $self->{state} = $self->{prev_state};
3747     ## Reconsume.
3748     return ({type => CHARACTER_TOKEN,
3749     data => '&#',
3750     line => $self->{line_prev},
3751     column => $self->{column_prev} - 1,
3752     });
3753     redo A;
3754     } else {
3755    
3756     $self->{ca}->{value} .= '&#';
3757     $self->{state} = $self->{prev_state};
3758     ## Reconsume.
3759     redo A;
3760     }
3761     }
3762     } elsif ($self->{state} == NCR_NUM_STATE) {
3763     if (0x0030 <= $self->{nc} and
3764     $self->{nc} <= 0x0039) { # 0..9
3765    
3766     $self->{s_kwd} *= 10;
3767     $self->{s_kwd} += $self->{nc} - 0x0030;
3768    
3769     ## Stay in the state.
3770    
3771     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3772     $self->{line_prev} = $self->{line};
3773     $self->{column_prev} = $self->{column};
3774     $self->{column}++;
3775     $self->{nc}
3776     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3777     } else {
3778     $self->{set_nc}->($self);
3779     }
3780    
3781     redo A;
3782     } elsif ($self->{nc} == 0x003B) { # ;
3783    
3784    
3785     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3786     $self->{line_prev} = $self->{line};
3787     $self->{column_prev} = $self->{column};
3788     $self->{column}++;
3789     $self->{nc}
3790     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3791     } else {
3792     $self->{set_nc}->($self);
3793     }
3794    
3795     #
3796     } else {
3797    
3798     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
3799     ## Reconsume.
3800     #
3801     }
3802    
3803     my $code = $self->{s_kwd};
3804     my $l = $self->{line_prev};
3805     my $c = $self->{column_prev};
3806     if ($charref_map->{$code}) {
3807    
3808     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3809     text => (sprintf 'U+%04X', $code),
3810     line => $l, column => $c);
3811     $code = $charref_map->{$code};
3812     } elsif ($code > 0x10FFFF) {
3813    
3814     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3815     text => (sprintf 'U-%08X', $code),
3816     line => $l, column => $c);
3817     $code = 0xFFFD;
3818     }
3819    
3820     if ($self->{prev_state} == DATA_STATE) {
3821    
3822     $self->{state} = $self->{prev_state};
3823     ## Reconsume.
3824     return ({type => CHARACTER_TOKEN, data => chr $code,
3825     line => $l, column => $c,
3826     });
3827     redo A;
3828     } else {
3829    
3830     $self->{ca}->{value} .= chr $code;
3831     $self->{ca}->{has_reference} = 1;
3832     $self->{state} = $self->{prev_state};
3833     ## Reconsume.
3834     redo A;
3835     }
3836     } elsif ($self->{state} == HEXREF_X_STATE) {
3837     if ((0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) or
3838     (0x0041 <= $self->{nc} and $self->{nc} <= 0x0046) or
3839     (0x0061 <= $self->{nc} and $self->{nc} <= 0x0066)) {
3840     # 0..9, A..F, a..f
3841    
3842     $self->{state} = HEXREF_HEX_STATE;
3843     $self->{s_kwd} = 0;
3844     ## Reconsume.
3845     redo A;
3846     } else {
3847     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro',
3848     line => $self->{line_prev},
3849     column => $self->{column_prev} - 2);
3850    
3851     ## NOTE: According to the spec algorithm, nothing is returned,
3852     ## and then "&#" followed by "X" or "x" is appended to the parent
3853     ## element or the attribute value in the later processing.
3854    
3855     if ($self->{prev_state} == DATA_STATE) {
3856    
3857     $self->{state} = $self->{prev_state};
3858     ## Reconsume.
3859     return ({type => CHARACTER_TOKEN,
3860     data => '&' . $self->{s_kwd},
3861     line => $self->{line_prev},
3862     column => $self->{column_prev} - length $self->{s_kwd},
3863     });
3864     redo A;
3865     } else {
3866    
3867     $self->{ca}->{value} .= '&' . $self->{s_kwd};
3868     $self->{state} = $self->{prev_state};
3869     ## Reconsume.
3870     redo A;
3871     }
3872     }
3873     } elsif ($self->{state} == HEXREF_HEX_STATE) {
3874     if (0x0030 <= $self->{nc} and $self->{nc} <= 0x0039) {
3875     # 0..9
3876    
3877     $self->{s_kwd} *= 0x10;
3878     $self->{s_kwd} += $self->{nc} - 0x0030;
3879     ## Stay in the state.
3880    
3881     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3882     $self->{line_prev} = $self->{line};
3883     $self->{column_prev} = $self->{column};
3884     $self->{column}++;
3885     $self->{nc}
3886     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3887     } else {
3888     $self->{set_nc}->($self);
3889     }
3890    
3891     redo A;
3892     } elsif (0x0061 <= $self->{nc} and
3893     $self->{nc} <= 0x0066) { # a..f
3894    
3895     $self->{s_kwd} *= 0x10;
3896     $self->{s_kwd} += $self->{nc} - 0x0060 + 9;
3897     ## Stay in the state.
3898    
3899     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3900     $self->{line_prev} = $self->{line};
3901     $self->{column_prev} = $self->{column};
3902     $self->{column}++;
3903     $self->{nc}
3904     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3905     } else {
3906     $self->{set_nc}->($self);
3907     }
3908    
3909     redo A;
3910     } elsif (0x0041 <= $self->{nc} and
3911     $self->{nc} <= 0x0046) { # A..F
3912    
3913     $self->{s_kwd} *= 0x10;
3914     $self->{s_kwd} += $self->{nc} - 0x0040 + 9;
3915     ## Stay in the state.
3916    
3917     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3918     $self->{line_prev} = $self->{line};
3919     $self->{column_prev} = $self->{column};
3920     $self->{column}++;
3921     $self->{nc}
3922     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3923     } else {
3924     $self->{set_nc}->($self);
3925     }
3926    
3927     redo A;
3928     } elsif ($self->{nc} == 0x003B) { # ;
3929    
3930    
3931     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
3932     $self->{line_prev} = $self->{line};
3933     $self->{column_prev} = $self->{column};
3934     $self->{column}++;
3935     $self->{nc}
3936     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
3937     } else {
3938     $self->{set_nc}->($self);
3939     }
3940    
3941     #
3942     } else {
3943    
3944     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc',
3945     line => $self->{line},
3946     column => $self->{column});
3947     ## Reconsume.
3948     #
3949     }
3950    
3951     my $code = $self->{s_kwd};
3952     my $l = $self->{line_prev};
3953     my $c = $self->{column_prev};
3954     if ($charref_map->{$code}) {
3955    
3956     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3957     text => (sprintf 'U+%04X', $code),
3958     line => $l, column => $c);
3959     $code = $charref_map->{$code};
3960     } elsif ($code > 0x10FFFF) {
3961    
3962     $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference',
3963     text => (sprintf 'U-%08X', $code),
3964     line => $l, column => $c);
3965     $code = 0xFFFD;
3966     }
3967    
3968     if ($self->{prev_state} == DATA_STATE) {
3969    
3970     $self->{state} = $self->{prev_state};
3971     ## Reconsume.
3972     return ({type => CHARACTER_TOKEN, data => chr $code,
3973     line => $l, column => $c,
3974     });
3975     redo A;
3976     } else {
3977    
3978     $self->{ca}->{value} .= chr $code;
3979     $self->{ca}->{has_reference} = 1;
3980     $self->{state} = $self->{prev_state};
3981     ## Reconsume.
3982     redo A;
3983     }
3984     } elsif ($self->{state} == ENTITY_NAME_STATE) {
3985     if (length $self->{s_kwd} < 30 and
3986     ## NOTE: Some number greater than the maximum length of entity name
3987     ((0x0041 <= $self->{nc} and # a
3988     $self->{nc} <= 0x005A) or # x
3989     (0x0061 <= $self->{nc} and # a
3990     $self->{nc} <= 0x007A) or # z
3991     (0x0030 <= $self->{nc} and # 0
3992     $self->{nc} <= 0x0039) or # 9
3993     $self->{nc} == 0x003B)) { # ;
3994     our $EntityChar;
3995     $self->{s_kwd} .= chr $self->{nc};
3996     if (defined $EntityChar->{$self->{s_kwd}}) {
3997     if ($self->{nc} == 0x003B) { # ;
3998    
3999     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4000     $self->{entity__match} = 1;
4001    
4002     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4003     $self->{line_prev} = $self->{line};
4004     $self->{column_prev} = $self->{column};
4005     $self->{column}++;
4006     $self->{nc}
4007     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4008     } else {
4009     $self->{set_nc}->($self);
4010     }
4011    
4012     #
4013     } else {
4014    
4015     $self->{entity__value} = $EntityChar->{$self->{s_kwd}};
4016     $self->{entity__match} = -1;
4017     ## Stay in the state.
4018    
4019     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4020     $self->{line_prev} = $self->{line};
4021     $self->{column_prev} = $self->{column};
4022     $self->{column}++;
4023     $self->{nc}
4024     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4025     } else {
4026     $self->{set_nc}->($self);
4027     }
4028    
4029     redo A;
4030     }
4031     } else {
4032    
4033     $self->{entity__value} .= chr $self->{nc};
4034     $self->{entity__match} *= 2;
4035     ## Stay in the state.
4036    
4037     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
4038     $self->{line_prev} = $self->{line};
4039     $self->{column_prev} = $self->{column};
4040     $self->{column}++;
4041     $self->{nc}
4042     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
4043     } else {
4044     $self->{set_nc}->($self);
4045     }
4046    
4047     redo A;
4048     }
4049     }
4050    
4051     my $data;
4052     my $has_ref;
4053     if ($self->{entity__match} > 0) {
4054    
4055     $data = $self->{entity__value};
4056     $has_ref = 1;
4057     #
4058     } elsif ($self->{entity__match} < 0) {
4059     $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc');
4060     if ($self->{prev_state} != DATA_STATE and # in attribute
4061     $self->{entity__match} < -1) {
4062    
4063     $data = '&' . $self->{s_kwd};
4064     #
4065     } else {
4066    
4067     $data = $self->{entity__value};
4068     $has_ref = 1;
4069     #
4070     }
4071     } else {
4072    
4073     $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero',
4074     line => $self->{line_prev},
4075     column => $self->{column_prev} - length $self->{s_kwd});
4076     $data = '&' . $self->{s_kwd};
4077     #
4078     }
4079    
4080     ## NOTE: In these cases, when a character reference is found,
4081     ## it is consumed and a character token is returned, or, otherwise,
4082     ## nothing is consumed and returned, according to the spec algorithm.
4083     ## In this implementation, anything that has been examined by the
4084     ## tokenizer is appended to the parent element or the attribute value
4085     ## as string, either literal string when no character reference or
4086     ## entity-replaced string otherwise, in this stage, since any characters
4087     ## that would not be consumed are appended in the data state or in an
4088     ## appropriate attribute value state anyway.
4089    
4090     if ($self->{prev_state} == DATA_STATE) {
4091    
4092     $self->{state} = $self->{prev_state};
4093     ## Reconsume.
4094     return ({type => CHARACTER_TOKEN,
4095     data => $data,
4096     line => $self->{line_prev},
4097     column => $self->{column_prev} + 1 - length $self->{s_kwd},
4098     });
4099     redo A;
4100     } else {
4101    
4102     $self->{ca}->{value} .= $data;
4103     $self->{ca}->{has_reference} = 1 if $has_ref;
4104     $self->{state} = $self->{prev_state};
4105     ## Reconsume.
4106     redo A;
4107     }
4108     } else {
4109     die "$0: $self->{state}: Unknown state";
4110     }
4111     } # A
4112    
4113     die "$0: _get_next_token: unexpected case";
4114     } # _get_next_token
4115    
4116     1;
4117     ## $Date:$

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24